Re: [PATCH 6/7] mm/balloon_compaction: use common page ballooning

2014-08-20 Thread Rafael Aquini
,7 @@ static inline void balloon_page_insert(struct page *page,
>   */
>  static inline void balloon_page_delete(struct page *page)
>  {
> + __ClearPageBalloon(page);
>   page->mapping = NULL;
>   list_del(&page->lru);
>  }
> @@ -250,24 +181,16 @@ static inline void balloon_page_insert(struct page *page,
>  struct address_space *mapping,
>  struct list_head *head)
>  {
> + __SetPageBalloon(page);
>   list_add(&page->lru, head);
>  }
>  
>  static inline void balloon_page_delete(struct page *page)
>  {
> + __ClearPageBalloon(page);
>   list_del(&page->lru);
>  }
>  
> -static inline bool balloon_page_movable(struct page *page)
> -{
> - return false;
> -}
> -
> -static inline bool isolated_balloon_page(struct page *page)
> -{
> - return false;
> -}
> -
>  static inline bool balloon_page_isolate(struct page *page)
>  {
>   return false;
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 72e0db0..e09cf0a 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -237,7 +237,7 @@ config MEMORY_BALLOON
>  config BALLOON_COMPACTION
>   bool "Allow for balloon memory compaction/migration"
>   def_bool y
> - depends on COMPACTION && VIRTIO_BALLOON
> + depends on COMPACTION && MEMORY_BALLOON
>   help
> Memory fragmentation introduced by ballooning might reduce
> significantly the number of 2MB contiguous memory blocks that can be
> diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
> index 533c567..22c8e03 100644
> --- a/mm/balloon_compaction.c
> +++ b/mm/balloon_compaction.c
> @@ -253,8 +253,7 @@ bool balloon_page_isolate(struct page *page)
>* Prevent concurrent compaction threads from isolating
>* an already isolated balloon page by refcount check.
>*/
> - if (__is_movable_balloon_page(page) &&
> - page_count(page) == 2) {
> + if (PageBalloon(page) && page_count(page) == 2) {
>   __isolate_balloon_page(page);
>   unlock_page(page);
>   return true;
> @@ -275,7 +274,7 @@ void balloon_page_putback(struct page *page)
>*/
>   lock_page(page);
>  
> - if (__is_movable_balloon_page(page)) {
> + if (PageBalloon(page)) {
>   __putback_balloon_page(page);
>   /* drop the extra ref count taken for page isolation */
>   put_page(page);
> @@ -300,7 +299,7 @@ int balloon_page_migrate(struct page *newpage,
>*/
>   BUG_ON(!trylock_page(newpage));
>  
> - if (WARN_ON(!__is_movable_balloon_page(page))) {
> + if (WARN_ON(!PageBalloon(page))) {
>   dump_page(page, "not movable balloon page");
>   unlock_page(newpage);
>   return rc;
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 0653f5f..e9aeed2 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -596,11 +596,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>* Skip any other type of page
>*/
>   if (!PageLRU(page)) {
> - if (unlikely(balloon_page_movable(page))) {
> - if (balloon_page_isolate(page)) {
> - /* Successfully isolated */
> - goto isolate_success;
> - }
> + if (unlikely(PageBalloon(page)) &&
> + balloon_page_isolate(page)) {
> + /* Successfully isolated */
> + goto isolate_success;
>   }
>   continue;
>   }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 161d044..c35e6f2 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -92,7 +92,7 @@ void putback_movable_pages(struct list_head *l)
>   list_del(&page->lru);
>   dec_zone_page_state(page, NR_ISOLATED_ANON +
>   page_is_file_cache(page));
> - if (unlikely(isolated_balloon_page(page)))
> + if (unlikely(PageBalloon(page)))
>   balloon_page_putback(page);
>   else
>   putback_lru_page(page);
> @@ -873,7 +873,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>   }
>   }
>  
> - if (unlikely(__is_movable_balloon_page(page))) {
> + if (unlikely(PageBalloon(page))) {
>   /*
>    * A ballooned page does not need any special attention from
>    * physical to virtual reverse mapping procedures.
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2836b53..f90f93e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1160,7 +1160,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
>  
>   list_for_each_entry_safe(page, next, page_list, lru) {
>   if (page_is_file_cache(page) && !PageDirty(page) &&
> - !isolated_balloon_page(page)) {
> + !PageBalloon(page)) {
>   ClearPageActive(page);
>   list_move(&page->lru, &clean_pages);
>   }
> 
Acked-by: Rafael Aquini 
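
For context on the page_count(page) == 2 test in the balloon_compaction.c hunk above: a resident balloon page holds one reference taken by the driver at inflate time, and compaction takes a second one before trying to isolate. The sketch below paraphrases the surrounding 3.16-era balloon_page_isolate() flow from memory, so treat the exact helper placement as an assumption rather than the patch itself:

/*
 * Paraphrased flow, not from the patch: ref #1 is held by the balloon
 * driver, ref #2 is taken here by compaction, so a count of exactly 2
 * means "ballooned and not isolated by anyone else yet".
 */
if (likely(get_page_unless_zero(page))) {	/* ref #2: compaction */
	if (trylock_page(page)) {
		if (PageBalloon(page) && page_count(page) == 2) {
			__isolate_balloon_page(page);
			unlock_page(page);
			return true;
		}
		unlock_page(page);
	}
	put_page(page);	/* raced with release or another isolation */
}
return false;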

Re: [PATCH 5/7] mm: introduce common page state for ballooned memory

2014-08-20 Thread Rafael Aquini
On Wed, Aug 20, 2014 at 07:04:58PM +0400, Konstantin Khlebnikov wrote:
> This patch adds page state PageBalloon() and functions __Set/ClearPageBalloon.
> Like PageBuddy(), PageBalloon() looks like a page flag, but actually it is a
> special state of the page->_mapcount counter. There is no conflict because
> ballooned pages cannot be mapped and cannot be in the buddy allocator.
> 
> Ballooned pages are counted in the vmstat counter NR_BALLOON_PAGES, which is
> shown in /proc/meminfo and in the per-node meminfo. This patch also exports
> PageBalloon() to userspace via /proc/kpageflags as KPF_BALLOON.
> 
> All this code, including mm/balloon_compaction.o, is under
> CONFIG_MEMORY_BALLOON; it should be selected by any ballooning driver that
> wants to use this feature.
> 

Very nice overhaul Konstantin!
Please, consider the nits I have below:


> Signed-off-by: Konstantin Khlebnikov 
> ---
>  Documentation/filesystems/proc.txt |2 ++
>  drivers/base/node.c|   16 ++--
>  drivers/virtio/Kconfig |1 +
>  fs/proc/meminfo.c  |6 ++
>  fs/proc/page.c |3 +++
>  include/linux/mm.h |   10 ++
>  include/linux/mmzone.h |3 +++
>  include/uapi/linux/kernel-page-flags.h |1 +
>  mm/Kconfig |5 +
>  mm/Makefile|3 ++-
>  mm/balloon_compaction.c|   14 ++
>  mm/vmstat.c|8 +++-
>  tools/vm/page-types.c  |1 +
>  13 files changed, 65 insertions(+), 8 deletions(-)
> 
> diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
> index eb8a10e..154a345 100644
> --- a/Documentation/filesystems/proc.txt
> +++ b/Documentation/filesystems/proc.txt
> @@ -796,6 +796,7 @@ VmallocTotal:   112216 kB
>  VmallocUsed:   428 kB
>  VmallocChunk:   111088 kB
>  AnonHugePages:   49152 kB
> +BalloonPages:0 kB
>  
>  MemTotal: Total usable ram (i.e. physical ram minus a few reserved
>bits and the kernel binary code)
> @@ -838,6 +839,7 @@ MemAvailable: An estimate of how much memory is available for starting new
> Writeback: Memory which is actively being written back to the disk
> AnonPages: Non-file backed pages mapped into userspace page tables
>  AnonHugePages: Non-file backed huge pages mapped into userspace page tables
> +BalloonPages: Memory which was ballooned, not included into MemTotal
>Mapped: files which have been mmaped, such as libraries
>  Slab: in-kernel data structures cache
>  SReclaimable: Part of Slab, that might be reclaimed, such as caches
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index c6d3ae0..59e565c 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -120,6 +120,9 @@ static ssize_t node_read_meminfo(struct device *dev,
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  "Node %d AnonHugePages:  %8lu kB\n"
>  #endif
> +#ifdef CONFIG_MEMORY_BALLOON
> +"Node %d BalloonPages:   %8lu kB\n"
> +#endif
>   ,
>  nid, K(node_page_state(nid, NR_FILE_DIRTY)),
>  nid, K(node_page_state(nid, NR_WRITEBACK)),
> @@ -136,14 +139,15 @@ static ssize_t node_read_meminfo(struct device *dev,
>  nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
>   node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
>  nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))
> - , nid,
> - K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
> - HPAGE_PMD_NR));
> -#else
> -nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +,nid, K(node_page_state(nid,
> + NR_ANON_TRANSPARENT_HUGEPAGES) * HPAGE_PMD_NR)
> +#endif
> +#ifdef CONFIG_MEMORY_BALLOON
> +,nid, K(node_page_state(nid, NR_BALLOON_PAGES))
>  #endif
> +);
>   n += hugetlb_report_node_meminfo(nid, buf + n);
>   return n;
>  }
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index c6683f2..00b2286 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -25,6 +25,7 @@ config VIRTIO_PCI
>  config VIRTIO_BALLOON
>   tristate "Virtio balloon driver"
>   depends on VIRTIO
> + select MEMORY_BALLOON
>   ---help---
>This driver supports increasing and decreasing the amount
>of memory within a KVM guest.
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index aa1eee0..f897fbf 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -138,6 +138,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
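
The mm.h side of the patch is not quoted above; for readers following along, a _mapcount-encoded page state follows the existing PageBuddy() pattern the commit message cites. A minimal sketch (the function names match the commit message; the sentinel value and the VM_BUG_ON checks are assumptions after the PageBuddy() convention in include/linux/mm.h):

#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)

static inline int PageBalloon(struct page *page)
{
	/* ballooned pages are never mapped, so _mapcount is free to reuse */
	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
}

static inline void __SetPageBalloon(struct page *page)
{
	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
}

static inline void __ClearPageBalloon(struct page *page)
{
	VM_BUG_ON_PAGE(!PageBalloon(page), page);
	atomic_set(&page->_mapcount, -1);
}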

Re: [PATCH 3/7] mm/balloon_compaction: isolate balloon pages without lru_lock

2014-08-20 Thread Rafael Aquini
On Wed, Aug 20, 2014 at 07:04:46PM +0400, Konstantin Khlebnikov wrote:
> LRU-lock isn't required for balloon page isolation. This check makes migration
> of some ballooned pages mostly impossible because isolate_migratepages_range()
> drops LRU lock periodically.
>
just for historical/explanatory purposes: https://lkml.org/lkml/2013/12/6/183 

> Signed-off-by: Konstantin Khlebnikov 
> Cc: stable  # v3.8
> ---
>  mm/compaction.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 21bf292..0653f5f 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -597,7 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
>*/
>   if (!PageLRU(page)) {
>   if (unlikely(balloon_page_movable(page))) {
> - if (locked && balloon_page_isolate(page)) {
> + if (balloon_page_isolate(page)) {
>   /* Successfully isolated */
>       goto isolate_success;
>   }
> 
Acked-by: Rafael Aquini 
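
The periodic lock drop referred to in the commit message looks roughly like this in that era's isolate_migratepages_range() (a paraphrase from memory; treat the helper names as assumptions):

/* paraphrased: every SWAP_CLUSTER_MAX pfns the lru_lock may be released */
if (!(low_pfn % SWAP_CLUSTER_MAX)
    && should_release_lock(&zone->lru_lock)) {
	spin_unlock_irqrestore(&zone->lru_lock, flags);
	locked = false;
}
/* ...so 'locked' is frequently false when a balloon page shows up, and
 * the old 'locked && balloon_page_isolate(page)' test skipped it even
 * though balloon isolation never depended on the LRU lock. */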


Re: [PATCH 2/7] mm/balloon_compaction: keep ballooned pages away from normal migration path

2014-08-20 Thread Rafael Aquini
On Wed, Aug 20, 2014 at 07:04:40PM +0400, Konstantin Khlebnikov wrote:
> Proper testing shows yet another problem in balloon migration: it works only
> once for each page. balloon_page_movable() checks page flags and page_count.
> In __unmap_and_move() the page is locked and its reference counter is
> elevated, so balloon_page_movable() _always_ fails here. As a result,
> migration in __unmap_and_move() goes down the normal migration path.
> 
> Balloon ->migratepage() is special: it returns MIGRATEPAGE_BALLOON_SUCCESS
> instead of MIGRATEPAGE_SUCCESS. After that, in move_to_new_page(), the
> successfully migrated page gets NULL in its mapping pointer and loses its
> connection to the balloon and the ability for further migration.
> 
> It's safe to use __is_movable_balloon_page() here: the page is isolated and
> pinned.
> 
> Signed-off-by: Konstantin Khlebnikov 
> Cc: stable  # v3.8
> ---
>  mm/migrate.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/mm/migrate.c b/mm/migrate.c
> index f78ec9b..161d044 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -873,7 +873,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
>   }
>   }
>  
> - if (unlikely(balloon_page_movable(page))) {
> + if (unlikely(__is_movable_balloon_page(page))) {
>   /*
>* A ballooned page does not need any special attention from
>* physical to virtual reverse mapping procedures.
> 
Acked-by: Rafael Aquini 
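
For reference, the check that always fails in __unmap_and_move() was, at the time, roughly the following (quoted from memory, so a paraphrase rather than the authoritative source):

static inline bool balloon_page_movable(struct page *page)
{
	/* in __unmap_and_move() the page is locked (PG_locked set) and
	 * isolation already took an extra reference, so both the
	 * page_flags_cleared() and page_count() tests below fail */
	if (page_flags_cleared(page) && !page_mapped(page) &&
	    page_count(page) == 1)
		return __is_movable_balloon_page(page);
	return false;
}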


Re: [PATCH 1/7] mm/balloon_compaction: ignore anonymous pages

2014-08-20 Thread Rafael Aquini
On Wed, Aug 20, 2014 at 07:04:35PM +0400, Konstantin Khlebnikov wrote:
> Sasha Levin reported a KASAN splat inside isolate_migratepages_range().
> The problem is in the function __is_movable_balloon_page(), which tests
> AS_BALLOON_MAP in page->mapping->flags. This function has no protection
> against anonymous pages. As a result it tried to check address space flags
> inside an anon_vma.
> 
> Signed-off-by: Konstantin Khlebnikov 
> Reported-by: Sasha Levin 
> Link: http://lkml.kernel.org/p/53e6ceaa.9020...@oracle.com
> Cc: stable  # v3.8
> ---
>  include/linux/balloon_compaction.h |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
> index 089743a..53d482e 100644
> --- a/include/linux/balloon_compaction.h
> +++ b/include/linux/balloon_compaction.h
> @@ -128,7 +128,7 @@ static inline bool page_flags_cleared(struct page *page)
>  static inline bool __is_movable_balloon_page(struct page *page)
>  {
>   struct address_space *mapping = page->mapping;
> - return mapping_balloon(mapping);
> + return !PageAnon(page) && mapping_balloon(mapping);
>  }
>  
>  /*
> 
Acked-by: Rafael Aquini 
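
The reason the unprotected dereference is dangerous: for anonymous pages, page->mapping does not point to an address_space at all. A sketch of the relevant convention (as in include/linux/mm.h of that era; shown here for illustration):

/* the low bit of page->mapping flags the pointer as an anon_vma */
#define PAGE_MAPPING_ANON 1

static inline int PageAnon(struct page *page)
{
	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

/* so mapping_balloon(page->mapping) on an anon page reads flag bits out
 * of an anon_vma, which is what KASAN caught; hence the !PageAnon() guard */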


Re: [PATCH 7/7] mm/balloon_compaction: general cleanup

2014-08-20 Thread Rafael Aquini
On Wed, Aug 20, 2014 at 07:05:09PM +0400, Konstantin Khlebnikov wrote:
> * move special branch for balloon migration into migrate_pages
> * remove special mapping for balloon and its flag AS_BALLOON_MAP
> * embed struct balloon_dev_info into struct virtio_balloon
> * cleanup balloon_page_dequeue, kill balloon_page_free
> 
> Signed-off-by: Konstantin Khlebnikov k.khlebni...@samsung.com
> ---
>  drivers/virtio/virtio_balloon.c    |   77 -
>  include/linux/balloon_compaction.h |  107 ++
>  include/linux/migrate.h            |   11 --
>  include/linux/pagemap.h            |   18 ---
>  mm/balloon_compaction.c            |  214 
>  mm/migrate.c                       |   27 +
>  6 files changed, 130 insertions(+), 324 deletions(-)
> 
Very nice clean-up, just as all other patches in this set.
Please, just consider amending the following changes to this patch of yours

Rafael
---

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index dc7073b..569cf96 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
 #ifdef CONFIG_BALLOON_COMPACTION
 extern bool balloon_page_isolate(struct page *page);
 extern void balloon_page_putback(struct page *page);
-
-/*
- * balloon_page_insert - insert a page into the balloon's page list and make
- *  the page->mapping assignment accordingly.
- * @page: page to be assigned as a 'balloon page'
- * @mapping : allocated special 'balloon_mapping'
- * @head: balloon's device page list head
- *
- * Caller must ensure the page is locked and the spin_lock protecting balloon
- * pages list is held before inserting a page into the balloon device.
- */
-static inline void
-balloon_page_insert(struct balloon_dev_info *balloon, struct page *page)
-{
-   __SetPageBalloon(page);
-   set_page_private(page, (unsigned long)balloon);
-   list_add(&page->lru, &balloon->pages);
-}
-
-/*
- * balloon_page_delete - delete a page from balloon's page list and clear
- *  the page->mapping assignment accordingly.
- * @page: page to be released from balloon's page list
- *
- * Caller must ensure the page is locked and the spin_lock protecting balloon
- * pages list is held before deleting a page from the balloon device.
- */
-static inline void balloon_page_delete(struct page *page, bool isolated)
-{
-   __ClearPageBalloon(page);
-   set_page_private(page, 0);
-   if (!isolated)
-   list_del(&page->lru);
-}
-
 int balloon_page_migrate(new_page_t get_new_page, free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode);
@@ -130,31 +95,6 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
 
 #else /* !CONFIG_BALLOON_COMPACTION */
 
-static inline void *balloon_mapping_alloc(void *balloon_device,
-   const struct address_space_operations *a_ops)
-{
-   return ERR_PTR(-EOPNOTSUPP);
-}
-
-static inline void balloon_mapping_free(struct address_space *balloon_mapping)
-{
-   return;
-}
-
-static inline void
-balloon_page_insert(struct balloon_dev_info *balloon, struct page *page)
-{
-   __SetPageBalloon(page);
-   list_add(&page->lru, head);
-}
-
-static inline void balloon_page_delete(struct page *page, bool isolated)
-{
-   __ClearPageBalloon(page);
-   if (!isolated)
-   list_del(&page->lru);
-}
-
 static inline int balloon_page_migrate(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *page, int force, enum migrate_mode mode)
@@ -176,6 +116,46 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
 {
return GFP_HIGHUSER;
 }
-
 #endif /* CONFIG_BALLOON_COMPACTION */
+
+/*
+ * balloon_page_insert - insert a page into the balloon's page list and make
+ *  the page->mapping assignment accordingly.
+ * @page: page to be assigned as a 'balloon page'
+ * @mapping : allocated special 'balloon_mapping'
+ * @head: balloon's device page list head
+ *
+ * Caller must ensure the page is locked and the spin_lock protecting balloon
+ * pages list is held before inserting a page into the balloon device.
+ */
+static inline void
+balloon_page_insert(struct balloon_dev_info *balloon, struct page *page)
+{
+#ifdef CONFIG_MEMORY_BALLOON
+   __SetPageBalloon(page);
+   set_page_private(page, (unsigned long)balloon);
+   list_add(&page->lru, &balloon->pages);
+   inc_zone_page_state(page, NR_BALLOON_PAGES);
+#endif
+}
+
+/*
+ * balloon_page_delete - delete a page from balloon's page list and clear
+ *  the page-mapping assignement accordingly.
+ * @page: page to be released from balloon's page list
+ *
+ * Caller must ensure the page is locked and the spin_lock 
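
The archived message is cut off at this point. By symmetry with the insert-side hunk above, the delete-side helper presumably ends up as something like the following (a reconstruction assumed from the surrounding diff, not Rafael's actual text):

static inline void balloon_page_delete(struct page *page, bool isolated)
{
#ifdef CONFIG_MEMORY_BALLOON
	__ClearPageBalloon(page);
	set_page_private(page, 0);
	/* an isolated page is already off the balloon's list */
	if (!isolated)
		list_del(&page->lru);
	dec_zone_page_state(page, NR_BALLOON_PAGES);
#endif
}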

Re: [PATCH 2/3] ipc/sem.c: increase SEMMSL, SEMMNI, SEMOPM

2014-08-15 Thread Rafael Aquini
On Tue, Aug 12, 2014 at 09:29:16AM +0200, Manfred Spraul wrote:
> a)
> SysV can be abused to allocate locked kernel memory.  For most systems, a
> small limit doesn't make sense, see the discussion with regards to SHMMAX.
> 
> Therefore: Increase the sysv sem limits so that all known applications
> will work with these defaults.
> 
> b)
> With regards to the maximum supported:
> Some of the specified hard limits are not correct anymore, therefore the
> patch updates the documentation.
> 
> - SEMMNI must stay below IPCMNI, which is 32768.
>   As for SHMMAX: Stay a bit below this limit.
> 
> - SEMMSL was limited to 8k, to ensure that the kmalloc for the kernel array
>   was limited to 16 kB (order=2)
> 
>   This doesn't apply anymore:
>- the allocation size isn't sizeof(short)*nsems anymore.
>- ipc_alloc falls back to vmalloc
> 
> - SEMOPM should stay below 1000, to limit the kmalloc in semtimedop() to an
>   order=1 allocation.
>   Therefore: Leave it at 500 (order=0 allocation).
> 
> Note:
> If an administrator must limit the memory allocations, then he can set the
> values as necessary.
> 
> Or he can disable sysv entirely (as e.g. done by Android).
> 
> Signed-off-by: Manfred Spraul 
> ---

Acked-by: Rafael Aquini 


>  include/uapi/linux/sem.h | 18 +++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h
> index 541fce0..dd73b90 100644
> --- a/include/uapi/linux/sem.h
> +++ b/include/uapi/linux/sem.h
> @@ -63,10 +63,22 @@ struct  seminfo {
>   int semaem;
>  };
>  
> -#define SEMMNI  128 /* <= IPCMNI  max # of semaphore identifiers */
> -#define SEMMSL  250 /* <= 8 000 max num of semaphores per id */
> +/*
> + * SEMMNI, SEMMSL and SEMMNS are default values which can be
> + * modified by sysctl.
> + * The values have been chosen to be larger than necessary for any
> + * known configuration.
> + *
> + * SEMOPM should not be increased beyond 1000, otherwise there is the
> + * risk that semop()/semtimedop() fails due to kernel memory fragmentation
> + * when allocating the sop array.
> + */
> +
> +
> +#define SEMMNI  32000   /* <= IPCMNI  max # of semaphore identifiers */
> +#define SEMMSL  32000   /* <= INT_MAX max num of semaphores per id */
>  #define SEMMNS  (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */
> -#define SEMOPM  32   /* <= 1 000 max num of ops per semop call */
> +#define SEMOPM  500  /* <= 1 000 max num of ops per semop call */
>  #define SEMVMX  32767   /* <= 32767 semaphore maximum value */
>  #define SEMAEM  SEMVMX  /* adjust on exit max value */
>  
> -- 
> 1.9.3
> 
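
The order-0 claim behind SEMOPM = 500 is easy to check: struct sembuf is 6 bytes, so the sop array stays within a single 4 KiB page. A small userspace sanity check (illustrative arithmetic only, not from the patch):

#include <stdio.h>
#include <sys/sem.h>

int main(void)
{
	/* 500 * 6 = 3000 bytes: fits one page, order-0 kmalloc */
	printf("SEMOPM=500  -> %zu bytes\n", 500 * sizeof(struct sembuf));
	/* 1000 * 6 = 6000 bytes: spills into an order-1 allocation */
	printf("SEMOPM=1000 -> %zu bytes\n", 1000 * sizeof(struct sembuf));
	return 0;
}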


Re: [PATCH 3/3] ipc namespace: copy settings from parent namespace

2014-08-15 Thread Rafael Aquini
On Tue, Aug 12, 2014 at 09:29:17AM +0200, Manfred Spraul wrote:
> Right now, each new IPC namespace starts with the kernel default values.
> This means that changes that were made to the limits get overwritten.
> 
> With this patch, a new namespace inherits the settings from the parent
> namespace, which is less surprising.
> 
> The patch updates
> - SysV msg
> - SysV sem
> - SysV shm
> - POSIX mqueues
> 
> Cc: se...@hallyn.com
> Cc: ebied...@xmission.com
> Cc: contain...@lists.linux-foundation.org
> Cc: mtk.manpa...@gmail.com
> 
> Signed-off-by: Manfred Spraul 
> ---

Acked-by: Rafael Aquini 


>  include/linux/ipc_namespace.h |  6 --
>  ipc/mqueue.c  | 23 ---
>  ipc/msg.c | 16 +++-
>  ipc/namespace.c   |  8 
>  ipc/sem.c | 19 +--
>  ipc/shm.c | 19 +--
>  ipc/util.h| 20 ++--
>  7 files changed, 75 insertions(+), 36 deletions(-)
> 
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index e365d5e..1cc36a0 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -73,7 +73,7 @@ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
>  #endif /* CONFIG_SYSVIPC */
>  
>  #ifdef CONFIG_POSIX_MQUEUE
> -extern int mq_init_ns(struct ipc_namespace *ns);
> +extern int mq_init_ns(struct ipc_namespace *ns, struct ipc_namespace *old_ns);
>  /*
>   * POSIX Message Queue default values:
>   *
> @@ -108,7 +108,9 @@ extern int mq_init_ns(struct ipc_namespace *ns);
>  #define DFLT_MSGSIZEMAX   8192
>  #define HARD_MSGSIZEMAX  (16*1024*1024)
>  #else
> -static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
> +static inline int mq_init_ns(struct ipc_namespace *ns,
> + struct ipc_namespace *old_ns)
> +{ return 0; }
>  #endif
>  
>  #if defined(CONFIG_IPC_NS)
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 4fcf39a..3473072d 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -1397,14 +1397,23 @@ static struct file_system_type mqueue_fs_type = {
>   .fs_flags = FS_USERNS_MOUNT,
>  };
>  
> -int mq_init_ns(struct ipc_namespace *ns)
> +int mq_init_ns(struct ipc_namespace *ns, struct ipc_namespace *old_ns)
>  {
> + if (old_ns != NULL) {
> + ns->mq_queues_max= old_ns->mq_queues_max;
> + ns->mq_msg_max   = old_ns->mq_msg_max;
> + ns->mq_msgsize_max   = old_ns->mq_msgsize_max;
> + ns->mq_msg_default   = old_ns->mq_msg_default;
> + ns->mq_msgsize_default  = old_ns->mq_msgsize_default;
> + } else {
> + ns->mq_queues_max= DFLT_QUEUESMAX;
> + ns->mq_msg_max   = DFLT_MSGMAX;
> + ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
> + ns->mq_msg_default   = DFLT_MSG;
> + ns->mq_msgsize_default  = DFLT_MSGSIZE;
> + }
> +
>   ns->mq_queues_count  = 0;
> - ns->mq_queues_max= DFLT_QUEUESMAX;
> - ns->mq_msg_max   = DFLT_MSGMAX;
> - ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
> - ns->mq_msg_default   = DFLT_MSG;
> - ns->mq_msgsize_default  = DFLT_MSGSIZE;
>  
>   ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
>   if (IS_ERR(ns->mq_mnt)) {
> @@ -1444,7 +1453,7 @@ static int __init init_mqueue_fs(void)
>  
>   spin_lock_init(&mq_lock);
>  
> - error = mq_init_ns(&init_ipc_ns);
> + error = mq_init_ns(&init_ipc_ns, NULL);
>   if (error)
>   goto out_filesystem;
>  
> diff --git a/ipc/msg.c b/ipc/msg.c
> index a7261d5..3cbd2ad 100644
> --- a/ipc/msg.c
> +++ b/ipc/msg.c
> @@ -990,11 +990,17 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
>  }
>  
>  
> -void msg_init_ns(struct ipc_namespace *ns)
> +void msg_init_ns(struct ipc_namespace *ns, struct ipc_namespace *old_ns)
>  {
> - ns->msg_ctlmax = MSGMAX;
> - ns->msg_ctlmnb = MSGMNB;
> - ns->msg_ctlmni = MSGMNI;
> + if (old_ns != NULL) {
> + ns->msg_ctlmax = old_ns->msg_ctlmax;
> + ns->msg_ctlmnb = old_ns->msg_ctlmnb;
> + ns->msg_ctlmni = old_ns->msg_ctlmni;
> + } else {
> + ns->msg_ctlmax = MSGMAX;
> + ns->msg_ctlmnb = MSGMNB;
> + ns->msg_ctlmni = MSGMNI;
> + }
>  
>   atomic_set(&ns->msg_bytes, 0);
>   atomic_set(&ns->msg_hdrs, 0);
> @@ -1036,7 +1042,7 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
>  
>  void __init msg_init(void)
>  {
> - msg_init_ns(&init_ipc_ns);
> + msg_init_ns(&init_ipc_ns, NULL);
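
A quick way to observe the behavior change from userspace (needs CAP_SYS_ADMIN; an illustrative sketch, not part of the patch): read a limit, unshare the IPC namespace, and read it again. Before the patch the second read shows the kernel default; after it, the inherited parent value.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* value in the parent namespace (tune it first via sysctl) */
	if (system("cat /proc/sys/kernel/msgmax") != 0)
		return 1;
	if (unshare(CLONE_NEWIPC) != 0) {
		perror("unshare(CLONE_NEWIPC)");
		return 1;
	}
	/* value seen by the new namespace */
	if (system("cat /proc/sys/kernel/msgmax") != 0)
		return 1;
	return 0;
}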

Re: [PATCH 1/3] ipc/msg: increase MSGMNI, remove scaling

2014-08-15 Thread Rafael Aquini
On Tue, Aug 12, 2014 at 09:29:15AM +0200, Manfred Spraul wrote:
> SysV can be abused to allocate locked kernel memory.
> For most systems, a small limit doesn't make sense, see the discussion with
> regards to SHMMAX.
> 
> Therefore: increase MSGMNI to the maximum supported.
> 
> And: If we ignore the risk of locking too much memory, then an automatic
> scaling of MSGMNI doesn't make sense. Therefore the logic can be removed.
> 
> Notes:
> 1) If an administrator must limit the memory allocations, then he can set
> MSGMNI as necessary.
> 
> Or he can disable sysv entirely (as e.g. done by Android).
> 
> 2) MSGMAX and MSGMNB are intentionally not increased, as these values are used
> to control latency vs. throughput:
> If MSGMNB is large, then msgsnd() just returns and more messages can be queued
> before a task switch to a task that calls msgrcv() is forced.
> 
> Signed-off-by: Manfred Spraul 
> ---

Acked-by: Rafael Aquini 


>  include/linux/ipc_namespace.h | 20 --
>  include/uapi/linux/msg.h  | 28 +
>  ipc/Makefile  |  2 +-
>  ipc/ipc_sysctl.c  | 86 +---
>  ipc/ipcns_notifier.c  | 92 ---
>  ipc/msg.c | 36 +
>  ipc/namespace.c   | 22 ---
>  ipc/util.c| 40 ---
>  8 files changed, 23 insertions(+), 303 deletions(-)
>  delete mode 100644 ipc/ipcns_notifier.c
> 
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index 35e7eca..e365d5e 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -7,15 +7,6 @@
>  #include 
>  #include 
>  
> -/*
> - * ipc namespace events
> - */
> -#define IPCNS_MEMCHANGED   0x0001   /* Notify lowmem size changed */
> -#define IPCNS_CREATED  0x0002   /* Notify new ipc namespace created */
> -#define IPCNS_REMOVED  0x0003   /* Notify ipc namespace removed */
> -
> -#define IPCNS_CALLBACK_PRI 0
> -
>  struct user_namespace;
>  
>  struct ipc_ids {
> @@ -38,7 +29,6 @@ struct ipc_namespace {
>   unsigned intmsg_ctlmni;
>   atomic_tmsg_bytes;
>   atomic_tmsg_hdrs;
> - int auto_msgmni;
>  
>   size_t  shm_ctlmax;
>   size_t  shm_ctlall;
> @@ -77,18 +67,8 @@ extern atomic_t nr_ipc_ns;
>  extern spinlock_t mq_lock;
>  
>  #ifdef CONFIG_SYSVIPC
> -extern int register_ipcns_notifier(struct ipc_namespace *);
> -extern int cond_register_ipcns_notifier(struct ipc_namespace *);
> -extern void unregister_ipcns_notifier(struct ipc_namespace *);
> -extern int ipcns_notify(unsigned long);
>  extern void shm_destroy_orphaned(struct ipc_namespace *ns);
>  #else /* CONFIG_SYSVIPC */
> -static inline int register_ipcns_notifier(struct ipc_namespace *ns)
> -{ return 0; }
> -static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns)
> -{ return 0; }
> -static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
> -static inline int ipcns_notify(unsigned long l) { return 0; }
>  static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
>  #endif /* CONFIG_SYSVIPC */
>  
> diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h
> index a703755..2733ec8 100644
> --- a/include/uapi/linux/msg.h
> +++ b/include/uapi/linux/msg.h
> @@ -51,16 +51,28 @@ struct msginfo {
>  };
>  
>  /*
> - * Scaling factor to compute msgmni:
> - * the memory dedicated to msg queues (msgmni * msgmnb) should occupy
> - * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c):
> - * up to 8MB   : msgmni = 16 (MSGMNI)
> - * 4 GB: msgmni = 8K
> - * more than 16 GB : msgmni = 32K (IPCMNI)
> + * MSGMNI, MSGMAX and MSGMNB are default values which can be
> + * modified by sysctl.
> + *
> + * MSGMNI is the upper limit for the number of messages queues per
> + * namespace.
> + * It has been chosen to be as large as possible without facilitating
> + * scenarios where userspace causes overflows when adjusting the limits via
> + * operations of the form "retrieve current limit; add X; update limit".
> + *
> + * MSGMNB is the default size of a new message queue. Non-root tasks can
> + * decrease the size with msgctl(IPC_SET), root tasks
> + * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue
> + * size. The optimal value is application dependent.
> + * 16384 is used because it was always used (since 0.99.10)
> + *
> + * MSGMAX is the maximum size of an individual message, it's a global
> + * (per-namespace) limit that applies for all message queues.
> + * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into
> + * the queue. This is also an arbitrary choice (since 2.6.0).
>   */
> -#define MSG_MEM_SCALE 32
>  
> -#define MSGMNI  16   /* <= IPCMNI */ /* max # of msg queue identifiers */
> +#define MSGMNI  32000   /* <= IPCMNI */ /* max # of msg queue identifiers */
>  #define MSGMAX  8192   /* <= INT_MAX   max size of message (bytes) */
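
To put the new default in perspective, the worst-case locked kernel memory from full queues is bounded by msgmni * msgmnb. The arithmetic below is mine, not from the patch:

#include <stdio.h>

int main(void)
{
	long msgmni = 32000;	/* new default: max # of queues */
	long msgmnb = 16384;	/* unchanged default queue size */

	/* 32000 * 16384 bytes = 500 MiB of potentially pinned memory */
	printf("worst case: %ld MiB\n", (msgmni * msgmnb) >> 20);
	return 0;
}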



Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-14 Thread Rafael Aquini
On Fri, Aug 15, 2014 at 07:36:16AM +0400, Konstantin Khlebnikov wrote:
> Don't hurry. The code has been in this state for years.
> I'm working on patches for this, if everything goes well I'll show it today.
> As usual I couldn't stop myself from cleaning the mess, so it will be
> bigger than yours.
>
Sorry,

I didn't see this reply of yours before sending out an adjusted-and-tested 
version of that patch, and asked Sasha to check it against his test-case.

Please, do not hesitate in providing your change ideas, though. I'd really
appreciate your assessment feedback on that code. 

Cheers,
-- Rafael


Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-14 Thread Rafael Aquini
Here's a potential final version for the patch mentioned in a earlier message.
The nitpick I raised to myself and a couple of other minor typing issues
are fixed.

I did a preliminary test round in a KVM guest, ballooning memory in and out in
chunks of 1GB while a script within the guest was forcing compaction
concurrently; everything looked alright.

Sasha, could you give this a try to see if that reported KASAN warning
fades away, please?

Cheers,
-- Rafael

---8<---
From: Rafael Aquini 
Subject: [PATCH v2] mm: balloon_compaction: enhance balloon_page_movable()
 checkpoint against races

While testing linux-next for the Kernel Address Sanitizer patchset (KASAN)
Sasha Levin reported a buffer overflow warning triggered for
isolate_migratepages_range(), which later was discovered happening due to
a condition where balloon_page_movable() raced against move_to_new_page(),
while the latter was copying the page->mapping of an anon page.

Because we perform balloon_page_movable() in a lockless fashion at
isolate_migratepages_range(), the discovered race has unveiled that the scheme
actually used to spot ballooned pages among page blocks (checking
page_flags_cleared() and dereferencing page->mapping to test its mapping
flags) is weak and potentially prone to stumble across other similar
conditions in the future.

Following Konstantin Khlebnikov's and Andrey Ryabinin's suggestions,
this patch replaces the old page->flags && mapping->flags checking scheme
with a simpler and stronger page->_mapcount read-and-compare value test.
Similarly to what is done for PageBuddy() checks, BALLOON_PAGE_MAPCOUNT_VALUE
is introduced here to mark balloon pages. This allows balloon_page_movable()
to skip the proven troublesome dereference of page->mapping for flag checking
while it goes through isolate_migratepages_range() lockless rounds.
page->mapping dereference and flag-checking will be performed later, when
all locks are held properly.

Signed-off-by: Rafael Aquini 
---
 include/linux/balloon_compaction.h | 61 +++---
 mm/balloon_compaction.c| 59 ++--
 2 files changed, 54 insertions(+), 66 deletions(-)

diff --git a/include/linux/balloon_compaction.h 
b/include/linux/balloon_compaction.h
index 089743a..e00d5b0 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -108,54 +108,29 @@ static inline void balloon_mapping_free(struct 
address_space *balloon_mapping)
 }
 
 /*
- * page_flags_cleared - helper to perform balloon @page ->flags tests.
+ * balloon_page_movable - identify balloon pages that can be moved by
+ *   compaction / migration.
  *
- * As balloon pages are obtained from buddy and we do not play with page->flags
- * at driver level (exception made when we get the page lock for compaction),
- * we can safely identify a ballooned page by checking if the
- * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared.  This approach also
- * helps us skip ballooned pages that are locked for compaction or release, 
thus
- * mitigating their racy check at balloon_page_movable()
+ * BALLOON_PAGE_MAPCOUNT_VALUE must be <= -2 but better not too close to
+ * -2 so that an underflow of the page_mapcount() won't be mistaken
+ * for a genuine BALLOON_PAGE_MAPCOUNT_VALUE.
  */
-static inline bool page_flags_cleared(struct page *page)
+#define BALLOON_PAGE_MAPCOUNT_VALUE (-256)
+static inline bool balloon_page_movable(struct page *page)
 {
-   return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
+   return atomic_read(&page->_mapcount) == BALLOON_PAGE_MAPCOUNT_VALUE;
 }
 
-/*
- * __is_movable_balloon_page - helper to perform @page mapping->flags tests
- */
-static inline bool __is_movable_balloon_page(struct page *page)
+static inline void __balloon_page_set(struct page *page)
 {
-   struct address_space *mapping = page->mapping;
-   return mapping_balloon(mapping);
+   VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+   atomic_set(&page->_mapcount, BALLOON_PAGE_MAPCOUNT_VALUE);
 }
 
-/*
- * balloon_page_movable - test page->mapping->flags to identify balloon pages
- *   that can be moved by compaction/migration.
- *
- * This function is used at core compaction's page isolation scheme, therefore
- * most pages exposed to it are not enlisted as balloon pages and so, to avoid
- * undesired side effects like racing against __free_pages(), we cannot afford
- * holding the page locked while testing page->mapping->flags here.
- *
- * As we might return false positives in the case of a balloon page being just
- * released under us, the page->mapping->flags need to be re-tested later,
- * under the proper page lock, at the functions that will be coping with the
- * balloon page case.
- */
-static inline bool balloon_page_movable(struct page *page)
+static inline void __balloon_page_clear(struct page *page)
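
The archived copy of the patch truncates here. To make the intended usage
concrete, here is an illustrative sketch (kernel-style pseudocode, hypothetical
wrapper name, not a hunk from the patch) of the two-phase check the commit
message describes: the _mapcount sentinel is tested locklessly during the
compaction scan and re-validated under the page lock before the page is
actually isolated:

/* Illustrative sketch only; combines the lockless hint with the locked
 * re-check, the way isolate_migratepages_range() and
 * balloon_page_isolate() are expected to cooperate. */
static bool sketch_isolate_balloon_page(struct page *page)
{
        /*
         * Phase 1, lockless: the _mapcount sentinel is a cheap hint that
         * may race with the page being released, so it is only advisory.
         */
        if (!balloon_page_movable(page))
                return false;

        /*
         * Phase 2, locked: with the page lock held the sentinel and the
         * refcount are stable, so re-check before isolating for real.
         */
        if (!trylock_page(page))
                return false;

        if (balloon_page_movable(page) && page_count(page) == 2) {
                __isolate_balloon_page(page);
                unlock_page(page);
                return true;
        }

        unlock_page(page);
        return false;
}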

Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-14 Thread Rafael Aquini
On Thu, Aug 14, 2014 at 06:43:50PM -0300, Rafael Aquini wrote:
> On Thu, Aug 14, 2014 at 10:07:40PM +0400, Andrey Ryabinin wrote:
> > We discussed this with Konstantin and he suggested a better solution for 
> > this.
> > If I understood him correctly the main idea was to store bit
> > identifying ballon page
> > in struct page (special value in _mapcount), so we won't need to check
> > mapping->flags.
> >
> 
> Here goes what I thought doing, following that suggestion of Konstantin and 
> yours. (I didn't tested it yet)
> 
> Comments are welcomed.
> 
> Cheers,
> -- Rafael
> 
>  8< 
> From: Rafael Aquini 
> Subject: mm: balloon_compaction: enhance balloon_page_movable() checkpoint 
> against races
> 
> While testing linux-next for the Kernel Address Sanitizer patchset (KASAN) 
> Sasha Levin reported a buffer overflow warning triggered for 
> isolate_migratepages_range(), which later was discovered happening due to
> a condition where balloon_page_movable() raced against move_to_new_page(),
> while the latter was copying the page->mapping of an anon page.
> 
> Because we can perform balloon_page_movable() in a lockless fashion at 
> isolate_migratepages_range(), the discovered race has unveiled the scheme 
> actually used to spot ballooned pages among page blocks that checks for
> page_flags_cleared() and dereferences page->mapping to check its mapping flags
> is weak and potentially prone to stumble across other similar conditions 
> in the future.
> 
> Following Konstantin Khlebnikov's and Andrey Ryabinin's suggestions,
> this patch replaces the old page->flags && mapping->flags checking scheme
> with a more simple and strong page->_mapcount read and compare value test.
> Similarly to what is done for PageBuddy() checks, BALLOON_PAGE_MAPCOUNT_VALUE
> is introduced here to mark balloon pages. This allows balloon_page_movable()
> to skip the proven troublesome dereference of page->mapping for flag checking
> while it goes on isolate_migratepages_range() lockless rounds.
> page->mapping dereference and flag-checking will be performed later, when
> all locks are held properly.
> 
> ---
>  include/linux/balloon_compaction.h | 61 
> +++---
>  mm/balloon_compaction.c| 53 +
>  2 files changed, 45 insertions(+), 69 deletions(-)
> 
> diff --git a/include/linux/balloon_compaction.h 
> b/include/linux/balloon_compaction.h
> index 089743a..1409ccc 100644
> --- a/include/linux/balloon_compaction.h
> +++ b/include/linux/balloon_compaction.h
> @@ -108,54 +108,29 @@ static inline void balloon_mapping_free(struct 
> address_space *balloon_mapping)
>  }
>  
>  /*
> - * page_flags_cleared - helper to perform balloon @page ->flags tests.
> + * balloon_page_movable - identify balloon pages that can be moved by
> + * compaction / migration.
>   *
> - * As balloon pages are obtained from buddy and we do not play with 
> page->flags
> - * at driver level (exception made when we get the page lock for compaction),
> - * we can safely identify a ballooned page by checking if the
> - * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared.  This approach also
> - * helps us skip ballooned pages that are locked for compaction or release, 
> thus
> - * mitigating their racy check at balloon_page_movable()
> + * BALLOON_PAGE_MAPCOUNT_VALUE must be <= -2 but better not too close to
> + * -2 so that an underflow of the page_mapcount() won't be mistaken
> + * for a genuine BALLOON_PAGE_MAPCOUNT_VALUE.
>   */
> -static inline bool page_flags_cleared(struct page *page)
> +#define BALLOON_PAGE_MAPCOUNT_VALUE (-256)
> +static inline bool balloon_page_movable(struct page *page)
>  {
> - return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
> + return atomic_read(&page->_mapcount) == BALLOON_PAGE_MAPCOUNT_VALUE;
>  }
>  
> -/*
> - * __is_movable_balloon_page - helper to perform @page mapping->flags tests
> - */
> -static inline bool __is_movable_balloon_page(struct page *page)
> +static inline void __balloon_page_set(struct page *page)
>  {
> - struct address_space *mapping = page->mapping;
> - return mapping_balloon(mapping);
> + VM_BUG_ON_PAGE(!atomic_read(&page->_mapcount) != -1, page);
> + atomic_set(&page->_mapcount, BALLOON_PAGE_MAPCOUNT_VALUE);
>  }
>  
> -/*
> - * balloon_page_movable - test page->mapping->flags to identify balloon pages
> - * that can be moved by compaction/migration.
> - *
> - * This function is used at core compaction's page isolation scheme, 
> therefore
> - * most pages exposed to it are

Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-14 Thread Rafael Aquini
On Thu, Aug 14, 2014 at 10:07:40PM +0400, Andrey Ryabinin wrote:
> We discussed this with Konstantin and he suggested a better solution for this.
> If I understood him correctly the main idea was to store bit
> identifying ballon page
> in struct page (special value in _mapcount), so we won't need to check
> mapping->flags.
>

Here goes what I thought of doing, following that suggestion of Konstantin's
and yours. (I haven't tested it yet.)

Comments are welcome.

Cheers,
-- Rafael

 8< 
From: Rafael Aquini 
Subject: mm: balloon_compaction: enhance balloon_page_movable() checkpoint 
against races

While testing linux-next for the Kernel Address Sanitizer patchset (KASAN) 
Sasha Levin reported a buffer overflow warning triggered for 
isolate_migratepages_range(), which later was discovered happening due to
a condition where balloon_page_movable() raced against move_to_new_page(),
while the latter was copying the page->mapping of an anon page.

Because we can perform balloon_page_movable() in a lockless fashion at 
isolate_migratepages_range(), the discovered race has unveiled the scheme 
actually used to spot ballooned pages among page blocks that checks for
page_flags_cleared() and dereferences page->mapping to check its mapping flags
is weak and potentially prone to stumble across other similar conditions 
in the future.

Following Konstantin Khlebnikov's and Andrey Ryabinin's suggestions,
this patch replaces the old page->flags && mapping->flags checking scheme
with a more simple and strong page->_mapcount read and compare value test.
Similarly to what is done for PageBuddy() checks, BALLOON_PAGE_MAPCOUNT_VALUE
is introduced here to mark balloon pages. This allows balloon_page_movable()
to skip the proven troublesome dereference of page->mapping for flag checking
while it goes on isolate_migratepages_range() lockless rounds.
page->mapping dereference and flag-checking will be performed later, when
all locks are held properly.

---
 include/linux/balloon_compaction.h | 61 +++---
 mm/balloon_compaction.c| 53 +
 2 files changed, 45 insertions(+), 69 deletions(-)

diff --git a/include/linux/balloon_compaction.h 
b/include/linux/balloon_compaction.h
index 089743a..1409ccc 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -108,54 +108,29 @@ static inline void balloon_mapping_free(struct 
address_space *balloon_mapping)
 }
 
 /*
- * page_flags_cleared - helper to perform balloon @page ->flags tests.
+ * balloon_page_movable - identify balloon pages that can be moved by
+ *   compaction / migration.
  *
- * As balloon pages are obtained from buddy and we do not play with page->flags
- * at driver level (exception made when we get the page lock for compaction),
- * we can safely identify a ballooned page by checking if the
- * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared.  This approach also
- * helps us skip ballooned pages that are locked for compaction or release, 
thus
- * mitigating their racy check at balloon_page_movable()
+ * BALLOON_PAGE_MAPCOUNT_VALUE must be <= -2 but better not too close to
+ * -2 so that an underflow of the page_mapcount() won't be mistaken
+ * for a genuine BALLOON_PAGE_MAPCOUNT_VALUE.
  */
-static inline bool page_flags_cleared(struct page *page)
+#define BALLOON_PAGE_MAPCOUNT_VALUE (-256)
+static inline bool balloon_page_movable(struct page *page)
 {
-   return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
+   return atomic_read(&page->_mapcount) == BALLOON_PAGE_MAPCOUNT_VALUE;
 }
 
-/*
- * __is_movable_balloon_page - helper to perform @page mapping->flags tests
- */
-static inline bool __is_movable_balloon_page(struct page *page)
+static inline void __balloon_page_set(struct page *page)
 {
-   struct address_space *mapping = page->mapping;
-   return mapping_balloon(mapping);
+   VM_BUG_ON_PAGE(!atomic_read(&page->_mapcount) != -1, page);
+   atomic_set(&page->_mapcount, BALLOON_PAGE_MAPCOUNT_VALUE);
 }
 
-/*
- * balloon_page_movable - test page->mapping->flags to identify balloon pages
- *   that can be moved by compaction/migration.
- *
- * This function is used at core compaction's page isolation scheme, therefore
- * most pages exposed to it are not enlisted as balloon pages and so, to avoid
- * undesired side effects like racing against __free_pages(), we cannot afford
- * holding the page locked while testing page->mapping->flags here.
- *
- * As we might return false positives in the case of a balloon page being just
- * released under us, the page->mapping->flags need to be re-tested later,
- * under the proper page lock, at the functions that will be coping with the
- * balloon page case.
- */
-static inline bool balloon_page_movable(struct page *page)
+static inline void __balloon_page_clear(struct page *page)

Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-14 Thread Rafael Aquini
On Thu, Aug 14, 2014 at 10:07:40PM +0400, Andrey Ryabinin wrote:
> 2014-08-14 19:13 GMT+04:00 Rafael Aquini :
> > It's still a harmless condition as before, but considering what goes above
> > I'm now convinced & confident the patch proposed by Andrey is the real fix
> > for such occurrences.
> >
> 
> I don't think that it's harmless, because we could cross page boundary here 
> and
> try to read from a memory hole.
>
I think isolate_migratepages_range() skips over holes, doesn't it? 


> And this code has more potential problems like use after free. Since
> we don't hold locks properly here,
> page->mapping could point to freed struct address_space.
>
Thinking about how things go for isolate_migratepages_range() and balloon
pages, I struggle to find a way that could happen. OTOH, I failed
to spot more blatant issues before, so I won't argue here. Defensive
programming is always better than negating possibilities ;)

 
> We discussed this with Konstantin and he suggested a better solution for this.
> If I understood him correctly the main idea was to store bit
> identifying ballon page
> in struct page (special value in _mapcount), so we won't need to check
> mapping->flags.
>
I liked it. Something along the lines of the PageBuddy()/PAGE_BUDDY_MAPCOUNT_VALUE
scheme.
This is clearly cleaner than what we have in place today, and I'm
ashamed I didn't think of it before. Thanks for pointing that out.

Cheers,
-- Rafael
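
To make the scheme concrete: PageBuddy() already marks free pages by storing
a sentinel in page->_mapcount (PAGE_BUDDY_MAPCOUNT_VALUE, -128 in kernels of
this vintage), and the proposal mirrors that with a distinct sentinel for
balloon pages. Below is a tiny standalone model (plain C with a plain int
standing in for atomic_t, hypothetical names, illustration only) of why the
balloon sentinel is kept far away from -2:

#include <assert.h>
#include <stdio.h>

/* In the kernel, _mapcount == -1 means "no userspace mappings" and
 * page_mapcount() reads it back as _mapcount + 1. */
#define MAPCOUNT_UNMAPPED               (-1)
#define BALLOON_PAGE_MAPCOUNT_VALUE     (-256)

struct fake_page { int mapcount; };

static int balloon_page_movable(const struct fake_page *p)
{
        return p->mapcount == BALLOON_PAGE_MAPCOUNT_VALUE;
}

int main(void)
{
        struct fake_page p = { .mapcount = MAPCOUNT_UNMAPPED };

        /* Mark as a balloon page, mirroring __balloon_page_set(). */
        p.mapcount = BALLOON_PAGE_MAPCOUNT_VALUE;
        assert(balloon_page_movable(&p));

        /* Clear it again, mirroring __balloon_page_clear(). */
        p.mapcount = MAPCOUNT_UNMAPPED;
        assert(!balloon_page_movable(&p));

        /* A buggy extra unmap on a regular page underflows only to -2 or
         * slightly below; a sentinel at -256 is never mistaken for that. */
        p.mapcount = MAPCOUNT_UNMAPPED - 1;
        assert(!balloon_page_movable(&p));

        puts("sentinel model ok");
        return 0;
}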


Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-14 Thread Rafael Aquini
On Wed, Aug 13, 2014 at 12:35:02PM -0300, Rafael Aquini wrote:
> On Sun, Aug 10, 2014 at 12:49:47PM +0400, Andrey Ryabinin wrote:
> > 2014-08-10 5:45 GMT+04:00 Sasha Levin :
> > > Hi all,
> > >
> > > While fuzzing with trinity inside a KVM tools guest running the latest 
> > > -next
> > > kernel with the KASAN patchset, I've stumbled on the following spew:
> > >
> > >
> > > [ 3837.070452] 
> > > ==
> > > [ 3837.073101] AddressSanitizer: buffer overflow in 
> > > isolate_migratepages_range+0x85f/0xd90 at addr 88051b70eb49
> > > [ 3837.076310] page:ea00146dc380 count:0 mapcount:0 mapping:  
> > > (null) index:0x0
> > > [ 3837.079876] page flags: 0xaf80008000(tail)
> > > [ 3837.114592] page dumped because: kasan error
> > > [ 3837.115897] CPU: 4 PID: 29613 Comm: trinity-c467 Not tainted 
> > > 3.16.0-next-20140808-sasha-00051-gf368221 #1051
> > > [ 3837.118024]  00fc  ea00146dc380 
> > > 8801f326f718
> > > [ 3837.119837]  97e0d344 8801f326f7e8 8801f326f7d8 
> > > 9342d5bc
> > > [ 3837.121708]  ea00085163c0  8801f326f8e0 
> > > 93fe02fb
> > > [ 3837.123704] Call Trace:
> > > [ 3837.124272] dump_stack (lib/dump_stack.c:52)
> > > [ 3837.125166] kasan_report_error (mm/kasan/report.c:98 
> > > mm/kasan/report.c:166)
> > > [ 3837.126128] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:33)
> > > [ 3837.127462] ? retint_restore_args (arch/x86/kernel/entry_64.S:828)
> > > [ 3837.128753] __asan_load8 (mm/kasan/kasan.c:364)
> > > [ 3837.129914] ? isolate_migratepages_range 
> > > (./arch/x86/include/asm/bitops.h:311 include/linux/pagemap.h:70 
> > > include/linux/balloon_compaction.h:131 
> > > include/linux/balloon_compaction.h:156 mm/compaction.c:596)
> > > [ 3837.131613] isolate_migratepages_range 
> > > (./arch/x86/include/asm/bitops.h:311 include/linux/pagemap.h:70 
> > > include/linux/balloon_compaction.h:131 
> > > include/linux/balloon_compaction.h:156 mm/compaction.c:596)
> > > [ 3837.132838] compact_zone (mm/compaction.c:877 mm/compaction.c:1044)
> > > [ 3837.133818] compact_zone_order (mm/compaction.c:1106)
> > > [ 3837.134982] try_to_compact_pages (mm/compaction.c:1161)
> > > [ 3837.135970] __alloc_pages_direct_compact (mm/page_alloc.c:2313)
> > > [ 3837.137217] ? next_zones_zonelist (mm/mmzone.c:72)
> > > [ 3837.138861] __alloc_pages_nodemask (mm/page_alloc.c:2640 
> > > mm/page_alloc.c:2806)
> > > [ 3837.139897] ? check_chain_key (kernel/locking/lockdep.c:2190)
> > > [ 3837.141220] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
> > > [ 3837.142434] alloc_pages_vma (mm/mempolicy.c:2046)
> > > [ 3837.143479] ? do_huge_pmd_wp_page (mm/huge_memory.c:774 
> > > mm/huge_memory.c:1123)
> > > [ 3837.144663] do_huge_pmd_wp_page (mm/huge_memory.c:774 
> > > mm/huge_memory.c:1123)
> > > [ 3837.145653] handle_mm_fault (mm/memory.c:3312 mm/memory.c:3370)
> > > [ 3837.146717] ? vmacache_find (mm/vmacache.c:100 (discriminator 1))
> > > [ 3837.147404] ? find_vma (mm/mmap.c:2024)
> > > [ 3837.147982] __do_page_fault (arch/x86/mm/fault.c:1231)
> > > [ 3837.148613] ? context_tracking_user_exit 
> > > (kernel/context_tracking.c:184)
> > > [ 3837.149388] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
> > > [ 3837.150212] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2641 
> > > (discriminator 8))
> > > [ 3837.150977] ? trace_hardirqs_off (kernel/locking/lockdep.c:2647)
> > > [ 3837.151686] trace_do_page_fault (arch/x86/mm/fault.c:1314 
> > > include/linux/jump_label.h:115 include/linux/context_tracking_state.h:27 
> > > include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1315)
> > > [ 3837.152870] do_async_page_fault (arch/x86/kernel/kvm.c:279)
> > > [ 3837.153886] async_page_fault (arch/x86/kernel/entry_64.S:1313)
> > > [ 3837.155293] Read of size 8 by thread T29613:
> > > [ 3837.156058] Memory state around the buggy address:
> > > [ 3837.156885]  88051b70e880: 00 00 00 00 00 00 00 fc fc fc fc fc fc 
> > > fc fc fc
> > > [ 3837.158141]  88051b70e900: fc fc fc fc fc fc fc fc fc fc fc fc fc 
> > > fc fc fc
> > > [ 3837.159492]  88051b70e980: fc fc fc fc fc fc fc fc fc fc fc fc fc 
> > > fc fc fc
> > > [ 3837.160863]  88051b70ea0

Re: mm: compaction: buffer overflow in isolate_migratepages_range

2014-08-13 Thread Rafael Aquini
On Sun, Aug 10, 2014 at 12:49:47PM +0400, Andrey Ryabinin wrote:
> 2014-08-10 5:45 GMT+04:00 Sasha Levin :
> > Hi all,
> >
> > While fuzzing with trinity inside a KVM tools guest running the latest -next
> > kernel with the KASAN patchset, I've stumbled on the following spew:
> >
> >
> > [ 3837.070452] 
> > ==
> > [ 3837.073101] AddressSanitizer: buffer overflow in 
> > isolate_migratepages_range+0x85f/0xd90 at addr 88051b70eb49
> > [ 3837.076310] page:ea00146dc380 count:0 mapcount:0 mapping:  
> > (null) index:0x0
> > [ 3837.079876] page flags: 0xaf80008000(tail)
> > [ 3837.114592] page dumped because: kasan error
> > [ 3837.115897] CPU: 4 PID: 29613 Comm: trinity-c467 Not tainted 
> > 3.16.0-next-20140808-sasha-00051-gf368221 #1051
> > [ 3837.118024]  00fc  ea00146dc380 
> > 8801f326f718
> > [ 3837.119837]  97e0d344 8801f326f7e8 8801f326f7d8 
> > 9342d5bc
> > [ 3837.121708]  ea00085163c0  8801f326f8e0 
> > 93fe02fb
> > [ 3837.123704] Call Trace:
> > [ 3837.124272] dump_stack (lib/dump_stack.c:52)
> > [ 3837.125166] kasan_report_error (mm/kasan/report.c:98 
> > mm/kasan/report.c:166)
> > [ 3837.126128] ? trace_hardirqs_on_thunk (arch/x86/lib/thunk_64.S:33)
> > [ 3837.127462] ? retint_restore_args (arch/x86/kernel/entry_64.S:828)
> > [ 3837.128753] __asan_load8 (mm/kasan/kasan.c:364)
> > [ 3837.129914] ? isolate_migratepages_range 
> > (./arch/x86/include/asm/bitops.h:311 include/linux/pagemap.h:70 
> > include/linux/balloon_compaction.h:131 
> > include/linux/balloon_compaction.h:156 mm/compaction.c:596)
> > [ 3837.131613] isolate_migratepages_range 
> > (./arch/x86/include/asm/bitops.h:311 include/linux/pagemap.h:70 
> > include/linux/balloon_compaction.h:131 
> > include/linux/balloon_compaction.h:156 mm/compaction.c:596)
> > [ 3837.132838] compact_zone (mm/compaction.c:877 mm/compaction.c:1044)
> > [ 3837.133818] compact_zone_order (mm/compaction.c:1106)
> > [ 3837.134982] try_to_compact_pages (mm/compaction.c:1161)
> > [ 3837.135970] __alloc_pages_direct_compact (mm/page_alloc.c:2313)
> > [ 3837.137217] ? next_zones_zonelist (mm/mmzone.c:72)
> > [ 3837.138861] __alloc_pages_nodemask (mm/page_alloc.c:2640 
> > mm/page_alloc.c:2806)
> > [ 3837.139897] ? check_chain_key (kernel/locking/lockdep.c:2190)
> > [ 3837.141220] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
> > [ 3837.142434] alloc_pages_vma (mm/mempolicy.c:2046)
> > [ 3837.143479] ? do_huge_pmd_wp_page (mm/huge_memory.c:774 
> > mm/huge_memory.c:1123)
> > [ 3837.144663] do_huge_pmd_wp_page (mm/huge_memory.c:774 
> > mm/huge_memory.c:1123)
> > [ 3837.145653] handle_mm_fault (mm/memory.c:3312 mm/memory.c:3370)
> > [ 3837.146717] ? vmacache_find (mm/vmacache.c:100 (discriminator 1))
> > [ 3837.147404] ? find_vma (mm/mmap.c:2024)
> > [ 3837.147982] __do_page_fault (arch/x86/mm/fault.c:1231)
> > [ 3837.148613] ? context_tracking_user_exit (kernel/context_tracking.c:184)
> > [ 3837.149388] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
> > [ 3837.150212] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2641 
> > (discriminator 8))
> > [ 3837.150977] ? trace_hardirqs_off (kernel/locking/lockdep.c:2647)
> > [ 3837.151686] trace_do_page_fault (arch/x86/mm/fault.c:1314 
> > include/linux/jump_label.h:115 include/linux/context_tracking_state.h:27 
> > include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1315)
> > [ 3837.152870] do_async_page_fault (arch/x86/kernel/kvm.c:279)
> > [ 3837.153886] async_page_fault (arch/x86/kernel/entry_64.S:1313)
> > [ 3837.155293] Read of size 8 by thread T29613:
> > [ 3837.156058] Memory state around the buggy address:
> > [ 3837.156885]  88051b70e880: 00 00 00 00 00 00 00 fc fc fc fc fc fc fc 
> > fc fc
> > [ 3837.158141]  88051b70e900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
> > fc fc
> > [ 3837.159492]  88051b70e980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
> > fc fc
> > [ 3837.160863]  88051b70ea00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
> > 00 00
> > [ 3837.162165]  88051b70ea80: 00 00 00 00 00 00 00 fc fc fc fc fc fc fc 
> > fc fc
> > [ 3837.163552] >88051b70eb00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
> > fc fc
> > [ 3837.164866]   ^
> > [ 3837.165914]  88051b70eb80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
> > fc fc
> > [ 3837.167317]  88051b70ec00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
> > fb fb
> > [ 3837.168616]  88051b70ec80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
> > fb fb
> > [ 3837.169898]  88051b70ed00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
> > fb fb
> > [ 3837.171298]  88051b70ed80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb 
> > fb fb
> > [ 3837.172611] 
> > ==
> >
> >
> > Thanks,
> > Sasha

Nice pick from the sanitizer bits!

> 
> Bad access happens when we read page->mapping->flags in mapping_balloon().
> Address of page->mapping->flags here is 88051b70eb49, so the
> lowest bit is set,
> which means that 
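
The archived message is cut off above, but the low-bit observation is the
crux: the kernel tags page->mapping with PAGE_MAPPING_ANON in bit 0 to say
the field actually holds an anon_vma pointer rather than a struct
address_space pointer, which is why dereferencing it as a mapping reads
garbage. An illustrative sketch of a guard built on that tag (hypothetical
helper name, not a proposed hunk):

/* Illustrative sketch: recognize the anon tag before treating
 * page->mapping as a struct address_space pointer. PAGE_MAPPING_ANON is
 * the kernel's bit-0 tag for anon_vma pointers. */
static inline bool page_has_real_mapping(struct page *page)
{
        unsigned long m = (unsigned long)page->mapping;

        return m != 0 && !(m & PAGE_MAPPING_ANON);
}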

[PATCH v2] mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces

2014-06-26 Thread Rafael Aquini
Historically, we exported shared pages to userspace via sysinfo(2) sharedram
and /proc/meminfo's "MemShared" fields. With the advent of tmpfs, from kernel
v2.4 onward, that old way for accounting shared mem was deemed inaccurate and
we started to export a hard-coded 0 for sysinfo.sharedram. Later on, during
the 2.6 timeframe, "MemShared" got re-introduced to /proc/meminfo re-branded
as "Shmem", but we're still reporting sysinfo.sharedmem as that old hard-coded
zero, which makes the "shared memory" report inconsistent across interfaces.

This patch leverages the addition of explicit accounting for pages used by
shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem vmstat" -- in order to
make the users of sysinfo(2) and si_meminfo*() friends aware of that
vmstat entry and make them report it consistently across the interfaces,
as well as to make the data returned by sysinfo(2) consistent with what our
current API documentation states.

Signed-off-by: Rafael Aquini 
---
Changelog from v1:
- updated commit log message to include historical context   (kosaki-san)

 drivers/base/node.c | 2 +-
 fs/proc/meminfo.c   | 2 +-
 mm/page_alloc.c | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 8f7ed99..c6d3ae0 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev,
   nid, K(node_page_state(nid, NR_FILE_PAGES)),
   nid, K(node_page_state(nid, NR_FILE_MAPPED)),
   nid, K(node_page_state(nid, NR_ANON_PAGES)),
-  nid, K(node_page_state(nid, NR_SHMEM)),
+  nid, K(i.sharedram),
   nid, node_page_state(nid, NR_KERNEL_STACK) *
THREAD_SIZE / 1024,
   nid, K(node_page_state(nid, NR_PAGETABLE)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7445af0..aa1eee0 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
-   K(global_page_state(NR_SHMEM)),
+   K(i.sharedram),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20d17f8..f72ea38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3040,7 +3040,7 @@ static inline void show_node(struct zone *zone)
 void si_meminfo(struct sysinfo *val)
 {
val->totalram = totalram_pages;
-   val->sharedram = 0;
+   val->sharedram = global_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
@@ -3060,6 +3060,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages;
+   val->sharedram = node_page_state(nid, NR_SHMEM);
val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
-- 
1.9.3
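
For a quick userspace check of what this patch changes, here is a minimal
program (illustration only, glibc sysinfo(3) wrapper assumed) that reads
sharedram; before the patch it prints 0, afterwards it should track
/proc/meminfo's Shmem:

#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
        struct sysinfo si;

        if (sysinfo(&si) != 0) {
                perror("sysinfo");
                return 1;
        }

        /* mem_unit is the size in bytes of each memory field unit. */
        printf("sharedram: %lu kB\n", si.sharedram * si.mem_unit / 1024);
        return 0;
}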



Re: [PATCH] mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces

2014-06-25 Thread Rafael Aquini
On Wed, Jun 25, 2014 at 01:27:53PM -0700, Motohiro Kosaki wrote:
> 
> 
> > -Original Message-
> > From: Rafael Aquini [mailto:aqu...@redhat.com]
> > Sent: Wednesday, June 25, 2014 4:16 PM
> > To: Motohiro Kosaki
> > Cc: linux...@kvack.org; Andrew Morton; Rik van Riel; Mel Gorman; Johannes 
> > Weiner; Motohiro Kosaki JP; linux-
> > ker...@vger.kernel.org
> > Subject: Re: [PATCH] mm: export NR_SHMEM via sysinfo(2) / si_meminfo() 
> > interfaces
> > 
> > On Wed, Jun 25, 2014 at 12:41:17PM -0700, Motohiro Kosaki wrote:
> > >
> > >
> > > > -Original Message-
> > > > From: Rafael Aquini [mailto:aqu...@redhat.com]
> > > > Sent: Wednesday, June 25, 2014 2:40 PM
> > > > To: linux...@kvack.org
> > > > Cc: Andrew Morton; Rik van Riel; Mel Gorman; Johannes Weiner;
> > > > Motohiro Kosaki JP; linux-kernel@vger.kernel.org
> > > > Subject: [PATCH] mm: export NR_SHMEM via sysinfo(2) / si_meminfo()
> > > > interfaces
> > > >
> > > > This patch leverages the addition of explicit accounting for pages
> > > > used by shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem vmstat"
> > > > -- in order to make the users of sysinfo(2) and si_meminfo*() friends 
> > > > aware of that vmstat entry consistently across the interfaces.
> > >
> > > Why?
> > 
> > Because we do not report that data consistently across the interfaces that 
> > claim to export it. Check the sysinfo(2) manpage, for instance:
> > [...]
> >struct sysinfo {
> >long uptime; /* Seconds since boot */
> >unsigned long loads[3];  /* 1, 5, and 15 minute load 
> > averages */
> >unsigned long totalram;  /* Total usable main memory size */
> >unsigned long freeram;   /* Available memory size */
> >unsigned long sharedram; /* Amount of shared memory */ <<<<< 
> > [...]
> > 
> > userspace tools resorting to sysinfo() syscall will get a hardcoded 0 for 
> > shared memory which is reported differently from
> > /proc/meminfo.
> > 
> > Also, si_meminfo() & si_meminfo_node() are utilized within the kernel to 
> > gather statistics for /proc/meminfo & friends, and so we
> > can leverage collecting sharedmem from those calls as well, just as we do 
> > for totalram, freeram & bufferram.
> 
> But "Amount of shared memory" didn't mean the amount of shmem. It actually 
> meant the amount of pages with page-count >= 2.
> Again, there is a possibility to change the semantics. But I don't have 
> enough userland knowledge to do so. Please investigate
> and explain why your change doesn't break any userland. 

I agree that reporting the amount of shared pages in that historical fashion 
might not be interesting for userspace tools resorting to sysinfo(2) nowadays.

OTOH, our documentation implies we do return shared memory there, and FWIW,
considering the other places where we export the "shared memory" concept to 
userspace nowadays, we are suggesting it's the amount of tmpfs/shmem, and not
the amount of shared mapped pages it historically represented once. What is
really confusing is having a field that supposedly/expectedly would return the
amount of shmem to userspace queries, but instead returns a hard-coded zero (0).

I could easily find evidence of user complaints/confusion about this 
semantic inconsistency in the past, as in:
https://groups.google.com/forum/#!topic/comp.os.linux.development.system/ogWVn6XdvGA

or in:
http://marc.info/?l=net-snmp-cvs&m=132148788500667

which suggests users seem to have always understood it as being shmem/tmpfs
usage, as the /proc/meminfo field "MemShared" was tied directly to
sysinfo.sharedram. Historically we reported shared memory that way, and
when it no longer accurately meant that, a 0 was hardcoded there so as
not to break compatibility with older tools (older than 2.4).
In 2.6 we got rid of meminfo's "MemShared" until 2009, when you sort of
re-introduced it, re-branded as Shmem. IMO, we should leverage what we 
have in the kernel now and take this change to make the exposed data consistent 
across the interfaces that export it today -- sysinfo(2) & /proc/meminfo.

This is not a hard requirement, though; it's rather a simple maintenance
nitpick from code review. 
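
A minimal userspace sketch (not part of the patch; it assumes only the
standard glibc sysinfo(2) wrapper and the /proc/meminfo "Shmem:" line) makes
the inconsistency easy to see -- before this change the first figure is
always 0 while the second reports actual shmem usage:

/* demo: compare sysinfo(2)'s sharedram with /proc/meminfo's Shmem */
#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;
	char line[256];
	unsigned long shmem_kb = 0;
	FILE *f;

	if (sysinfo(&si) != 0)
		return 1;

	f = fopen("/proc/meminfo", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "Shmem: %lu kB", &shmem_kb) == 1)
			break;
	fclose(f);

	/* sysinfo reports in units of si.mem_unit bytes; convert to kB */
	printf("sysinfo sharedram:   %lu kB\n",
	       si.sharedram * si.mem_unit / 1024);
	printf("/proc/meminfo Shmem: %lu kB\n", shmem_kb);
	return 0;
}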

Regards,
-- Rafael


Re: [PATCH] mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces

2014-06-25 Thread Rafael Aquini
On Wed, Jun 25, 2014 at 12:41:17PM -0700, Motohiro Kosaki wrote:
> 
> 
> > -Original Message-
> > From: Rafael Aquini [mailto:aqu...@redhat.com]
> > Sent: Wednesday, June 25, 2014 2:40 PM
> > To: linux...@kvack.org
> > Cc: Andrew Morton; Rik van Riel; Mel Gorman; Johannes Weiner; Motohiro 
> > Kosaki JP; linux-kernel@vger.kernel.org
> > Subject: [PATCH] mm: export NR_SHMEM via sysinfo(2) / si_meminfo() 
> > interfaces
> > 
> > This patch leverages the addition of explicit accounting for pages used by 
> > shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem
> > vmstat" -- in order to make the users of sysinfo(2) and si_meminfo*() 
> > friends aware of that vmstat entry consistently across the
> > interfaces.
> 
> Why?

Because we do not report that data consistently across the interfaces that
claim to export it. Check the sysinfo(2) manpage, for instance:
[...]
   struct sysinfo {
   long uptime; /* Seconds since boot */
   unsigned long loads[3];  /* 1, 5, and 15 minute load averages */
   unsigned long totalram;  /* Total usable main memory size */
   unsigned long freeram;   /* Available memory size */
   unsigned long sharedram; /* Amount of shared memory */ <<<<<
[...]

userspace tools resorting to the sysinfo() syscall will get a hardcoded 0
for shared memory, which is reported differently from /proc/meminfo.

Also, si_meminfo() & si_meminfo_node() are utilized within the kernel to
gather statistics for /proc/meminfo & friends, so we can collect sharedram
from those calls as well, just as we do for totalram, freeram & 
bufferram.

Regards,
-- Rafael

> Traditionally sysinfo.sharedram was not used for shmem. It had totally 
> strange semantics and is a completely outdated feature. 
> So, we may reuse it for another purpose. But I'm not sure of its benefit. 
> 
> Why don't you use /proc/meminfo?
> I'm afraid userland programs will get confused. 
> 
> 
> > 
> > Signed-off-by: Rafael Aquini 
> > ---
> >  drivers/base/node.c | 2 +-
> >  fs/proc/meminfo.c   | 2 +-
> >  mm/page_alloc.c | 3 ++-
> >  3 files changed, 4 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/base/node.c b/drivers/base/node.c index 
> > 8f7ed99..c6d3ae0 100644
> > --- a/drivers/base/node.c
> > +++ b/drivers/base/node.c
> > @@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev,
> >nid, K(node_page_state(nid, NR_FILE_PAGES)),
> >nid, K(node_page_state(nid, NR_FILE_MAPPED)),
> >nid, K(node_page_state(nid, NR_ANON_PAGES)),
> > -  nid, K(node_page_state(nid, NR_SHMEM)),
> > +  nid, K(i.sharedram),
> >nid, node_page_state(nid, NR_KERNEL_STACK) *
> > THREAD_SIZE / 1024,
> >nid, K(node_page_state(nid, NR_PAGETABLE)),
> > diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> > index 7445af0..aa1eee0 100644
> > --- a/fs/proc/meminfo.c
> > +++ b/fs/proc/meminfo.c
> > @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void 
> > *v)
> > K(global_page_state(NR_WRITEBACK)),
> > K(global_page_state(NR_ANON_PAGES)),
> > K(global_page_state(NR_FILE_MAPPED)),
> > -   K(global_page_state(NR_SHMEM)),
> > +   K(i.sharedram),
> > K(global_page_state(NR_SLAB_RECLAIMABLE) +
> > global_page_state(NR_SLAB_UNRECLAIMABLE)),
> > K(global_page_state(NR_SLAB_RECLAIMABLE)),
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 20d17f8..f72ea38 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -3040,7 +3040,7 @@ static inline void show_node(struct zone *zone)
> >  void si_meminfo(struct sysinfo *val)
> >  {
> > val->totalram = totalram_pages;
> > -   val->sharedram = 0;
> > +   val->sharedram = global_page_state(NR_SHMEM);
> > val->freeram = global_page_state(NR_FREE_PAGES);
> > val->bufferram = nr_blockdev_pages();
> > val->totalhigh = totalhigh_pages;
> > @@ -3060,6 +3060,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
> > for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
> > managed_pages += pgdat->node_zones[zone_type].managed_pages;
> > val->totalram = managed_pages;
> > +   val->sharedram = node_page_state(nid, NR_SHMEM);
> > val->freeram = node_page_state(nid, NR_FREE_PAGES);
> >  #ifdef CONFIG_HIGHMEM
> > val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
> > --
> > 1.9.3
> 


[PATCH] mm: export NR_SHMEM via sysinfo(2) / si_meminfo() interfaces

2014-06-25 Thread Rafael Aquini
This patch leverages the addition of explicit accounting for pages used by
shmem/tmpfs -- "4b02108 mm: oom analysis: add shmem vmstat" -- in order to
make the users of sysinfo(2) and si_meminfo*() friends aware of that
vmstat entry consistently across the interfaces.

Signed-off-by: Rafael Aquini 
---
 drivers/base/node.c | 2 +-
 fs/proc/meminfo.c   | 2 +-
 mm/page_alloc.c | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 8f7ed99..c6d3ae0 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev,
   nid, K(node_page_state(nid, NR_FILE_PAGES)),
   nid, K(node_page_state(nid, NR_FILE_MAPPED)),
   nid, K(node_page_state(nid, NR_ANON_PAGES)),
-  nid, K(node_page_state(nid, NR_SHMEM)),
+  nid, K(i.sharedram),
   nid, node_page_state(nid, NR_KERNEL_STACK) *
THREAD_SIZE / 1024,
   nid, K(node_page_state(nid, NR_PAGETABLE)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7445af0..aa1eee0 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
-   K(global_page_state(NR_SHMEM)),
+   K(i.sharedram),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20d17f8..f72ea38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3040,7 +3040,7 @@ static inline void show_node(struct zone *zone)
 void si_meminfo(struct sysinfo *val)
 {
val->totalram = totalram_pages;
-   val->sharedram = 0;
+   val->sharedram = global_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
@@ -3060,6 +3060,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages;
+   val->sharedram = node_page_state(nid, NR_SHMEM);
val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
-- 
1.9.3



Re: [PATCH 0/8] mm: add page cache limit and reclaim feature

2014-06-16 Thread Rafael Aquini
On Mon, Jun 16, 2014 at 01:14:22PM +0200, Michal Hocko wrote:
> On Mon 16-06-14 17:24:38, Xishi Qiu wrote:
> > When a system (e.g. a smart phone) has been running for a long time, the cache
> > often takes up a lot of memory; maybe free memory drops below 50M, and then an
> > OOM will happen if an app suddenly allocates large-order pages while memory
> > reclaim is too slow. 
> 
> Have you ever seen this happen? Page cache should be easy to reclaim, and
> if there is too much dirty memory then you should be able to tune the
> amount by dirty_bytes/ratio knob. If the page allocator falls back to
> OOM and there is a lot of page cache then I would call it a bug. I do
> not think that limiting the amount of the page cache globally makes
> sense. There are Unix systems which offer this feature but I think it is
> a bad interface which only papers over the reclaim inefficiency or lack
> of other isolations between loads.
>
+1

It would be good if you could show some numbers that serve as evidence
for your theory of "excessive" pagecache acting as a trigger for your
observed OOMs. I'm assuming, by your 'e.g.', that you're running a swapless
system, so I would think your system's OOMs are due to an inability to
reclaim anon memory rather than pagecache.

 
> > Use "echo 3 > /proc/sys/vm/drop_caches" will drop the whole cache, this will
> > affect the performance, so it is used for debugging only. 
> > 

If you are able to drop the whole pagecache by issuing the command
above, then it means the majority of it is just unmapped cache pages, 
and those would normally be reclaimed on demand by the PFRA. One more 
thing that makes me wonder whether you're just seeing the effect of a leaky
app making the system unable to swap out anon pages.


> > SUSE has this feature; I tested it before, but it cannot actually limit
> > the page cache. So I rewrote the feature and added some parameters.
> 
> The feature is there for historic reasons and I _really_ think the
> interface is not appropriate. If there is a big pagecache usage which
> affects other loads then Memory cgroup controller can be used to help
> from interference.
> 
> > Christoph Lameter has written a patch "Limit the size of the pagecache"
> > http://marc.info/?l=linux-mm&m=116959990228182&w=2
> > It makes its changes in the zone fallback path, which is not a good way.
> > 
> > The patchset is based on v3.15, it introduces two features, page cache limit
> > and page cache reclaim in circles.
> > 
> > Add four parameters in /proc/sys/vm
> > 
> > 1) cache_limit_mbytes
> > This is used to limit page cache amount.
> > The input unit is MB, value range is from 0 to totalram_pages.
> > If this is set to 0, it will not limit page cache.
> > When written to the file, cache_limit_ratio will be updated too.
> > The default value is 0.
> > 
> > 2) cache_limit_ratio
> > This is used to limit page cache amount.
> > The input unit is percent, value range is from 0 to 100.
> > If this is set to 0, it will not limit page cache.
> > When written to the file, cache_limit_mbytes will be updated too.
> > The default value is 0.
> > 
> > 3) cache_reclaim_s
> > This is used to reclaim page cache in circles.
> > The input unit is second, the minimum value is 0.
> > If this is set to 0, it will disable the feature.
> > The default value is 0.
> > 
> > 4) cache_reclaim_weight
> > This is used to speed up page cache reclaim.
> > It depends on enabling cache_limit_mbytes/cache_limit_ratio or 
> > cache_reclaim_s.
> > Value range is from 1(slow) to 100(fast).
> > The default value is 1.
> > 
> > I tested the two features on my system(x86_64), it seems to work right.
> > However, as it changes the hot path "add_to_page_cache_lru()", I don't know
> > how much it will the affect the performance, maybe there are some errors
> > in the patches too, RFC.
> 
> I haven't looked at patches yet but you would need to explain why the
> feature is needed much better and why the existing features are not
> sufficient.
> -- 
> Michal Hocko
> SUSE Labs
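
For what it's worth, a quick way to check the point above about unmapped
cache pages is to watch the Cached figure around a drop_caches write (run
'echo 3 > /proc/sys/vm/drop_caches' as root between two runs). A small
hypothetical helper, not part of the patchset under discussion:

/* print the Cached: figure from /proc/meminfo */
#include <stdio.h>

int main(void)
{
	char line[256];
	unsigned long cached_kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Cached: %lu kB", &cached_kb) == 1) {
			printf("Cached: %lu kB\n", cached_kb);
			break;
		}
	}
	fclose(f);
	return 0;
}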


Re: [PATCH] mm/vmscan.c: use DIV_ROUND_UP for calculation of zone's balance_gap and correct comments.

2014-05-19 Thread Rafael Aquini
On Mon, May 19, 2014 at 12:08:30PM +0800, Jianyu Zhan wrote:
> Currently, we use (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
> KSWAPD_ZONE_BALANCE_GAP_RATIO to avoid a zero gap value. It's better to
> use DIV_ROUND_UP macro for neater code and clear meaning.
> 
> Besides, the gap value is calculated against the per-zone "managed pages",
> not "present pages". This patch also corrects the comment and does some
> rephrasing.
> 
> Signed-off-by: Jianyu Zhan 
> ---
Acked-by: Rafael Aquini 

>  include/linux/swap.h |  8 
>  mm/vmscan.c  | 10 --
>  2 files changed, 8 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 5a14b92..58e1696 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -166,10 +166,10 @@ enum {
>  #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
>  
>  /*
> - * Ratio between the present memory in the zone and the "gap" that
> - * we're allowing kswapd to shrink in addition to the per-zone high
> - * wmark, even for zones that already have the high wmark satisfied,
> - * in order to provide better per-zone lru behavior. We are ok to
> + * Ratio between zone->managed_pages and the "gap" that above the per-zone
> + * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
> + * do not meet the (high_wmark + gap) watermark, even which already met the
> + * high_wmark, in order to provide better per-zone lru behavior. We are ok to
>   * spend not more than 1% of the memory for this zone balancing "gap".
>   */
>  #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 32c661d..9ef9f6c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2268,9 +2268,8 @@ static inline bool compaction_ready(struct zone *zone, 
> struct scan_control *sc)
>* there is a buffer of free pages available to give compaction
>* a reasonable chance of completing and allocating the page
>*/
> - balance_gap = min(low_wmark_pages(zone),
> - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
> - KSWAPD_ZONE_BALANCE_GAP_RATIO);
> + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
> + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
>   watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
>   watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
>  
> @@ -2891,9 +2890,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
>* high wmark plus a "gap" where the gap is either the low
>* watermark or 1% of the zone, whichever is smaller.
>*/
> - balance_gap = min(low_wmark_pages(zone),
> - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
> - KSWAPD_ZONE_BALANCE_GAP_RATIO);
> + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
> + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
>  
>   /*
>* If there is no low memory pressure or the zone is balanced then no
> -- 
> 2.0.0-rc3
> 
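
For reference, DIV_ROUND_UP(n, d) is defined in include/linux/kernel.h as
((n) + (d) - 1) / (d), i.e. exactly the open-coded expression being replaced,
so the patch does not change behavior. A standalone sketch of the balance_gap
computation, with made-up zone numbers:

/* userspace sketch mirroring the kernel macros involved */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define KSWAPD_ZONE_BALANCE_GAP_RATIO	100
#define min(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long managed_pages = 262144;	/* hypothetical 1GB zone */
	unsigned long low_wmark = 1024;		/* hypothetical low watermark */

	/* gap is at most 1% of the zone, capped by the low watermark */
	unsigned long balance_gap = min(low_wmark,
		DIV_ROUND_UP(managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));

	/* DIV_ROUND_UP(262144, 100) = 2622, so balance_gap = 1024 here */
	printf("balance_gap = %lu pages\n", balance_gap);
	return 0;
}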


Re: commit 0bf1457f0cfca7b " mm: vmscan: do not swap anon pages just because free+file is low" causes heavy performance regression on paging

2014-04-22 Thread Rafael Aquini
On Tue, Apr 22, 2014 at 11:06:56AM -0400, Johannes Weiner wrote:
> Hi Christian,
> 
> On Tue, Apr 22, 2014 at 12:55:37PM +0200, Christian Borntraeger wrote:
> > While preparing/testing some KVM on s390 patches for the next merge window 
> > (target is kvm/next which is based on 3.15-rc1) I faced a very severe 
> > performance hiccup on guest paging (all anonymous memory).
> > 
> > All memory-bound guests are in "D" state now and the system is barely 
> > usable.
> > 
> > Reverting commit 0bf1457f0cfca7bc026a82323ad34bcf58ad035d
> > "mm: vmscan: do not swap anon pages just because free+file is low" makes 
> > the problem go away.
> > 
> > According to /proc/vmstat the system is now in direct reclaim almost all 
> > the time for every page fault (more than 10x more direct reclaims than 
> > kswap reclaims)
> > With the patch being reverted everything is fine again.
> 
> Ouch.  Yes, I think we have to revert this for now.
> 
> How about this?
> 
> ---
> From: Johannes Weiner 
> Subject: [patch] Revert "mm: vmscan: do not swap anon pages just because
>  free+file is low"
> 
> This reverts commit 0bf1457f0cfc ("mm: vmscan: do not swap anon pages
> just because free+file is low") because it introduced a regression in
> mostly-anonymous workloads, where reclaim would become ineffective and
> trap every allocating task in direct reclaim.
> 
> The problem is that there is a runaway feedback loop in the scan
> balance between file and anon, where the balance tips heavily towards
> a tiny thrashing file LRU and anonymous pages are no longer being
> looked at.  The commit in question removed the safe guard that would
> detect such situations and respond with forced anonymous reclaim.
> 
> This commit was part of a series to fix premature swapping in loads
> with relatively little cache, and while it made a small difference,
> the cure is obviously worse than the disease.  Revert it.
> 
> Reported-by: Christian Borntraeger 
> Signed-off-by: Johannes Weiner 
> Cc: sta...@kernel.org   [3.12+]
> ---
>  mm/vmscan.c | 18 ++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9b6497eda806..169acb8e31c9 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1916,6 +1916,24 @@ static void get_scan_count(struct lruvec *lruvec, 
> struct scan_control *sc,
>   get_lru_size(lruvec, LRU_INACTIVE_FILE);
>  
>   /*
> +  * Prevent the reclaimer from falling into the cache trap: as
> +  * cache pages start out inactive, every cache fault will tip
> +  * the scan balance towards the file LRU.  And as the file LRU
> +  * shrinks, so does the window for rotation from references.
> +  * This means we have a runaway feedback loop where a tiny
> +  * thrashing file LRU becomes infinitely more attractive than
> +  * anon pages.  Try to detect this based on file LRU size.
> +  */
> + if (global_reclaim(sc)) {
> + unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
> +
> + if (unlikely(file + free <= high_wmark_pages(zone))) {
> + scan_balance = SCAN_ANON;
> + goto out;
> + }
> + }
> +
> + /*
>* There is enough inactive page cache, do not reclaim
>* anything from the anonymous working set right now.
>*/
> -- 
> 1.9.2
> 
Acked-by: Rafael Aquini 
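
To make the reinstated guard concrete, a sketch with invented numbers (the
32768-page high watermark is an assumption, not from the thread): once the
file LRU plus free pages no longer clear the zone's high watermark, the scan
balance is forced to anon:

/* userspace sketch of the guard's arithmetic, invented numbers */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	unsigned long high_wmark = 32768;	/* zone high watermark, pages */
	unsigned long file = 20000;		/* pages on the file LRUs */
	unsigned long free = 10000;		/* NR_FREE_PAGES */

	/* file + free = 30000 <= 32768: the remaining file cache is
	 * likely thrashing, so scan only the anon lists */
	bool force_anon = (file + free <= high_wmark);

	printf("scan_balance = %s\n", force_anon ? "SCAN_ANON" : "default");
	return 0;
}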


Re: commit 0bf1457f0cfca7b " mm: vmscan: do not swap anon pages just because free+file is low" causes heavy performance regression on paging

2014-04-22 Thread Rafael Aquini
On Tue, Apr 22, 2014 at 10:40:17AM -0400, Rik van Riel wrote:
> On 04/22/2014 07:57 AM, Christian Borntraeger wrote:
> > On 22/04/14 12:55, Christian Borntraeger wrote:
> >> While preparing/testing some KVM on s390 patches for the next merge window 
> >> (target is kvm/next which is based on 3.15-rc1) I faced a very severe 
> >> performance hiccup on guest paging (all anonymous memory).
> >>
> >> All memory-bound guests are in "D" state now and the system is barely 
> >> usable.
> >>
> >> Reverting commit 0bf1457f0cfca7bc026a82323ad34bcf58ad035d
> >> "mm: vmscan: do not swap anon pages just because free+file is low" makes 
> >> the problem go away.
> >>
> >> According to /proc/vmstat the system is now in direct reclaim almost all 
> >> the time for every page fault (more than 10x more direct reclaims than 
> >> kswap reclaims)
> >> With the patch being reverted everything is fine again.
> >>
> >> Any ideas?
> > 
> > Here is an idea to tackle my problem and the original problem:
> > 
> > reverting  0bf1457f0cfca7bc026a82323ad34bcf58ad035d + checking against low, 
> > also seems to make my system usable.
> > 
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1923,7 +1923,7 @@ static void get_scan_count(struct lruvec *lruvec, 
> > struct scan_control *sc,
> >  */
> > if (global_reclaim(sc)) {
> > free = zone_page_state(zone, NR_FREE_PAGES);
> > -   if (unlikely(file + free <= high_wmark_pages(zone))) {
> > +   if (unlikely(file + free <= low_wmark_pages(zone))) {
> > scan_balance = SCAN_ANON;
> > goto out;
> > }
> > 
> 
> Looks reasonable to me.
+1



Re: [PATCH] mm: Only force scan in reclaim when none of the LRUs are big enough.

2014-03-28 Thread Rafael Aquini
On Sat, Mar 15, 2014 at 08:36:02PM -0700, Hugh Dickins wrote:
> From: Suleiman Souhlal 
> 
> Prior to this change, we would decide whether to force scan a LRU
> during reclaim if that LRU itself was too small for the current
> priority. However, this can lead to the file LRU getting force
> scanned even if there are a lot of anonymous pages we can reclaim,
> leading to hot file pages getting needlessly reclaimed.
> 
> To address this, we instead only force scan when none of the
> reclaimable LRUs are big enough.
> 
> Gives huge improvements with zswap. For example, when doing -j20
> kernel build in a 500MB container with zswap enabled, runtime (in
> seconds) is greatly reduced:
> 
> x without this change
> + with this change
> N   Min   MaxMedian   AvgStddev
> x   5   700.997   790.076   763.928754.05  39.59493
> +   5   141.634   197.899   155.706 161.9 21.270224
> Difference at 95.0% confidence
> -592.15 +/- 46.3521
> -78.5293% +/- 6.14709%
> (Student's t, pooled s = 31.7819)
> 
> Should also give some improvements in regular (non-zswap) swap cases.
> 
> Yes, hughd found significant speedup using regular swap, with several
> memcgs under pressure; and it should also be effective in the non-memcg
> case, whenever one or another zone LRU is forced too small.
> 
> Signed-off-by: Suleiman Souhlal 
> Signed-off-by: Hugh Dickins 
> ---
> 

Acked-by: Rafael Aquini 

> I apologize to everyone for holding on to this so long: I think it's
> a very helpful patch (which we've been using in Google for months now).
> Been sitting on my TODO list, now prompted to send by related patches
> 
> https://lkml.org/lkml/2014/3/13/217
> https://lkml.org/lkml/2014/3/14/277
> 
> Certainly worth considering all three together, but my understanding
> is that they're actually three independent attacks on different ways
> in which we currently squeeze an LRU too small; and this patch from
> Suleiman seems to be the most valuable of the three, at least for
> the workloads I've tried it on.  But I'm not much of a page reclaim
> performance tester: please try it out to see if it's good for you.
> Thanks!
> 
>  mm/vmscan.c |   72 +-
>  1 file changed, 42 insertions(+), 30 deletions(-)
> 
> We did experiment with different ways of writing the patch; I'm afraid
> the way it came out best indents deeper, making it look like more than it is.
> 
> --- 3.14-rc6/mm/vmscan.c  2014-02-02 18:49:07.949302116 -0800
> +++ linux/mm/vmscan.c 2014-03-15 19:31:44.948977032 -0700
> @@ -1852,6 +1852,8 @@ static void get_scan_count(struct lruvec
>   bool force_scan = false;
>   unsigned long ap, fp;
>   enum lru_list lru;
> + bool some_scanned;
> + int pass;
>  
>   /*
>* If the zone or memcg is small, nr[l] can be 0.  This
> @@ -1971,39 +1973,49 @@ static void get_scan_count(struct lruvec
>   fraction[1] = fp;
>   denominator = ap + fp + 1;
>  out:
> - for_each_evictable_lru(lru) {
> - int file = is_file_lru(lru);
> - unsigned long size;
> - unsigned long scan;
> -
> - size = get_lru_size(lruvec, lru);
> - scan = size >> sc->priority;
> -
> - if (!scan && force_scan)
> - scan = min(size, SWAP_CLUSTER_MAX);
> -
> - switch (scan_balance) {
> - case SCAN_EQUAL:
> - /* Scan lists relative to size */
> - break;
> - case SCAN_FRACT:
> + some_scanned = false;
> + /* Only use force_scan on second pass. */
> + for (pass = 0; !some_scanned && pass < 2; pass++) {
> + for_each_evictable_lru(lru) {
> + int file = is_file_lru(lru);
> + unsigned long size;
> + unsigned long scan;
> +
> + size = get_lru_size(lruvec, lru);
> + scan = size >> sc->priority;
> +
> + if (!scan && pass && force_scan)
> + scan = min(size, SWAP_CLUSTER_MAX);
> +
> + switch (scan_balance) {
> + case SCAN_EQUAL:
> + /* Scan lists relative to size */
> + break;
> + case SCAN_FRACT:
> + /*
> +  * Scan types proportional to swappiness and
> +  * their relative recent reclaim efficiency.
> +  */
> + scan = div64_u64(scan * fraction[file],
> + denominator);
> + break;
> + case SCAN_FILE:
> + case SCAN_ANON:
> + /* Scan one type exclusively */
> + if ((scan_balance == SCAN_FILE) != file

Re: [patch] mm: vmscan: do not swap anon pages just because free+file is low

2014-03-14 Thread Rafael Aquini
On Fri, Mar 14, 2014 at 11:35:02AM -0400, Johannes Weiner wrote:
> Page reclaim force-scans / swaps anonymous pages when file cache drops
> below the high watermark of a zone in order to prevent what little
> cache remains from thrashing.
> 
> However, on bigger machines the high watermark value can be quite
> large and when the workload is dominated by a static anonymous/shmem
> set, the file set might just be a small window of used-once cache.  In
> such situations, the VM starts swapping heavily when instead it should
> be recycling the no longer used cache.
> 
> This is a longer-standing problem, but it's more likely to trigger
> after 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy")
> because file pages can no longer accumulate in a single zone and are
> dispersed into smaller fractions among the available zones.
> 
> To resolve this, do not force scan anon when file pages are low but
> instead rely on the scan/rotation ratios to make the right prediction.
> 
> Signed-off-by: Johannes Weiner han...@cmpxchg.org
> Cc: sta...@kernel.org [3.12+]
> ---

Acked-by: Rafael Aquini aqu...@redhat.com

>  mm/vmscan.c | 16 +---
>  1 file changed, 1 insertion(+), 15 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a9c74b409681..e58e9ad5b5d1 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1848,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, 
> struct scan_control *sc,
>   struct zone *zone = lruvec_zone(lruvec);
>   unsigned long anon_prio, file_prio;
>   enum scan_balance scan_balance;
> - unsigned long anon, file, free;
> + unsigned long anon, file;
>   bool force_scan = false;
>   unsigned long ap, fp;
>   enum lru_list lru;
> @@ -1902,20 +1902,6 @@ static void get_scan_count(struct lruvec *lruvec, 
> struct scan_control *sc,
>   get_lru_size(lruvec, LRU_INACTIVE_FILE);
>  
>   /*
> -  * If it's foreseeable that reclaiming the file cache won't be
> -  * enough to get the zone back into a desirable shape, we have
> -  * to swap.  Better start now and leave the - probably heavily
> -  * thrashing - remaining file pages alone.
> -  */
> - if (global_reclaim(sc)) {
> - free = zone_page_state(zone, NR_FREE_PAGES);
> - if (unlikely(file + free <= high_wmark_pages(zone))) {
> - scan_balance = SCAN_ANON;
> - goto out;
> - }
> - }
> -
> - /*
>* There is enough inactive page cache, do not reclaim
>* anything from the anonymous working set right now.
>*/
> -- 
> 1.9.0
> 


Re: [PATCH 2/6] header file for DRBG

2014-03-10 Thread Rafael Aquini
On Sun, Mar 09, 2014 at 12:46:42AM +0100, Stephan Mueller wrote:
> The header file includes the definition of:
> 
> * DRBG data structures with
>   - struct drbg_state as main structure
>   - struct drbg_core referencing the backend ciphers
>   - struct drbg_state_ops callback handlers for specific code
> supporting the Hash, HMAC, CTR DRBG implementations
>   - struct drbg_conc defining a linked list for input data
>   - struct drbg_test_data holding the test "entropy" data for CAVS
> testing and testmgr.c
>   - struct drbg_gen allowing test data, additional information
> string and personalization string data to be funneled through
> the kernel crypto API -- the DRBG requires additional
> parameters when invoking the reset and random number
> generation requests than intended by the kernel crypto API
> 
> * wrapper function to the kernel crypto API functions using struct
>   drbg_gen to pass through all data needed for DRBG
> 
> * wrapper functions to kernel crypto API functions usable for testing
>   code to inject test_data into the DRBG as needed by CAVS testing and
>   testmgr.c.
> 
> * DRBG flags required for the operation of the DRBG and for selecting
>   the particular DRBG type and backend cipher
> 
> * getter functions for data from struct drbg_core
> 
> Signed-off-by: Stephan Mueller smuel...@chronox.de
> 
>  create mode 100644 include/crypto/drbg.h
> 
> diff --git a/include/crypto/drbg.h b/include/crypto/drbg.h
> new file mode 100644
> index 000..16515f9
> --- /dev/null
> +++ b/include/crypto/drbg.h
> @@ -0,0 +1,340 @@
> +/*
> + * DRBG based on NIST SP800-90A
> + *
> + * Copyright Stephan Mueller smuel...@chronox.de, 2014
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *notice, and the entire permission notice in its entirety,
> + *including the disclaimer of warranties.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *notice, this list of conditions and the following disclaimer in the
> + *documentation and/or other materials provided with the distribution.
> + * 3. The name of the author may not be used to endorse or promote
> + *products derived from this software without specific prior
> + *written permission.
> + *
> + * ALTERNATIVELY, this product may be distributed under the terms of
> + * the GNU General Public License, in which case the provisions of the GPL 
> are
> + * required INSTEAD OF the above restrictions.  (This clause is
> + * necessary due to a potential bad interaction between the GPL and
> + * the restrictions contained in a BSD-style copyright.)
> + *
> + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
> + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
> + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
> + * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
> + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
> + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
> + * DAMAGE.
> + */
> +
> +#ifndef _DRBG_H
> +#define _DRBG_H
> +
> +
> +#include <linux/random.h>
> +#include <linux/scatterlist.h>
> +#include <crypto/hash.h>
> +#include <linux/module.h>
> +#include <linux/crypto.h>
> +#include <linux/slab.h> /* needed for kzalloc */
> +#include <crypto/internal/rng.h>
> +#include <crypto/rng.h>
> +#include <linux/fips.h>
> +#include <linux/spinlock.h>
> +
> +/*
> + * Concatenation Helper
> + *
> + * SP800-90A requires the concatenation of different data. To avoid copying
> + * buffers around or allocate additional memory, the following data structure
> + * is used to point to the original memory with its size. In addition, it
> + * is used to build a linked list. The linked list defines the concatenation
> + * of individual buffers. The order of memory block referenced in that
> + * linked list determines the order of concatenation.
> + */
> +
> +struct drbg_conc
> +{
> + unsigned char *in;
> + size_t len;
> + struct drbg_conc *next;
> +};
> +
> +#define DRBG_CLEAR_CONC(x)   \
> + x.in = NULL;\
> + x.len = 0;  \
> + x.next = NULL;
> +

Please consider getting rid of these ugly preprocessor macros and using
static inline functions instead. Not only is that much better for
maintainability, it also helps a lot with strong type checking and can
avoid nasty bugs in the future.
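
For instance, a minimal sketch of such a replacement (the helper name is
illustrative, not taken from the patch):

	/* inline replacement for DRBG_CLEAR_CONC: gets real type checking,
	 * and no multi-statement macro body to misexpand in an unbraced if */
	static inline void drbg_conc_clear(struct drbg_conc *conc)
	{
		conc->in = NULL;
		conc->len = 0;
		conc->next = NULL;
	}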


> +struct drbg_state;
> +typedef uint32_t drbg_flag_t;
> +
> +struct drbg_core 
> +{
> + drbg_flag_t flags;  /* flags for the cipher */
> + __u8 statelen;  /* maximum state 


Re: [PATCH] fs/proc/meminfo: meminfo_proc_show(): fix typo in comment

2014-02-21 Thread Rafael Aquini
On Tue, Feb 18, 2014 at 05:00:27PM -0500, Luiz Capitulino wrote:
> It should read "reclaimable slab" and not "reclaimable swap".
> 
> Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
> ---
>  fs/proc/meminfo.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 136e548..7445af0 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>   available += pagecache;
>  
>   /*
> -  * Part of the reclaimable swap consists of items that are in use,
> +  * Part of the reclaimable slab consists of items that are in use,
>* and cannot be freed. Cap this estimate at the low watermark.
>*/
>   available += global_page_state(NR_SLAB_RECLAIMABLE) -
> -- 
> 1.8.1.4

Acked-by: Rafael Aquini aqu...@redhat.com

> 


Re: [PATCH] kref: oops on zero or negative refcount

2014-02-21 Thread Rafael Aquini
On Thu, Feb 20, 2014 at 01:17:44PM -0500, Rik van Riel wrote:
> On 02/20/2014 01:14 PM, Dave Jones wrote:
> > On Thu, Feb 20, 2014 at 06:44:59PM +0100, Mateusz Guzik wrote:
> >  > In use after free situations, it is possible for one thread to write to
> >  > memory that has just been reallocated to a new user. This could open up
> >  > potential security issues.
> >  > 
> >  > diff --git a/include/linux/kref.h b/include/linux/kref.h
> >  > index 484604d..c3f8a0a 100644
> >  > --- a/include/linux/kref.h
> >  > +++ b/include/linux/kref.h
> >  > @@ -43,8 +43,10 @@ static inline void kref_get(struct kref *kref)
> >  >  /* If refcount was 0 before incrementing then we have a race
> >  >   * condition when this kref is freeing by some other thread 
> > right now.
> >  >   * In this case one should use kref_get_unless_zero()
> >  > + *
> >  > + * Terminate the current thread to stop potential security 
> > exploits.
> >  >   */
> >  > -WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
> >  > +BUG_ON(atomic_inc_return(&kref->refcount) < 2);
> > 
> > This isn't "terminating the thread", this is "lock up the box".
> 
> Only if kref_get holds a lock while encountering a refcount
> underflow, right?
>

Yes, and from a quick glance through the tree it seems we have several
code sites where such a condition is likely to happen, unfortunately.






Re: [patch] drop_caches: add some documentation and info message

2014-02-08 Thread Rafael Aquini
> diff --git a/fs/drop_caches.c b/fs/drop_caches.c
> index 9fd702f5bfb2..02ae3386e08f 100644
> --- a/fs/drop_caches.c
> +++ b/fs/drop_caches.c
> @@ -5,6 +5,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -63,6 +64,9 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
>   iterate_supers(drop_pagecache_sb, NULL);
>   if (sysctl_drop_caches & 2)
>   drop_slab();
> + printk_ratelimited(KERN_INFO "%s (%d): dropped kernel caches: 
> %d\n",

Just a nitpick here: 
 s/printk_ratelimited(KERN_INFO ...)/pr_info_ratelimited(...)
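
That is, something along these lines (same behaviour, just the convenience
wrapper from printk.h):

	pr_info_ratelimited("%s (%d): dropped kernel caches: %d\n",
			    current->comm, task_pid_nr(current),
			    sysctl_drop_caches);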


Acked-by: Rafael Aquini aqu...@redhat.com


> +current->comm, task_pid_nr(current),
> +sysctl_drop_caches);
>   }
>   return 0;
>  }
> -- 
> 1.8.5.3
> 


Re: [patch 02/10] fs: cachefiles: use add_to_page_cache_lru()

2014-02-08 Thread Rafael Aquini
On Mon, Feb 03, 2014 at 07:53:34PM -0500, Johannes Weiner wrote:
> This code used to have its own lru cache pagevec up until a0b8cab3
> ("mm: remove lru parameter from __pagevec_lru_add and remove parts of
> pagevec API").  Now it's just add_to_page_cache() followed by
> lru_cache_add(), might as well use add_to_page_cache_lru() directly.
>

Just a heads-up here: take a look at https://lkml.org/lkml/2014/2/7/587

I'm not saying that the hunks below will cause the same leak issue as depicted
in the thread I pointed to, but it surely doesn't hurt to double-check them.
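
Just to make the audit concrete: after a successful add_to_page_cache_lru()
the page cache holds its own reference, so the caller must still drop the
one it got from the allocator. A rough sketch of the correct pattern (this
mirrors the shape of the nfs_symlink() fix from that thread, not the
cachefiles code):

	struct page *page = alloc_page(GFP_KERNEL);	/* refcount == 1, ours */

	if (!page)
		return -ENOMEM;
	if (add_to_page_cache_lru(page, mapping, index, GFP_KERNEL) == 0) {
		/* page is now locked and referenced by the page cache */
		SetPageUptodate(page);
		unlock_page(page);
		page_cache_release(page);	/* drop the allocation ref */
	} else {
		__free_page(page);		/* never made it into the cache */
	}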

Regards,
-- Rafael

> Signed-off-by: Johannes Weiner han...@cmpxchg.org
> Reviewed-by: Rik van Riel r...@redhat.com
> Reviewed-by: Minchan Kim minc...@kernel.org
> ---
>  fs/cachefiles/rdwr.c | 33 +
>  1 file changed, 13 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
> index ebaff368120d..4b1fb5ca65b8 100644
> --- a/fs/cachefiles/rdwr.c
> +++ b/fs/cachefiles/rdwr.c
> @@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct 
> cachefiles_object *object,
>   goto nomem_monitor;
>   }
>  
> - ret = add_to_page_cache(newpage, bmapping,
> - netpage->index, cachefiles_gfp);
> + ret = add_to_page_cache_lru(newpage, bmapping,
> + netpage->index, cachefiles_gfp);
>   if (ret == 0)
>   goto installed_new_backing_page;
>   if (ret != -EEXIST)
>   goto nomem_page;
>   }
>  
> - /* we've installed a new backing page, so now we need to add it
> -  * to the LRU list and start it reading */
> + /* we've installed a new backing page, so now we need to start
> +  * it reading */
>  installed_new_backing_page:
>   _debug("- new %p", newpage);
>  
>   backpage = newpage;
>   newpage = NULL;
>  
> - lru_cache_add_file(backpage);
> -
>  read_backing_page:
>   ret = bmapping->a_ops->readpage(NULL, backpage);
>   if (ret < 0)
> @@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct 
> cachefiles_object *object,
>   goto nomem;
>   }
>  
> - ret = add_to_page_cache(newpage, bmapping,
> - netpage->index, cachefiles_gfp);
> + ret = add_to_page_cache_lru(newpage, bmapping,
> + netpage->index,
> + cachefiles_gfp);
>   if (ret == 0)
>   goto installed_new_backing_page;
>   if (ret != -EEXIST)
>   goto nomem;
>   }
>  
> - /* we've installed a new backing page, so now we need to add it
> -  * to the LRU list and start it reading */
> + /* we've installed a new backing page, so now we need
> +  * to start it reading */
>   installed_new_backing_page:
>   _debug("- new %p", newpage);
>  
>   backpage = newpage;
>   newpage = NULL;
>  
> - lru_cache_add_file(backpage);
> -
>   reread_backing_page:
>   ret = bmapping->a_ops->readpage(NULL, backpage);
>   if (ret < 0)
> @@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct 
> cachefiles_object *object,
>   monitor_backing_page:
>   _debug("- monitor add");
>  
> - ret = add_to_page_cache(netpage, op->mapping, netpage->index,
> - cachefiles_gfp);
> + ret = add_to_page_cache_lru(netpage, op->mapping,
> + netpage->index, cachefiles_gfp);
>   if (ret < 0) {
>   if (ret == -EEXIST) {
>   page_cache_release(netpage);
> @@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct 
> cachefiles_object *object,
>   goto nomem;
>   }
>  
> - lru_cache_add_file(netpage);
> -
>   /* install a monitor */
>   page_cache_get(netpage);
>   monitor->netfs_page = netpage;
> @@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct 
> cachefiles_object *object,
>   backing_page_already_uptodate:
>   _debug("- uptodate");
>  
> - ret = add_to_page_cache(netpage, op->mapping, netpage->index,
> - cachefiles_gfp);
> + ret = add_to_page_cache_lru(netpage, op->mapping,
> + netpage->index, cachefiles_gfp);
>   if (ret < 0) {
>   if (ret == -EEXIST) {
>   page_cache_release(netpage);
> @@ -631,8 +626,6 @@ static int 



Re: [PATCH] mm: fix page leak at nfs_symlink()

2014-02-07 Thread Rafael Aquini
On Fri, Feb 07, 2014 at 10:39:24AM -0500, Jeff Layton wrote:
> On Fri,  7 Feb 2014 13:19:54 -0200
> Rafael Aquini aqu...@redhat.com wrote:
> 
> > Changes committed by "a0b8cab3 mm: remove lru parameter from
> > __pagevec_lru_add and remove parts of pagevec API" have introduced
> > a call to add_to_page_cache_lru() which causes a leak in nfs_symlink() 
> > as now the page gets an extra refcount that is not dropped.
> > 
> > Jan Stancek observed and reported the leak effect while running test8 from
> > Connectathon Testsuite. After several iterations over the test case,
> > which creates several symlinks on a NFS mountpoint, the test system was
> > quickly getting into an out-of-memory scenario.
> > 
> > This patch fixes the page leak by dropping that extra refcount 
> > add_to_page_cache_lru() is grabbing. 
> > 
> > Signed-off-by: Jan Stancek jstan...@redhat.com
> > Signed-off-by: Rafael Aquini aqu...@redhat.com
> > ---
> >  fs/nfs/dir.c | 5 +
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> > index be38b57..4a48fe4 100644
> > --- a/fs/nfs/dir.c
> > +++ b/fs/nfs/dir.c
> > @@ -1846,6 +1846,11 @@ int nfs_symlink(struct inode *dir, struct dentry 
> > *dentry, const char *symname)
> > GFP_KERNEL)) {
> > SetPageUptodate(page);
> > unlock_page(page);
> > +   /*
> > +* add_to_page_cache_lru() grabs an extra page refcount.
> > +* Drop it here to avoid leaking this page later.
> > +*/
> > +   page_cache_release(page);
> > } else
> > __free_page(page);
> >  
> 
> Looks reasonable as an interim fix and should almost certainly go to
> stable.
> 
> Longer term, I think it would be best from an API standpoint to fix
> add_to_page_cache_lru not to take this extra reference (or to have it
> drop it itself) and fix up the callers accordingly. That seems like a
> trap for the unwary...
>

100% agreed. I'll look into the long term approach you suggested, but as
you mentioned, the interim fix is the reasonable thing to go with now, for
mainline and stable.

Thanks for looking into it Jeff.

-- Rafael 


[PATCH] mm: fix page leak at nfs_symlink()

2014-02-07 Thread Rafael Aquini
Changes committed by "a0b8cab3 mm: remove lru parameter from
__pagevec_lru_add and remove parts of pagevec API" have introduced
a call to add_to_page_cache_lru() which causes a leak in nfs_symlink() 
as now the page gets an extra refcount that is not dropped.

Jan Stancek observed and reported the leak effect while running test8 from
Connectathon Testsuite. After several iterations over the test case,
which creates several symlinks on a NFS mountpoint, the test system was
quickly getting into an out-of-memory scenario.

This patch fixes the page leak by dropping that extra refcount 
add_to_page_cache_lru() is grabbing. 

Signed-off-by: Jan Stancek jstan...@redhat.com
Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 fs/nfs/dir.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index be38b57..4a48fe4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1846,6 +1846,11 @@ int nfs_symlink(struct inode *dir, struct dentry 
*dentry, const char *symname)
GFP_KERNEL)) {
SetPageUptodate(page);
unlock_page(page);
+   /*
+* add_to_page_cache_lru() grabs an extra page refcount.
+* Drop it here to avoid leaking this page later.
+*/
+   page_cache_release(page);
} else
__free_page(page);
 
-- 
1.8.5.3





Re: [RFC PATCH] char: random: stir the output pools differently when the random_write length allows splitting the seed

2014-01-30 Thread Rafael Aquini
On Fri, Jan 10, 2014 at 01:32:10PM +0100, Clemens Ladisch wrote:
> Stephan Mueller wrote:
> > On Friday, 10 January 2014 at 12:37:26, Clemens Ladisch wrote:
> >> Stephan Mueller wrote:
> >>> On Friday, 10 January 2014 at 09:13:57, Clemens Ladisch wrote:
> >>>> Rafael Aquini wrote:
> >>>>> This patch introduces changes to the random_write method so it can
> >>>>> split the given seed and completely stir the output pools with
> >>>>> different halves of it, when the seed length allows us to do so.
> >>>>>
> >>>>> -   ret = write_pool(&blocking_pool, buffer, count);
> >>>>> +   ret = write_pool(pool1, buffer, count1);
> >>>>>
> >>>>> if (ret)
> >>>>> 
> >>>>> return ret;
> >>>>>
> >>>>> -   ret = write_pool(_pool, buffer, count);
> >>>>> +   ret = write_pool(pool2, buffer + offset, count2);
> >>>>
> >>>> Doesn't this assume that both halves of the buffer contain some
> >>>> (uncredited) entropy?  In other words, wouldn't this result in worse
> >>>> randomness for pool2 if the second half of the buffer contains just
> >>>> zero padding?
> >>>
> >>> [...]
> >>> Coming back to your concern: sure, the caller can pad any data
> >>> injected into /dev/?random with zeros.
> >>
> >> Assume that the userspace of an embedded device wants to do the same
> >> kind of initialization that a call to add_device_randomness() does, and
> >> that it has some data like "char serial_number[256]".  The padding
> >> wouldn't be done intentionally, it's just a property of the data (and
> >> it wouldn't have mattered before this patch).
> >>
> >>> But as writing to the character files is allowed to every user, this
> >>> per definition must not matter (e.g. an attacker may simply write
> >>> zeros or other known data into the character file). And the random.c
> >>> driver handles that case appropriately by not increasing the entropy
> >>> estimator when receiving data.
> >>
> >> The problem is not with the entropy estimate.
> >>
> >>> All the patch tries to achieve is to ensure that both pools are not
> >>> always mixed with the same values.
> >>
> >> Before this patch, both pools got mixed with the same values.  After
> >> this patch, both pools indeed get mixed with different values, but now
> >> one pool gets mixed with a known value if one half of the buffer
> >> happens to be known.
> >
> > Do you imply in your example above that the serial number is unknown?
> > Anything that unprivileged user space tries to inject into /dev/?random
> > should be considered data with known value.
> 
> Like the kernel's add_device_randomness() function, this example assumes
> that there is no persistent storage with a saved seed (or that it isn't
> yet available), and that mixing a device-specific value at least
> prevents multiple device instances from generating identical random
> numbers.
> 
> This indeed helps only against attackers that do not know that serial
> number.
> 
> If the data written by unprivileged users to /dev/?random were
> considered known to *all* attackers, then it wouldn't make sense to
> allow such writes at all.
> 

Sorry folks, 

Although I left this one a little behind, I'd like to follow it up and reach
some consensus.


After re-reading the whole discussion, it became clear to me that the source of
Stephan's request to split the seed we feed into the LRNG lies in the fact
that any unprivileged user is capable of injecting data and stirring the
entropy extraction pools arbitrarily.

$ ls -l /dev/{u,}random
crw-rw-rw-. 1 root root 1, 8 Jan 21 23:44 /dev/random
crw-rw-rw-. 1 root root 1, 9 Jan 21 23:44 /dev/urandom

Considering what goes on within this thread, wouldn't it be simpler to just
remove the privilege of writing to /dev/{u,}random from the wild world?

Stephan, I'll repeat myself here: theoretically speaking, there's no difference
between using the same seed to mix both output pools and splitting it to use
its different halves to stir the pools separately for the /dev/{u,}random
writes, if an attacker could successfully compromise the pools by feeding them
a known-pattern seed. I understand you raised the split-the-seed point out of
a security concern, and that concern might eventually become a requirement.
Please, let us (me) know:

a) is this request based on an existent pronouncement of standardization?
b) is this (potential) pronouncement based on a math proof that one could
   compromise the LRNG internal state by feeding known seeds into
   /dev/{u,}random?

If (a) && (b) are true, and there's no code in the actual random.c
implementation that addresses those security claims, and the naive approach
of restricting who can actually write to /dev/{u,}random is not deemed
feasible, then would something like the (ugly) hack below be considered
feasible? (perhaps making it conditional on fips_enabled)


Thank you all for the comments till here, and have you all a nice weekend!

Rafael
---
 drivers/char/random.c | 66 +++
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 429b75b..63e8852 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -274,6


Re: [PATCH] mm: Improve documentation of page_order

2014-01-17 Thread Rafael Aquini
On Fri, Jan 17, 2014 at 02:32:21PM +, Mel Gorman wrote:
> Developers occasionally try and optimise PFN scanners by using page_order
> but miss that in general it requires zone->lock. This has happened twice for
> compaction.c and been rejected both times.  This patch clarifies the
> documentation of page_order and adds a note to compaction.c on why
> page_order is not used.
> 
> Signed-off-by: Mel Gorman mgor...@suse.de
> ---
>  mm/compaction.c | 5 -
>  mm/internal.h   | 8 +---
>  2 files changed, 9 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/compaction.c b/mm/compaction.c
> index f58bcd0..f91d26b 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -522,7 +522,10 @@ isolate_migratepages_range(struct zone *zone, struct 
> compact_control *cc,
>   if (!isolation_suitable(cc, page))
>   goto next_pageblock;
>  
> - /* Skip if free */
> + /*
> +  * Skip if free. page_order cannot be used without zone->lock
> +  * as nothing prevents parallel allocations or buddy merging.
> +  */
>   if (PageBuddy(page))
>   continue;
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index 684f7aa..09cd8be 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -144,9 +144,11 @@ isolate_migratepages_range(struct zone *zone, struct 
> compact_control *cc,
>  #endif
>  
>  /*
> - * function for dealing with page's order in buddy system.
> - * zone->lock is already acquired when we use these.
> - * So, we don't need atomic page->flags operations here.
> + * This function returns the order of a free page in the buddy system.
> + * In general, page_zone(page)->lock must be held by the caller to prevent
> + * the page being allocated in parallel and returning garbage as the order.
> + * If the caller does not hold page_zone(page)->lock, they must guarantee
> + * that the page cannot be allocated or merged in parallel.
>   */
>  static inline unsigned long page_order(struct page *page)
>  {
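
In other words, a PFN scanner that really needs the order is expected to do
something like this (sketch only, not a hunk from the patch):

	unsigned long flags;
	unsigned long order = 0;

	spin_lock_irqsave(&zone->lock, flags);
	if (PageBuddy(page))		/* only stable under zone->lock */
		order = page_order(page);
	spin_unlock_irqrestore(&zone->lock, flags);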

Acked-by: Rafael Aquini aqu...@redhat.com






Re: [RFC PATCH] char: random: stir the output pools differently when the random_write length allows splitting the seed

2014-01-10 Thread Rafael Aquini
On Fri, Jan 10, 2014 at 12:37:26PM +0100, Clemens Ladisch wrote:
> Stephan Mueller wrote:
> > On Friday, 10 January 2014 at 09:13:57, Clemens Ladisch wrote:
> >> Rafael Aquini wrote:
> >>> This patch introduces changes to the random_write method so it can
> >>> split the given seed and completely stir the output pools with
> >>> different halves of it, when the seed length allows us to do so.
> >>>
> >>> - ret = write_pool(&blocking_pool, buffer, count);
> >>> + ret = write_pool(pool1, buffer, count1);
> >>>   if (ret)
> >>>   return ret;
> >>> - ret = write_pool(&nonblocking_pool, buffer, count);
> >>> + ret = write_pool(pool2, buffer + offset, count2);
> >>
> >> Doesn't this assume that both halves of the buffer contain some
> >> (uncredited) entropy?  In other words, wouldn't this result in worse
> >> randomness for pool2 if the second half of the buffer contains just
> >> zero padding?
> >
> > [...]
> > Coming back to your concern: sure, the caller can pad any data injected
> > into /dev/?random with zeros.
> 
> Assume that the userspace of an embedded device wants to do the same
> kind of initialization that a call to add_device_randomness() does, and
> that it has some data like "char serial_number[256]".  The padding
> wouldn't be done intentionally, it's just a property of the data (and it
> wouldn't have mattered before this patch).
> 
> > But as writing to the character files is allowed to every user, this
> > per definition must not matter (e.g. an attacker may simply write
> > zeros or other known data into the character file). And the random.c
> > driver handles that case appropriately by not increasing the entropy
> > estimator when receiving data.
> 
> The problem is not with the entropy estimate.
> 
> > All the patch tries to achieve is to ensure that both pools are not
> > always mixed with the same values.
> 
> Before this patch, both pools got mixed with the same values.  After
> this patch, both pools indeed get mixed with different values, but now
> one pool gets mixed with a known value if one half of the buffer happens
> to be known.
>

Yeah, nice catch. I hadn't thought about it. Theoretically speaking, there's
no big difference in this case between using the same seed to mix both output
pools and splitting the seed to use its different halves to mix the pools
separately. Supposing an attacker could successfully compromise the blocking
pool (/dev/random) by injecting a known-pattern seed into /dev/urandom, a
split seed would not do us any greater good either, as the attacker can
surely handcraft an input that shows the same pattern in both halves.


I'm wondering whether something like the following would be more feasible
here, or whether it would just be considered too much for too little:
initialize an extra struct entropy_store, fill its pool with
OUTPUT_POOL_SIZE bytes from get_random_bytes(), stir this extra pool with
whatever came in as seed from userland, and then extract two seeds from
that pool to stir the output pools separately.
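
Purely as an illustration of the spirit of that idea (the helper below is
made up and much simplified; the real thing would go through the extra
struct entropy_store as described above, and get_random_bytes() comes from
<linux/random.h>):

	/*
	 * Illustration only: derive two sub-seeds from one userland seed
	 * by whitening each half with fresh random bytes, so neither
	 * output pool is stirred with attacker-chosen data verbatim.
	 */
	static void derive_split_seeds(const u8 *seed, size_t len,
				       u8 *s1, u8 *s2)
	{
		size_t half = len / 2;
		size_t i;

		get_random_bytes(s1, half);
		get_random_bytes(s2, len - half);
		for (i = 0; i < half; i++)
			s1[i] ^= seed[i];
		for (i = 0; i < len - half; i++)
			s2[i] ^= seed[half + i];
	}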

If the approach above is something worth pursuing, I'll come up with something
next week; a rough sketch of what I have in mind follows below.
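
A rough sketch of that idea, just to make its shape concrete (mix_scratch(),
extract_seed() and mix_output_pool() below are hypothetical stand-ins for
whatever mixing/extraction primitives random.c would actually provide; this
is not a tested patch):

    static int reseed_output_pools(const char __user *buffer, size_t count)
    {
            u8 scratch[OUTPUT_POOL_SIZE];
            u8 seed[OUTPUT_POOL_SIZE];
            int ret;

            /* pre-fill the scratch pool with OUTPUT_POOL_SIZE random bytes */
            get_random_bytes(scratch, sizeof(scratch));

            /* stir the scratch pool with whatever came in from userland */
            ret = mix_scratch(scratch, sizeof(scratch), buffer, count);
            if (ret)
                    return ret;

            /* extract two distinct seeds, stirring each output pool separately */
            extract_seed(scratch, seed, sizeof(seed));
            mix_output_pool(&blocking_pool, seed, sizeof(seed));
            extract_seed(scratch, seed, sizeof(seed));
            mix_output_pool(&nonblocking_pool, seed, sizeof(seed));

            memset(scratch, 0, sizeof(scratch));
            memset(seed, 0, sizeof(seed));
            return 0;
    }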

Thank you all for the comments so far, and have a nice weekend!
-- Rafael


[RFC PATCH] char: random: stir the output pools differently when the random_write length allows splitting the seed

2014-01-09 Thread Rafael Aquini
Since commit 7f397dc ("random: fix seeding with zero entropy") we have been
adding data from zero-entropy random_writes directly to the output pools. We
can leverage the fact that the seed used in such cases is usually long enough
to completely stir all bits of the input pool (which is, by default, 4 times
larger than the output pools), and break it in two to stir the output pools
differently. This can help in making a stronger security claim about the
output pools' internal state.

This patch introduces changes to the random_write method so it can split the
given seed and completely stir the output pools with different halves of it,
when the seed length allows us to do so.

Signed-off-by: Rafael Aquini 
---
Suggested by Stephan Mueller 

 drivers/char/random.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 429b75b..d623234 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -274,6 +274,7 @@
 #define INPUT_POOL_WORDS   (1 << (INPUT_POOL_SHIFT-5))
 #define OUTPUT_POOL_SHIFT  10
 #define OUTPUT_POOL_WORDS  (1 << (OUTPUT_POOL_SHIFT-5))
+#define OUTPUT_POOL_SIZE   ((1 << OUTPUT_POOL_SHIFT) >> 3)
 #define SEC_XFER_SIZE  512
 #define EXTRACT_SIZE   10
 
@@ -1387,19 +1388,44 @@ write_pool(struct entropy_store *r, const char __user *buffer, size_t count)
return 0;
 }
 
-static ssize_t random_write(struct file *file, const char __user *buffer,
-   size_t count, loff_t *ppos)
+static size_t __do_random_write(const char __user *buffer,
+   size_t count, bool split_buffer)
 {
-   size_t ret;
+   size_t ret, offset, count1, count2;
+   struct entropy_store *pool1, *pool2;
+
+   offset = 0;
+   count1 = count2 = count;
+   pool1 = &blocking_pool;
+   pool2 = &nonblocking_pool;
+
+   if (split_buffer) {
+   size_t rnd;
+   count1 = count / 2;
+   count2 = count - count1;
+   offset = count1;
+
+   get_random_bytes(&rnd, 2);
+   if (rnd % 2) {
+   pool1 = &nonblocking_pool;
+   pool2 = &blocking_pool;
+   }
+   }
 
-   ret = write_pool(&blocking_pool, buffer, count);
+   ret = write_pool(pool1, buffer, count1);
	if (ret)
	return ret;
-   ret = write_pool(&nonblocking_pool, buffer, count);
+   ret = write_pool(pool2, buffer + offset, count2);
if (ret)
return ret;
 
-   return (ssize_t)count;
+   return count;
+}
+
+static ssize_t random_write(struct file *file, const char __user *buffer,
+   size_t count, loff_t *ppos)
+{
+   return __do_random_write(buffer, count, (count >= 2*OUTPUT_POOL_SIZE));
 }
 
 static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-- 
1.8.3.1
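
A quick sanity check of the split threshold implied by the macros above (a
back-of-the-envelope note, not part of the posted patch):

    /* OUTPUT_POOL_SHIFT = 10, so each output pool holds 1 << 10 = 1024 bits;
     * OUTPUT_POOL_SIZE  = (1 << 10) >> 3 = 128 bytes per output pool.
     * random_write() therefore splits only when count >= 2 * 128 = 256 bytes,
     * i.e. when each half is still large enough to stir a whole output pool;
     * the get_random_bytes() coin flip merely randomizes which pool gets
     * which half.
     */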



Re: ipc whitespace cleanups

2013-12-23 Thread Rafael Aquini
On Mon, Dec 23, 2013 at 11:19:44AM +0100, Manfred Spraul wrote:
> Hi Rafael,
> 
> Is anything else on your IPC todo list?
> For me, it seems as if the locking updates have finally stabilized,
> no more wrong return codes, races and whatever else.
> (/me knocks on wood).
> 
> Therefore I would like to send the attached patch to Andrew (perhaps
> with a rediff, if required) - unless it collides with something you
> plan.
>

Manfred,

Thanks for taking the time to go through this work!
Merry Christmas! 

> From 06359c767e3fb5e966efa4653f1db4f6d977da36 Mon Sep 17 00:00:00 2001
> From: Manfred Spraul 
> Date: Mon, 23 Dec 2013 11:03:39 +0100
> Subject: [PATCH] ipc: whitespace cleanup
> 
> The ipc code does not adhere to the typical Linux coding style.
> This patch fixes all simple whitespace errors.
> 
> - mostly autogenerated by
>   scripts/checkpatch.pl -f --fix \
>   --types=pointer_location,spacing,space_before_tab
> - one manual fixup (keep structure members tab-aligned)
> 
> diff -w is empty.
> Tested with some msg and sem test apps.
> 
> Signed-off-by: Manfred Spraul 
> Cc: Joe Perches 
> ---

Acked-by: Rafael Aquini 

>  include/linux/msg.h |  2 +-
>  include/linux/shm.h |  2 +-
>  ipc/compat.c| 10 +++---
>  ipc/compat_mq.c |  2 +-
>  ipc/ipc_sysctl.c| 14 
>  ipc/mqueue.c| 16 -
>  ipc/msg.c   | 18 +-
>  ipc/sem.c   | 98 ++---
>  ipc/shm.c   | 32 -
>  ipc/util.c  | 24 ++---
>  ipc/util.h  | 14 
>  11 files changed, 116 insertions(+), 116 deletions(-)
> 
> diff --git a/include/linux/msg.h b/include/linux/msg.h
> index e21f9d4..f3f302f 100644
> --- a/include/linux/msg.h
> +++ b/include/linux/msg.h
> @@ -9,7 +9,7 @@ struct msg_msg {
>   struct list_head m_list;
>   long m_type;
>   size_t m_ts;/* message text size */
> - struct msg_msgseg* next;
> + struct msg_msgseg *next;
>   void *security;
>   /* the actual message follows immediately */
>  };
> diff --git a/include/linux/shm.h b/include/linux/shm.h
> index 429c199..1e2cd2e 100644
> --- a/include/linux/shm.h
> +++ b/include/linux/shm.h
> @@ -9,7 +9,7 @@
>  struct shmid_kernel /* private to the kernel */
>  {
>   struct kern_ipc_perm shm_perm;
> - struct file *   shm_file;
> + struct file *shm_file;
>   unsigned long   shm_nattch;
>   unsigned long   shm_segsz;
>   time_t  shm_atim;
> diff --git a/ipc/compat.c b/ipc/compat.c
> index 892f658..ed0530b 100644
> --- a/ipc/compat.c
> +++ b/ipc/compat.c
> @@ -197,7 +197,7 @@ static inline int __put_compat_ipc_perm(struct ipc64_perm *p,
>  static inline int get_compat_semid64_ds(struct semid64_ds *s64,
>   struct compat_semid64_ds __user *up64)
>  {
> - if (!access_ok (VERIFY_READ, up64, sizeof(*up64)))
> + if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
>   return -EFAULT;
>   return __get_compat_ipc64_perm(&s64->sem_perm, &up64->sem_perm);
>  }
> @@ -205,7 +205,7 @@ static inline int get_compat_semid64_ds(struct semid64_ds *s64,
>  static inline int get_compat_semid_ds(struct semid64_ds *s,
>   struct compat_semid_ds __user *up)
>  {
> - if (!access_ok (VERIFY_READ, up, sizeof(*up)))
> + if (!access_ok(VERIFY_READ, up, sizeof(*up)))
>   return -EFAULT;
>   return __get_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
>  }
> @@ -215,7 +215,7 @@ static inline int put_compat_semid64_ds(struct semid64_ds *s64,
>  {
>   int err;
>  
> - if (!access_ok (VERIFY_WRITE, up64, sizeof(*up64)))
> + if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
>   return -EFAULT;
>   err  = __put_compat_ipc64_perm(&s64->sem_perm, &up64->sem_perm);
>   err |= __put_user(s64->sem_otime, &up64->sem_otime);
> @@ -229,7 +229,7 @@ static inline int put_compat_semid_ds(struct semid64_ds *s,
>  {
>   int err;
>  
> - if (!access_ok (VERIFY_WRITE, up, sizeof(*up)))
> + if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
>   return -EFAULT;
>   err  = __put_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
>   err |= __put_user(s->sem_otime, &up->sem_otime);
> @@ -376,7 +376,7 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second,
>   struct compat_ipc_kludge ipck;
>   if (!uptr)
>   return -EINVAL;
> - if (copy_from_user (&ipck, uptr, sizeof(ipck)))
> + if (copy_from_user(&ipck, uptr, sizeof(ipck)))
>   return -EFAULT;
>   uptr = compat_ptr(ipck.msgp);
>   fifth = ipck.msgtyp;
> diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
> index 380ea4f..63d7c6de 100644
> --- a/ipc/compat_mq.c
> +++ b/ipc/compat_mq.c
> @@ -64,7 +64,7 @@ asmlinkage long


Re: [PATCH] ipc: change kern_ipc_perm.deleted type to bool

2013-12-19 Thread Rafael Aquini
On Thu, Dec 19, 2013 at 03:23:02PM -0200, Rafael Aquini wrote:
> struct kern_ipc_perm.deleted is meant to be used as a boolean toogle, and
> the changes introduced by this patch are just to make the case explicit.
>
> Signed-off-by: Rafael Aquini 

s/toogle/toggle

my bad, sorry. Shall I resend, or can it be fixed before merging?
 
Thanks,
-- Rafael

> ---
> * a quick sidenote: this patch goes on top of the recently posted
> [PATCH v3] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races
> 
>  include/linux/ipc.h | 2 +-
>  ipc/sem.c   | 2 +-
>  ipc/util.c  | 6 +++---
>  ipc/util.h  | 2 +-
>  4 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/ipc.h b/include/linux/ipc.h
> index 8d861b2..9d84942 100644
> --- a/include/linux/ipc.h
> +++ b/include/linux/ipc.h
> @@ -11,7 +11,7 @@
>  struct kern_ipc_perm
>  {
>   spinlock_t  lock;
> - int deleted;
> + bool deleted;
>   int id;
>   key_t   key;
>   kuid_t  uid;
> diff --git a/ipc/sem.c b/ipc/sem.c
> index 5972e60..1659cd9 100644
> --- a/ipc/sem.c
> +++ b/ipc/sem.c
> @@ -394,7 +394,7 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
>   /* ipc_rmid() may have already freed the ID while sem_lock
>* was spinning: verify that the structure is still valid
>*/
> - if (!ipcp->deleted)
> + if (ipc_valid_object(ipcp))
>   return container_of(ipcp, struct sem_array, sem_perm);
>  
>   sem_unlock(sma, *locknum);
> diff --git a/ipc/util.c b/ipc/util.c
> index 3ae17a4..9dc67fa 100644
> --- a/ipc/util.c
> +++ b/ipc/util.c
> @@ -286,7 +286,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
>   idr_preload(GFP_KERNEL);
>  
>   spin_lock_init(&new->lock);
> - new->deleted = 0;
> + new->deleted = false;
>   rcu_read_lock();
>   spin_lock(&new->lock);
>  
> @@ -447,7 +447,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
>  
>   ids->in_use--;
>  
> - ipcp->deleted = 1;
> + ipcp->deleted = true;
>  
>   return;
>  }
> @@ -657,7 +657,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
>   /* ipc_rmid() may have already freed the ID while ipc_lock
>* was spinning: here verify that the structure is still valid
>*/
> - if (!out->deleted)
> + if (ipc_valid_object(out))
>   return out;
>  
>   spin_unlock(&out->lock);
> diff --git a/ipc/util.h b/ipc/util.h
> index d05b708..a1cbc3a 100644
> --- a/ipc/util.h
> +++ b/ipc/util.h
> @@ -195,7 +195,7 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
>   */
>  static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
>  {
> - return perm->deleted == 0;
> + return !perm->deleted;
>  }
>  
>  struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
> -- 
> 1.8.3.1
> 


[PATCH] ipc: change kern_ipc_perm.deleted type to bool

2013-12-19 Thread Rafael Aquini
struct kern_ipc_perm.deleted is meant to be used as a boolean toogle, and
the changes introduced by this patch are just to make the case explicit.

Signed-off-by: Rafael Aquini 
---
* a quick sidenote: this patch goes on top of the recently posted
[PATCH v3] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

 include/linux/ipc.h | 2 +-
 ipc/sem.c   | 2 +-
 ipc/util.c  | 6 +++---
 ipc/util.h  | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 8d861b2..9d84942 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -11,7 +11,7 @@
 struct kern_ipc_perm
 {
spinlock_t  lock;
-   int deleted;
+   bool deleted;
int id;
key_t   key;
kuid_t  uid;
diff --git a/ipc/sem.c b/ipc/sem.c
index 5972e60..1659cd9 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -394,7 +394,7 @@ static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
/* ipc_rmid() may have already freed the ID while sem_lock
 * was spinning: verify that the structure is still valid
 */
-   if (!ipcp->deleted)
+   if (ipc_valid_object(ipcp))
return container_of(ipcp, struct sem_array, sem_perm);
 
sem_unlock(sma, *locknum);
diff --git a/ipc/util.c b/ipc/util.c
index 3ae17a4..9dc67fa 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -286,7 +286,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
	idr_preload(GFP_KERNEL);
 
	spin_lock_init(&new->lock);
-   new->deleted = 0;
+   new->deleted = false;
	rcu_read_lock();
	spin_lock(&new->lock);
 
@@ -447,7 +447,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
 
ids->in_use--;
 
-   ipcp->deleted = 1;
+   ipcp->deleted = true;
 
return;
 }
@@ -657,7 +657,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
/* ipc_rmid() may have already freed the ID while ipc_lock
 * was spinning: here verify that the structure is still valid
 */
-   if (!out->deleted)
+   if (ipc_valid_object(out))
return out;
 
spin_unlock(&out->lock);
diff --git a/ipc/util.h b/ipc/util.h
index d05b708..a1cbc3a 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -195,7 +195,7 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
  */
 static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
 {
-   return perm->deleted == 0;
+   return !perm->deleted;
 }
 
 struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
-- 
1.8.3.1



Re: [PATCH v3] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 04:38:24PM -0800, Davidlohr Bueso wrote:
> On Wed, 2013-12-18 at 18:33 -0200, Rafael Aquini wrote:
> > After the locking semantics for the SysV IPC API got improved, a couple of
> > IPC_RMID race windows were opened because we ended up dropping the
> > 'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> > The spotted races got sorted out by re-introducing the old test within
> > the racy critical sections.
> > 
> > This patch introduces ipc_valid_object() to consolidate the way we cope with
> > IPC_RMID races by using the same abstraction across the API implementation.
> > 
> > Signed-off-by: Rafael Aquini 
> > Acked-by: Rik van Riel 
> > Acked-by: Greg Thelen 
> 
> Reviewed-by: Davidlohr Bueso 
> 
> [...]
> 
> > +/*
> > + * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths
> > + * where the respective ipc_ids.rwsem is not being held down.
> > + * Checks whether the ipc object is still around or if it's gone already, as
> > + * ipc_rmid() may have already freed the ID while the ipc lock was spinning.
> > + * Needs to be called with kern_ipc_perm.lock held -- exception made for one
> > + * checkpoint case at sys_semtimedop() as noted in code commentary.
> > + */
> > +static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
> > +{
> > +   return perm->deleted == 0;
> > +}
> 
> I would like to see .deleted being converted to bool while we're at it
> though, that return statement just bugs the hell out of me. Could you
> send a patch for that as well?
>

Sure, as I mentioned earlier, the full .deleted conversion from int to bool
is already on my todo list for a follow-up patch.

Thanks!
-- Rafael 


[PATCH v3] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
After the locking semantics for the SysV IPC API got improved, a couple of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope with
IPC_RMID races by using the same abstraction across the API implementation.

Signed-off-by: Rafael Aquini 
Acked-by: Rik van Riel 
Acked-by: Greg Thelen 
---
Changelog:
* v3:
 - code commentary changes as requested by reviewers

* v2:
 - drop assert_spin_locked() from ipc_valid_object() for less overhead
 - extend ipc_valid_object() usage in sem.c (not spotted checkpoints)
 - keep the received ACKs

 ipc/msg.c  |  7 ---
 ipc/sem.c  | 24 
 ipc/shm.c  | 16 
 ipc/util.h | 13 +
 4 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91..8983ea5 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
	goto out_unlock0;
 
	/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
	err = -EIDRM;
	goto out_unlock0;
	}
@@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
	ipc_lock_object(&msq->q_perm);
 
	ipc_rcu_putref(msq, ipc_rcu_free);
-   if (msq->q_perm.deleted) {
+   /* raced with RMID? */
+   if (!ipc_valid_object(&msq->q_perm)) {
	err = -EIDRM;
	goto out_unlock0;
	}
@@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg
	ipc_lock_object(&msq->q_perm);
 
	/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241..5972e60 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
 
	sem_lock(sma, NULL, -1);
 
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
	sem_unlock(sma, -1);
	rcu_read_unlock();
	return -EIDRM;
@@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
	int i;
 
	sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
	err = -EIDRM;
	goto out_unlock;
	}
@@ -1361,7 +1361,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 
	rcu_read_lock();
	sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
	err = -EIDRM;
	goto out_unlock;
	}
@@ -1409,7 +1409,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
	}
	rcu_read_lock();
	sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
	err = -EIDRM;
	goto out_unlock;
	}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
	goto out_rcu_wakeup;
 
	sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
	err = -EIDRM;
	goto out_unlock;
	}
@@ -1699,7 +1699,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
	/* step 3: Acquire the lock on semaphore array */
	rcu_read_lock();
	sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
	sem_unlock(sma, -1);
	rcu_read_unlock();
	kfree(new);
@@ -1846,7 +1846,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
	error = -EIDRM;
	locknum = sem_lock(sma, sops, nsops);
-   if (sma->sem_perm.deleted)
+   /*
+* We eventually might perform the following check in a lockless
+* fashion, considering ipc_valid_object() locking constraints.
+* If nsops == 1 and there is no contention for sem_perm.lock

Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 07:46:27AM -0800, Davidlohr Bueso wrote:
> On Wed, 2013-12-18 at 10:51 -0200, Rafael Aquini wrote:
> > On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
> > > On 12/18/2013 12:28 AM, Rafael Aquini wrote:
> > > >After the locking semantics for the SysV IPC API got improved, a couple of
> > > >IPC_RMID race windows were opened because we ended up dropping the
> > > >'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> > > >The spotted races got sorted out by re-introducing the old test within
> > > >the racy critical sections.
> > > >
> > > >This patch introduces ipc_valid_object() to consolidate the way we cope with
> > > >IPC_RMID races by using the same abstraction across the API implementation.
> > > >
> > > >Signed-off-by: Rafael Aquini 
> > > >Acked-by: Rik van Riel 
> > > >Acked-by: Greg Thelen 
> > > >---
> > > >Changelog:
> > > >* v2:
> > > >  - drop assert_spin_locked() from ipc_valid_object() for less overhead
> > > a) sysv ipc is lockless wherever possible, without writing to any
> > > shared cachelines.
> > > Therefore my first reaction was: No, please leave the assert in. It
> > > will help us to catch bugs.
> > > 
> > > b) then I noticed: the assert would be a bug, the comment in front
> > > of ipc_valid_object() that the caller must hold _perm.lock is wrong:
> > > >@@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
> > > > error = -EIDRM;
> > > > locknum = sem_lock(sma, sops, nsops);
> > > >-if (sma->sem_perm.deleted)
> > > >+if (!ipc_valid_object(&sma->sem_perm))
> > > > goto out_unlock_free;
> > > simple semtimedop() operations do not acquire sem_perm.lock; they
> > > only acquire the per-semaphore lock and check that sem_perm.lock is
> > > not held. This is sufficient to prevent races with RMID.
> > > 
> > > Could you update the comment?
> > 
> > The comment for ipc_valid_object() is not entirely wrong, as holding the
> > spinlock is clearly necessary for all cases except for the one you pointed
> > out above. When I dropped the assert as Davidlohr suggested, I could then
> > have converted this one exception case (where the check can, eventually,
> > be done locklessly) too, but I did not include an exception comment at
> > that particular checkpoint. Perhaps that's what I should have done, or
> > perhaps the best thing is to just leave all that as it sits right now.
> 
> Yeah, Manfred is entirely correct - I didn't mention that sem_lock()
> tries to be fine grained about its locking, so semaphores can in fact
> not take the larger ipc lock (kern perm), but just the sem->lock
> instead. This means that ipc_valid_object() must be called either way
> with some lock held, but that assertion is indeed incorrect, not just
> redundant like I suggested before. So, I think that if you update the
> comment mentioning this corner case, then it should be ok.
>

Folks,

Before I re-submit the v3 with the commentary changes requested, I'm pasting
here what I'm planning to amend to the v2 patch:
---
diff --git a/ipc/sem.c b/ipc/sem.c
index ed0057a..23379b6 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1846,6 +1846,14 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
+   /*
+* We eventually might perform the following check in a lockless
+* fashion here, considering ipc_valid_object() locking constraints.
+* If nsops == 1 and there's no contention for sem_perm.lock, then
+* only a per-semaphore lock is held and it's OK to go on the check
+* below. More details on the fine grained locking scheme entangled
+* here, and why it's RMID race safe on comments at sem_lock()
+*/
	if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
/*
diff --git a/ipc/util.h b/ipc/util.h
index 071ed58..d05b708 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -190,7 +190,8 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
  * where the respective ipc_ids.rwsem is not being held down.
  * Checks whether the ipc object is still around or if it's gone already, as
  * ipc_rmid() may have already freed the ID while the ipc lock was spinning.
- * Needs to be called with kern_ipc_perm.lock held.
+ * Needs to be called with kern_ipc_perm.lock held -- exception made for one
+ * checkpoint case at sys_semtimedop() as noted in code commentary.
  */
 static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
 {
---

Do we need to change something else?
Looking forward to your thoughts!
-- Rafael



Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 07:46:27AM -0800, Davidlohr Bueso wrote:
> On Wed, 2013-12-18 at 10:51 -0200, Rafael Aquini wrote:
> > On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
> > > On 12/18/2013 12:28 AM, Rafael Aquini wrote:
> > > >After the locking semantics for the SysV IPC API got improved, a couple of
> > > >IPC_RMID race windows were opened because we ended up dropping the
> > > >'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> > > >The spotted races got sorted out by re-introducing the old test within
> > > >the racy critical sections.
> > > >
> > > >This patch introduces ipc_valid_object() to consolidate the way we cope with
> > > >IPC_RMID races by using the same abstraction across the API implementation.
> > > >
> > > >Signed-off-by: Rafael Aquini 
> > > >Acked-by: Rik van Riel 
> > > >Acked-by: Greg Thelen 
> > > >---
> > > >Changelog:
> > > >* v2:
> > > >  - drop assert_spin_locked() from ipc_valid_object() for less overhead
> > > a) sysv ipc is lockless wherever possible, without writing to any
> > > shared cachelines.
> > > Therefore my first reaction was: No, please leave the assert in. It
> > > will help us to catch bugs.
> > > 
> > > b) then I noticed: the assert would be a bug, the comment in front
> > > of ipc_valid_object() that the caller must hold _perm.lock is wrong:
> > > >@@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
> > > > error = -EIDRM;
> > > > locknum = sem_lock(sma, sops, nsops);
> > > >-if (sma->sem_perm.deleted)
> > > >+if (!ipc_valid_object(&sma->sem_perm))
> > > > goto out_unlock_free;
> > > simple semtimedop() operations do not acquire sem_perm.lock; they
> > > only acquire the per-semaphore lock and check that sem_perm.lock is
> > > not held. This is sufficient to prevent races with RMID.
> > > 
> > > Could you update the comment?
> > 
> > The comment for ipc_valid_object() is not entirely wrong, as holding the
> > spinlock is clearly necessary for all cases except for the one you pointed
> > out above. When I dropped the assert as Davidlohr suggested, I could then
> > have converted this one exception case (where the check can, eventually,
> > be done locklessly) too, but I did not include an exception comment at
> > that particular checkpoint. Perhaps that's what I should have done, or
> > perhaps the best thing is to just leave all that as it sits right now.
> 
> Yeah, Manfred is entirely correct - I didn't mention that sem_lock()
> tries to be fine grained about its locking, so semaphores can in fact
> not take the larger ipc lock (kern perm), but just the sem->lock
> instead. This means that ipc_valid_object() must be called either way
> with some lock held, but that assertion is indeed incorrect, not just
> redundant like I suggested before. So, I think that if you update the
> comment mentioning this corner case, then it should be ok.
>

Cool, will do it, then. But I'll do it just above the exception case, in sem.c,
so as not to cause more confusion. Does that sound good to all?

Thanks, folks!
-- Rafael 


Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 10:50:59AM -0200, Rafael Aquini wrote:
> On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
> > On 12/18/2013 12:28 AM, Rafael Aquini wrote:
> > >After the locking semantics for the SysV IPC API got improved, a couple of
> > >IPC_RMID race windows were opened because we ended up dropping the
> > >'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> > >The spotted races got sorted out by re-introducing the old test within
> > >the racy critical sections.
> > >
> > >This patch introduces ipc_valid_object() to consolidate the way we cope with
> > >IPC_RMID races by using the same abstraction across the API implementation.
> > >
> > >Signed-off-by: Rafael Aquini 
> > >Acked-by: Rik van Riel 
> > >Acked-by: Greg Thelen 
> > >---
> > >Changelog:
> > >* v2:
> > >  - drop assert_spin_locked() from ipc_valid_object() for less overhead
> > a) sysv ipc is lockless wherever possible, without writing to any
> > shared cachelines.
> > Therefore my first reaction was: No, please leave the assert in. It
> > will help us to catch bugs.
> > 
> > b) then I noticed: the assert would be a bug, the comment in front
> > of ipc_valid_object() that the caller must hold _perm.lock is wrong:
> > >@@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
> > >   error = -EIDRM;
> > >   locknum = sem_lock(sma, sops, nsops);
> > >-  if (sma->sem_perm.deleted)
> > >+  if (!ipc_valid_object(&sma->sem_perm))
> > >   goto out_unlock_free;
> > simple semtimedop() operations do not acquire sem_perm.lock; they
> > only acquire the per-semaphore lock and check that sem_perm.lock is
> > not held. This is sufficient to prevent races with RMID.
> > 
> > Could you update the comment?
> 
> The comment for ipc_valid_object() is not entirely wrong, as holding the
> spinlock is clearly necessary for all cases except for the one you pointed
> out above. When I dropped the assert as Davidlohr suggested, I could then
> have converted this one exception case (where the check can, eventually,
> be done locklessly) too, but I did not include an exception comment at
> that particular checkpoint. Perhaps that's what I should have done, or
> perhaps the best thing is to just leave all that as it sits right now.
>

Or, as a second thought, we could perhaps re-instate the assert in
ipc_valid_object(), and change only this exception checkpoint back to a
plain if (sma->sem_perm.deleted) check, adding a comment there on why it's
different from the others (roughly as sketched below).
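
Roughly, that second alternative would look like this (just a sketch of what
I mean, not a patch):

    static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
    {
            assert_spin_locked(&perm->lock);   /* re-instated assert */
            return perm->deleted == 0;
    }

    /* ... while sys_semtimedop() keeps the direct field check: */
    locknum = sem_lock(sma, sops, nsops);
    /* only a per-semaphore lock may be held here; see sem_lock() */
    if (sma->sem_perm.deleted)
            goto out_unlock_free;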


Looking forward to hearing your thoughts here!

Thanks!
-- Rafael

> 
> > [...]
> > >@@ -1116,7 +1116,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
> > >   ipc_lock_object(&shp->shm_perm);
> > >   /* check if shm_destroy() is tearing down shp */
> > >-  if (shp->shm_file == NULL) {
> > >+  if (!ipc_valid_object(&shp->shm_perm)) {
> > >   ipc_unlock_object(&shp->shm_perm);
> > >   err = -EIDRM;
> > >   goto out_unlock;
> > Please mention the change from "shm_file == NULL" to perm.deleted in
> > the changelog.
> > With regards to the impact of this change: No idea, I've never
> > worked on the shm code.
> 
> This change is, essentially, the proper way to cope with such races. Please
> refer to the following reply on this same thread, for further info:
> https://lkml.org/lkml/2013/12/17/704
> 
> Thanks!
> -- Rafael
> 


Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
> On 12/18/2013 12:28 AM, Rafael Aquini wrote:
> >After the locking semantics for the SysV IPC API got improved, a couple of
> >IPC_RMID race windows were opened because we ended up dropping the
> >'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> >The spotted races got sorted out by re-introducing the old test within
> >the racy critical sections.
> >
> >This patch introduces ipc_valid_object() to consolidate the way we cope with
> >IPC_RMID races by using the same abstraction across the API implementation.
> >
> >Signed-off-by: Rafael Aquini 
> >Acked-by: Rik van Riel 
> >Acked-by: Greg Thelen 
> >---
> >Changelog:
> >* v2:
> >  - drop assert_spin_locked() from ipc_valid_object() for less overhead
> a) sysv ipc is lockless wherever possible, without writing to any
> shared cachelines.
> Therefore my first reaction was: No, please leave the assert in. It
> will help us to catch bugs.
> 
> b) then I noticed: the assert would be a bug, the comment in front
> of ipc_valid_object() that the caller must hold _perm.lock is wrong:
> >@@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
> > error = -EIDRM;
> > locknum = sem_lock(sma, sops, nsops);
> >-if (sma->sem_perm.deleted)
> >+if (!ipc_valid_object(&sma->sem_perm))
> > goto out_unlock_free;
> simple semtimedop() operations do not acquire sem_perm.lock; they
> only acquire the per-semaphore lock and check that sem_perm.lock is
> not held. This is sufficient to prevent races with RMID.
> 
> Could you update the comment?

The comment for ipc_valid_object() is not entirely wrong, as holding the
spinlock is clearly necessary for all cases except for the one you pointed
out above. When I dropped the assert as Davidlohr suggested, I could then
have converted this one exception case (where the check can, eventually,
be done locklessly) too, but I did not include an exception comment at
that particular checkpoint. Perhaps that's what I should have done, or
perhaps the best thing is to just leave all that as it sits right now.
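
For reference, the two call-site patterns being contrasted here look roughly
like this when condensed from the hunks quoted in this thread (a sketch, not
new code):

    /* common case: the ipc object spinlock is held around the check */
    ipc_lock_object(&msq->q_perm);
    if (!ipc_valid_object(&msq->q_perm)) {
            err = -EIDRM;
            goto out_unlock0;
    }

    /* the exception: semtimedop may hold only a per-semaphore lock, and
     * sem_lock() is what keeps a racing IPC_RMID out in that case */
    locknum = sem_lock(sma, sops, nsops);
    if (!ipc_valid_object(&sma->sem_perm))
            goto out_unlock_free;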


> [...]
> >@@ -1116,7 +1116,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
> > ipc_lock_object(&shp->shm_perm);
> > /* check if shm_destroy() is tearing down shp */
> >-if (shp->shm_file == NULL) {
> >+if (!ipc_valid_object(&shp->shm_perm)) {
> > ipc_unlock_object(&shp->shm_perm);
> > err = -EIDRM;
> > goto out_unlock;
> Please mention the change from "shm_file == NULL" to perm.deleted in
> the changelog.
> With regards to the impact of this change: No idea, I've never
> worked on the shm code.

This change is, essentially, the proper way to cope with such races. Please
refer to the following reply on this same thread, for further info:
https://lkml.org/lkml/2013/12/17/704

Thanks!
-- Rafael



Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
 On 12/18/2013 12:28 AM, Rafael Aquini wrote:
 After the locking semantics for the SysV IPC API got improved, a couple of
 IPC_RMID race windows were opened because we ended up dropping the
 'kern_ipc_perm.deleted' check performed way down in ipc_lock().
 The spotted races got sorted out by re-introducing the old test within
 the racy critical sections.
 
 This patch introduces ipc_valid_object() to consolidate the way we cope with
 IPC_RMID races by using the same abstraction across the API implementation.
 
 Signed-off-by: Rafael Aquini aqu...@redhat.com
 Acked-by: Rik van Riel r...@redhat.com
 Acked-by: Greg Thelen gthe...@google.com
 ---
 Changelog:
 * v2:
   - drop assert_spin_locked() from ipc_valid_object() for less overhead
 a) sysv ipc is lockless whereever possible, without writing to any
 shared cachelines.
 Therefore my first reaction was: No, please leave the assert in. It
 will help us to catch bugs.
 
 b) then I noticed: the assert would be a bug, the comment in front
 of ipc_valid_object() that the caller must hold _perm.lock is wrong:
 @@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf 
 __user *, tsops,
  error = -EIDRM;
  locknum = sem_lock(sma, sops, nsops);
 -if (sma-sem_perm.deleted)
 +if (!ipc_valid_object(sma-sem_perm))
  goto out_unlock_free;
 simple semtimedop() operation do not acquire sem_perm.lock, they
 only acquire the per-semaphore lock and check that sem_perm.lock is
 not held. This is sufficient to prevent races with RMID.
 
 Could you update the comment?

The comment for ipc_valid_object() is not entirely wrong, as holding the 
spinlock 
is clearly necessary for all cases except for this one you pointed above. 
When I dropped the assert as Davilohr suggested, I then could have this one 
exception 
case (where the check can, eventually, be done lockless) converted too, but I 
did not include 
an exception comment at that particular checkpoint. Perhaps, that's what I 
should have done, or
perhaps the best thing is to just let all that as is sits right now.


 [...]
 @@ -1116,7 +1116,7 @@ long do_shmat(int shmid, char __user *shmaddr, int 
 shmflg, ulong *raddr,
  ipc_lock_object(shp-shm_perm);
  /* check if shm_destroy() is tearing down shp */
 -if (shp-shm_file == NULL) {
 +if (!ipc_valid_object(shp-shm_perm)) {
  ipc_unlock_object(shp-shm_perm);
  err = -EIDRM;
  goto out_unlock;
 Please mention the change from shm_file == NULL to perm.deleted in
 the changelog.
 With regards to the impact of this change: No idea, I've never
 worked on the shm code.

This change is, essentially, the proper way to cope with such races. Please
refer to the following reply on this same trhead, for further info:
https://lkml.org/lkml/2013/12/17/704

Thanks!
-- Rafael

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 10:50:59AM -0200, Rafael Aquini wrote:
 On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
  On 12/18/2013 12:28 AM, Rafael Aquini wrote:
  After the locking semantics for the SysV IPC API got improved, a couple of
  IPC_RMID race windows were opened because we ended up dropping the
  'kern_ipc_perm.deleted' check performed way down in ipc_lock().
  The spotted races got sorted out by re-introducing the old test within
  the racy critical sections.
  
  This patch introduces ipc_valid_object() to consolidate the way we cope 
  with
  IPC_RMID races by using the same abstraction across the API implementation.
  
  Signed-off-by: Rafael Aquini aqu...@redhat.com
  Acked-by: Rik van Riel r...@redhat.com
  Acked-by: Greg Thelen gthe...@google.com
  ---
  Changelog:
  * v2:
- drop assert_spin_locked() from ipc_valid_object() for less overhead
  a) sysv ipc is lockless whereever possible, without writing to any
  shared cachelines.
  Therefore my first reaction was: No, please leave the assert in. It
  will help us to catch bugs.
  
  b) then I noticed: the assert would be a bug, the comment in front
  of ipc_valid_object() that the caller must hold _perm.lock is wrong:
  @@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct 
  sembuf __user *, tsops,
 error = -EIDRM;
 locknum = sem_lock(sma, sops, nsops);
  -  if (sma-sem_perm.deleted)
  +  if (!ipc_valid_object(sma-sem_perm))
 goto out_unlock_free;
  simple semtimedop() operation do not acquire sem_perm.lock, they
  only acquire the per-semaphore lock and check that sem_perm.lock is
  not held. This is sufficient to prevent races with RMID.
  
  Could you update the comment?
 
 The comment for ipc_valid_object() is not entirely wrong, as holding the 
 spinlock 
 is clearly necessary for all cases except for this one you pointed above. 
 When I dropped the assert as Davilohr suggested, I then could have this one 
 exception 
 case (where the check can, eventually, be done lockless) converted too, but I 
 did not include 
 an exception comment at that particular checkpoint. Perhaps, that's what I 
 should have done, or
 perhaps the best thing is to just let all that as is sits right now.


Or, as a second thought, we could perhaps re-instate the assert in
ipc_valid_object(), and change only this exception checkpoint back to a
if (sma-sem_perm.deleted) case, adding a comment there on why it's different
from the others.


Looking up to hear your thoughts here!

Thanks!
-- Rafael
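
For reference, the v1 shape of the helper being debated, reconstructed from
the v2 changelog note above (a sketch, not a verbatim quote of the v1
posting):

static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
{
	/*
	 * Dropped in v2: this assert would falsely trip on the simple
	 * semtimedop() fast path, which holds only a per-semaphore lock
	 * even though its RMID check there is still race-safe.
	 */
	assert_spin_locked(&perm->lock);
	return perm->deleted == 0;
}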

 
  [...]
  @@ -1116,7 +1116,7 @@ long do_shmat(int shmid, char __user *shmaddr, int 
  shmflg, ulong *raddr,
 ipc_lock_object(&shp->shm_perm);
 /* check if shm_destroy() is tearing down shp */
  -  if (shp->shm_file == NULL) {
  +  if (!ipc_valid_object(&shp->shm_perm)) {
 ipc_unlock_object(&shp->shm_perm);
 err = -EIDRM;
 goto out_unlock;
  Please mention the change from shm_file == NULL to perm.deleted in
  the changelog.
  With regards to the impact of this change: No idea, I've never
  worked on the shm code.
 
 This change is, essentially, the proper way to cope with such races. Please
 refer to the following reply on this same thread for further info:
 https://lkml.org/lkml/2013/12/17/704
 
 Thanks!
 -- Rafael
 


Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 07:46:27AM -0800, Davidlohr Bueso wrote:
 On Wed, 2013-12-18 at 10:51 -0200, Rafael Aquini wrote:
  On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
   On 12/18/2013 12:28 AM, Rafael Aquini wrote:
   After the locking semantics for the SysV IPC API got improved, a couple 
   of
   IPC_RMID race windows were opened because we ended up dropping the
   'kern_ipc_perm.deleted' check performed way down in ipc_lock().
   The spotted races got sorted out by re-introducing the old test within
   the racy critical sections.
   
   This patch introduces ipc_valid_object() to consolidate the way we cope 
   with
   IPC_RMID races by using the same abstraction across the API 
   implementation.
   
   Signed-off-by: Rafael Aquini aqu...@redhat.com
   Acked-by: Rik van Riel r...@redhat.com
   Acked-by: Greg Thelen gthe...@google.com
   ---
   Changelog:
   * v2:
 - drop assert_spin_locked() from ipc_valid_object() for less overhead
   a) sysv ipc is lockless wherever possible, without writing to any
   shared cachelines.
   Therefore my first reaction was: No, please leave the assert in. It
   will help us to catch bugs.
   
   b) then I noticed: the assert would be a bug, the comment in front
   of ipc_valid_object() that the caller must hold _perm.lock is wrong:
   @@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct 
   sembuf __user *, tsops,
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
   -if (sma->sem_perm.deleted)
   +if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
   simple semtimedop() operations do not acquire sem_perm.lock, they
   only acquire the per-semaphore lock and check that sem_perm.lock is
   not held. This is sufficient to prevent races with RMID.
   
   Could you update the comment?
  
  The comment for ipc_valid_object() is not entirely wrong, as holding the
  spinlock is clearly necessary for all cases except the one you pointed out
  above. When I dropped the assert as Davidlohr suggested, I could have
  converted this one exception case (where the check can, eventually, be done
  lockless) too, but I did not include an exception comment at that particular
  checkpoint. Perhaps that's what I should have done, or perhaps the best
  thing is to just leave it all as it sits right now.
 
 Yeah, Manfred is entirely correct - I didn't mention that sem_lock()
 tries to be fine grained about its locking, so semaphores can in fact
 skip taking the larger ipc lock (kern_ipc_perm.lock) and take just the
 per-semaphore sem->lock instead. This means that ipc_valid_object() must
 be called either way with some lock held, but that assertion is indeed
 incorrect, not just redundant like I suggested before. So, I think that
 if you update the comment mentioning this corner case, then it should be ok.


Cool, will do it, then. But I'll do it just above the exception case, in sem.c,
to avoid causing more confusion. Does that sound good to all?

Thanks, folks!
-- Rafael 


Re: [PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 07:46:27AM -0800, Davidlohr Bueso wrote:
 On Wed, 2013-12-18 at 10:51 -0200, Rafael Aquini wrote:
  On Wed, Dec 18, 2013 at 01:11:29PM +0100, Manfred Spraul wrote:
   On 12/18/2013 12:28 AM, Rafael Aquini wrote:
   After the locking semantics for the SysV IPC API got improved, a couple 
   of
   IPC_RMID race windows were opened because we ended up dropping the
   'kern_ipc_perm.deleted' check performed way down in ipc_lock().
   The spotted races got sorted out by re-introducing the old test within
   the racy critical sections.
   
   This patch introduces ipc_valid_object() to consolidate the way we cope 
   with
   IPC_RMID races by using the same abstraction across the API 
   implementation.
   
   Signed-off-by: Rafael Aquini aqu...@redhat.com
   Acked-by: Rik van Riel r...@redhat.com
   Acked-by: Greg Thelen gthe...@google.com
   ---
   Changelog:
   * v2:
 - drop assert_spin_locked() from ipc_valid_object() for less overhead
    a) sysv ipc is lockless wherever possible, without writing to any
   shared cachelines.
   Therefore my first reaction was: No, please leave the assert in. It
   will help us to catch bugs.
   
   b) then I noticed: the assert would be a bug, the comment in front
   of ipc_valid_object() that the caller must hold _perm.lock is wrong:
    @@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct 
    sembuf __user *, tsops,
 error = -EIDRM;
 locknum = sem_lock(sma, sops, nsops);
    -if (sma->sem_perm.deleted)
    +if (!ipc_valid_object(&sma->sem_perm))
 goto out_unlock_free;
    simple semtimedop() operations do not acquire sem_perm.lock, they
   only acquire the per-semaphore lock and check that sem_perm.lock is
   not held. This is sufficient to prevent races with RMID.
   
   Could you update the comment?
  
   The comment for ipc_valid_object() is not entirely wrong, as holding the
   spinlock is clearly necessary for all cases except the one you pointed out
   above. When I dropped the assert as Davidlohr suggested, I could have
   converted this one exception case (where the check can, eventually, be
   done lockless) too, but I did not include an exception comment at that
   particular checkpoint. Perhaps that's what I should have done, or perhaps
   the best thing is to just leave it all as it sits right now.
 
 Yeah, Manfred is entirely correct - I didn't mention that sem_lock()
 tries to be fine grained about its locking, so semaphores can in fact
 skip taking the larger ipc lock (kern_ipc_perm.lock) and take just the
 per-semaphore sem->lock instead. This means that ipc_valid_object() must
 be called either way with some lock held, but that assertion is indeed
 incorrect, not just redundant like I suggested before. So, I think that
 if you update the comment mentioning this corner case, then it should be ok.


Folks,

Before I re-submit the v3 with the commentary changes requested, I'm pasting
here what I'm planning to amend to the v2 patch:
---
diff --git a/ipc/sem.c b/ipc/sem.c
index ed0057a..23379b6 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1846,6 +1846,14 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
+   /*
+* We eventually might perform the following check in a lockless
+* fashion here, considering ipc_valid_object() locking constraints.
+* If nsops == 1 and there's no contention for sem_perm.lock, then
+* only a per-semaphore lock is held and it's OK to go on the check
+* below. More details on the fine grained locking scheme entangled
+* here, and why it's RMID race safe on comments at sem_lock()
+*/
if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
/*
diff --git a/ipc/util.h b/ipc/util.h
index 071ed58..d05b708 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -190,7 +190,8 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm)
  * where the respective ipc_ids.rwsem is not being held down.
  * Checks whether the ipc object is still around or if it's gone already, as
  * ipc_rmid() may have already freed the ID while the ipc lock was spinning.
- * Needs to be called with kern_ipc_perm.lock held.
+ * Needs to be called with kern_ipc_perm.lock held -- exception made for one
+ * checkpoint case at sys_semtimedop() as noted in code commentary.
  */
 static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
 {
---

Do we need to change something else?
Looking forward to your thoughts!
-- Rafael
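
The nsops == 1 case that comment covers is easy to exercise from userspace; a
minimal illustrative sketch (not part of the patch): a single-sembuf semop()
is precisely the path where the kernel takes only the per-semaphore lock, and
a racing IPC_RMID must still surface as an error.

#include <stdio.h>
#include <pthread.h>
#include <sys/ipc.h>
#include <sys/sem.h>

static int id;

static void *waiter(void *arg)
{
	struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };

	if (semop(id, &op, 1) < 0)	/* nsops == 1: the fast path */
		perror("semop");	/* EIDRM (or EINVAL) when RMID wins */
	return NULL;
}

int main(void)
{
	pthread_t t;

	id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	pthread_create(&t, NULL, waiter, NULL);
	semctl(id, 0, IPC_RMID);	/* concurrent removal */
	pthread_join(t, NULL);
	return 0;
}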



[PATCH v3] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
After the locking semantics for the SysV IPC API got improved, a couple of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope with
IPC_RMID races by using the same abstraction across the API implementation.

Signed-off-by: Rafael Aquini aqu...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Acked-by: Greg Thelen gthe...@google.com
---
Changelog:
* v3:
 - code commentary changes as requested by reviewers

* v2:
 - drop assert_spin_locked() from ipc_valid_object() for less overhead
 - extend ipc_valid_object() usage in sem.c (not spotted checkpoints)
 - keep the received ACKs

 ipc/msg.c  |  7 ---
 ipc/sem.c  | 24 
 ipc/shm.c  | 16 
 ipc/util.h | 13 +
 4 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91..8983ea5 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_lock_object(&msq->q_perm);
 
ipc_rcu_putref(msq, ipc_rcu_free);
-   if (msq->q_perm.deleted) {
+   /* raced with RMID? */
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, 
long msgtyp, int msgfl
ipc_lock_object(&msq->q_perm);
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241..5972e60 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int 
semid, int semnum,
 
sem_lock(sma, NULL, -1);
 
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
return -EIDRM;
@@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
int i;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1361,7 +1361,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
 
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1409,7 +1409,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
}
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
goto out_rcu_wakeup;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1699,7 +1699,7 @@ static struct sem_undo *find_alloc_undo(struct 
ipc_namespace *ns, int semid)
/* step 3: Acquire the lock on semaphore array */
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
kfree(new);
@@ -1846,7 +1846,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf 
__user *, tsops,
 
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
-   if (sma->sem_perm.deleted)
+   /*
+* We eventually might perform the following check in a lockless
+* fashion, considering ipc_valid_object() locking constraints.
+* If nsops == 1 and there is no contention

Re: [PATCH v3] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-18 Thread Rafael Aquini
On Wed, Dec 18, 2013 at 04:38:24PM -0800, Davidlohr Bueso wrote:
 On Wed, 2013-12-18 at 18:33 -0200, Rafael Aquini wrote:
  After the locking semantics for the SysV IPC API got improved, a couple of
  IPC_RMID race windows were opened because we ended up dropping the
  'kern_ipc_perm.deleted' check performed way down in ipc_lock().
  The spotted races got sorted out by re-introducing the old test within
  the racy critical sections.
  
  This patch introduces ipc_valid_object() to consolidate the way we cope with
  IPC_RMID races by using the same abstraction across the API implementation.
  
  Signed-off-by: Rafael Aquini aqu...@redhat.com
  Acked-by: Rik van Riel r...@redhat.com
  Acked-by: Greg Thelen gthe...@google.com
 
 Reviewed-by: Davidlohr Bueso davidl...@hp.com
 
 [...]
 
  +/*
  + * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths
  + * where the respective ipc_ids.rwsem is not being held down.
  + * Checks whether the ipc object is still around or if it's gone already, 
  as
  + * ipc_rmid() may have already freed the ID while the ipc lock was 
  spinning.
  + * Needs to be called with kern_ipc_perm.lock held -- exception made for 
  one
  + * checkpoint case at sys_semtimedop() as noted in code commentary.
  + */
  +static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
  +{
  +   return perm->deleted == 0;
  +}
 
 I would like to see .deleted being converted to bool while we're at it
 though, that return statement just bugs the hell out of me. Could you
 send a patch for that as well?


Sure, as I mentioned earlier, the full .deleted conversion from int to bool
is already on my todo list for a follow-up patch.

Thanks!
-- Rafael 
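
The conversion being asked for would amount to roughly this (a hypothetical
sketch of the yet-unposted follow-up; struct layout abbreviated):

/* include/linux/ipc.h (sketch) */
struct kern_ipc_perm {
	spinlock_t	lock;
	bool		deleted;	/* was: int deleted; */
	/* ... remaining fields unchanged ... */
};

/* ipc/util.h: the helper then reads naturally */
static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
{
	return !perm->deleted;		/* was: perm->deleted == 0 */
}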


[PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
After the locking semantics for the SysV IPC API got improved, a couple of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope with
IPC_RMID races by using the same abstraction across the API implementation.

Signed-off-by: Rafael Aquini 
Acked-by: Rik van Riel 
Acked-by: Greg Thelen 
---
Changelog:
* v2:
 - drop assert_spin_locked() from ipc_valid_object() for less overhead
 - extend ipc_valid_object() usage in sem.c (not spotted checkpoints)
 - keep the received ACKs

 ipc/msg.c  |  7 ---
 ipc/sem.c  | 16 
 ipc/shm.c  | 16 
 ipc/util.h | 12 
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91..8983ea5 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_lock_object(&msq->q_perm);
 
ipc_rcu_putref(msq, ipc_rcu_free);
-   if (msq->q_perm.deleted) {
+   /* raced with RMID? */
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, 
long msgtyp, int msgfl
ipc_lock_object(&msq->q_perm);
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241..ed0057a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int 
semid, int semnum,
 
sem_lock(sma, NULL, -1);
 
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
return -EIDRM;
@@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
int i;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1361,7 +1361,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
 
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1409,7 +1409,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
}
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
goto out_rcu_wakeup;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1699,7 +1699,7 @@ static struct sem_undo *find_alloc_undo(struct 
ipc_namespace *ns, int semid)
/* step 3: Acquire the lock on semaphore array */
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
kfree(new);
@@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf 
__user *, tsops,
 
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
-   if (sma->sem_perm.deleted)
+   if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
/*
 * semid identifiers are not unique - find_alloc_undo may have
@@ -2068,7 +2068,7 @@ void exit_sem(struct task_struct *tsk)
 
sem_lock(sma, NULL, -1);
/* exit

Re: [PATCH] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
On Tue, Dec 17, 2013 at 02:18:02PM -0800, Davidlohr Bueso wrote:
> On Tue, 2013-12-17 at 19:46 -0200, Rafael Aquini wrote:
> > On Tue, Dec 17, 2013 at 01:27:49PM -0800, Davidlohr Bueso wrote:
> > > Ccing Manfred.
> > > 
> > > On Tue, 2013-12-17 at 17:03 -0200, Rafael Aquini wrote:
> > > > After the locking semantics for the SysV IPC API got improved, a couple 
> > > > of
> > > > IPC_RMID race windows were opened because we ended up dropping the
> > > > 'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> > > > The spotted races got sorted out by re-introducing the old test within
> > > > the racy critical sections.
> > > > 
> > > > This patch introduces ipc_valid_object() to consolidate the way we cope 
> > > > with
> > > > IPC_RMID races by using the same abstraction across the API 
> > > > implementation.
> > > 
> > > This is certainly a good function to have. Some comments below.
> > > 
> [...]
> > > > 
> > > > shm_file = shp->shm_file;
> > > > -
> > > > -   /* check if shm_destroy() is tearing down shp */
> > > > -   if (shm_file == NULL) {
> > > > -   err = -EIDRM;
> > > > -   goto out_unlock0;
> > > > -   }
> > > 
> > > Ok, this seems safe, we can always rely on .deleted for validity since
> > > shm_destroy() ends up calling shm_rmid() which sets .deleted. So this
> > > change is really moving what we're checking against just a few
> > > instructions later.
> > >
> > 
> > Yep, I did change it because it seems that there's no reason to delay the 
> > return
> > condition if we raced with shm_destroy(), anyways.
> >  
> 
> Right, but I was referring to moving what we consider as valid.
> 
> static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
> {
>   struct file *shm_file;
> 
>   shm_file = shp->shm_file;
>   shp->shm_file = NULL;   <--- we currently use this.
>   ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
>   shm_rmid(ns, shp); <--- with your patch we now use this.
>   shm_unlock(shp);
>   ...
> }
> 
> ... and it makes sense, since shm was the only one not using .deleted for
> RMID racing checks.
>

Oh, I see. As a matter of fact, what made shm start using a different check
than kern_ipc_perm.deleted, as you point out, was the following commit:
---8<--
commit a399b29dfbaaaf91162b2dc5a5875dd51bbfa2a1
Author: Greg Thelen 
Date:   Thu Nov 21 14:32:00 2013 -0800

ipc,shm: fix shm_file deletion races
--->8---

Although it closes the spotted race properly, Greg's commit also implies (by
the way it works around the race) that the race has always been there, which
is not true.
OTOH, I didn't propose reverting Greg's commit because I thought the changes
it introduced to shm_destroy() might come in handy to help one spot similar
races, if they eventually pop out in the future.
(a cleanup patch can be sent later, if that hunk is not regarded as useful
anymore)

Thanks!
-- Rafael
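
The cleanup alluded to would essentially retire Greg's NULL marker now that
callers test .deleted instead; a hypothetical sketch, following the
shm_destroy() excerpt quoted earlier in this thread (no such patch has been
posted):

static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	struct file *shm_file;

	shm_file = shp->shm_file;
	/* shp->shm_file = NULL would be dropped here: racing paths now
	 * test ipc_valid_object(&shp->shm_perm), which keys off the
	 * .deleted flag set by shm_rmid() below. */
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
	shm_rmid(ns, shp);
	shm_unlock(shp);
	/* ... remainder of the teardown (fput(shm_file), etc.) unchanged ... */
}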


Re: [PATCH] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
On Tue, Dec 17, 2013 at 01:27:49PM -0800, Davidlohr Bueso wrote:
> Ccing Manfred.
> 
> On Tue, 2013-12-17 at 17:03 -0200, Rafael Aquini wrote:
> > After the locking semantics for the SysV IPC API got improved, a couple of
> > IPC_RMID race windows were opened because we ended up dropping the
> > 'kern_ipc_perm.deleted' check performed way down in ipc_lock().
> > The spotted races got sorted out by re-introducing the old test within
> > the racy critical sections.
> > 
> > This patch introduces ipc_valid_object() to consolidate the way we cope with
> > IPC_RMID races by using the same abstraction across the API implementation.
> 
> This is certainly a good function to have. Some comments below.
> 
> > 
> > Signed-off-by: Rafael Aquini 
> > ---
> >  ipc/msg.c  |  7 ---
> >  ipc/sem.c  |  8 
> >  ipc/shm.c  | 16 
> >  ipc/util.h | 13 +
> >  4 files changed, 29 insertions(+), 15 deletions(-)
> > 
> > diff --git a/ipc/msg.c b/ipc/msg.c
> > index 558aa91..8983ea5 100644
> > --- a/ipc/msg.c
> > +++ b/ipc/msg.c
> > @@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user 
> > *mtext,
> > goto out_unlock0;
> >  
> > /* raced with RMID? */
> > -   if (msq->q_perm.deleted) {
> > +   if (!ipc_valid_object(&msq->q_perm)) {
> > err = -EIDRM;
> > goto out_unlock0;
> > }
> > @@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user 
> > *mtext,
> > ipc_lock_object(&msq->q_perm);
> >  
> > ipc_rcu_putref(msq, ipc_rcu_free);
> > -   if (msq->q_perm.deleted) {
> > +   /* raced with RMID? */
> > +   if (!ipc_valid_object(&msq->q_perm)) {
> > err = -EIDRM;
> > goto out_unlock0;
> > }
> > @@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t 
> > bufsz, long msgtyp, int msgfl
> > ipc_lock_object(&msq->q_perm);
> >  
> > /* raced with RMID? */
> > -   if (msq->q_perm.deleted) {
> > +   if (!ipc_valid_object(&msq->q_perm)) {
> > msg = ERR_PTR(-EIDRM);
> > goto out_unlock0;
> > }
> > diff --git a/ipc/sem.c b/ipc/sem.c
> > index db9d241..f4fad32 100644
> > --- a/ipc/sem.c
> > +++ b/ipc/sem.c
> > @@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, 
> > int semid, int semnum,
> >  
> > sem_lock(sma, NULL, -1);
> >  
> > -   if (sma->sem_perm.deleted) {
> > +   if (!ipc_valid_object(&sma->sem_perm)) {
> > sem_unlock(sma, -1);
> > rcu_read_unlock();
> > return -EIDRM;
> > @@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
> > semid, int semnum,
> > int i;
> >  
> > sem_lock(sma, NULL, -1);
> > -   if (sma->sem_perm.deleted) {
> > +   if (!ipc_valid_object(&sma->sem_perm)) {
> > err = -EIDRM;
> > goto out_unlock;
> > }
> > @@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
> > semid, int semnum,
> > goto out_rcu_wakeup;
> >  
> > sem_lock(sma, NULL, -1);
> > -   if (sma->sem_perm.deleted) {
> > +   if (!ipc_valid_object(&sma->sem_perm)) {
> > err = -EIDRM;
> > goto out_unlock;
> > }
> > @@ -2068,7 +2068,7 @@ void exit_sem(struct task_struct *tsk)
> >  
> > sem_lock(sma, NULL, -1);
> > /* exit_sem raced with IPC_RMID, nothing to do */
> > -   if (sma->sem_perm.deleted) {
> > +   if (!ipc_valid_object(&sma->sem_perm)) {
> > sem_unlock(sma, -1);
> > rcu_read_unlock();
> > continue;
> > diff --git a/ipc/shm.c b/ipc/shm.c
> > index 7a51443..1bc68f1 100644
> > --- a/ipc/shm.c
> > +++ b/ipc/shm.c
> > @@ -975,6 +975,13 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
> > shmid_ds __user *, buf)
> > goto out_unlock1;
> >  
> > ipc_lock_object(&shp->shm_perm);
> > +
> > +   /* check if shm_destroy() is tearing down shp */
> > +   if (!ipc_valid_object(&shp->shm_perm)) {
> > +   err = -EIDRM;
> > + 

[PATCH] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
After the locking semantics for the SysV IPC API got improved, a couple of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope with
IPC_RMID races by using the same abstraction across the API implementation.

Signed-off-by: Rafael Aquini 
---
 ipc/msg.c  |  7 ---
 ipc/sem.c  |  8 
 ipc/shm.c  | 16 
 ipc/util.h | 13 +
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91..8983ea5 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_lock_object(&msq->q_perm);
 
ipc_rcu_putref(msq, ipc_rcu_free);
-   if (msq->q_perm.deleted) {
+   /* raced with RMID? */
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, 
long msgtyp, int msgfl
ipc_lock_object(&msq->q_perm);
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241..f4fad32 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int 
semid, int semnum,
 
sem_lock(sma, NULL, -1);
 
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
return -EIDRM;
@@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
int i;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
goto out_rcu_wakeup;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -2068,7 +2068,7 @@ void exit_sem(struct task_struct *tsk)
 
sem_lock(sma, NULL, -1);
/* exit_sem raced with IPC_RMID, nothing to do */
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
continue;
diff --git a/ipc/shm.c b/ipc/shm.c
index 7a51443..1bc68f1 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -975,6 +975,13 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
shmid_ds __user *, buf)
goto out_unlock1;
 
ipc_lock_object(&shp->shm_perm);
+
+   /* check if shm_destroy() is tearing down shp */
+   if (!ipc_valid_object(&shp->shm_perm)) {
+   err = -EIDRM;
+   goto out_unlock0;
+   }
+
if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
kuid_t euid = current_euid();
if (!uid_eq(euid, shp->shm_perm.uid) &&
@@ -989,13 +996,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
shmid_ds __user *, buf)
}
 
shm_file = shp->shm_file;
-
-   /* check if shm_destroy() is tearing down shp */
-   if (shm_file == NULL) {
-   err = -EIDRM;
-   goto out_unlock0;
-   }
-
if (is_file_hugepages(shm_file))
goto out_unlock0;
 
@@ -1116,7 +1116,7 @@ long do_shmat(int shmid, char __user *shmaddr, int 
shmflg, ulong *raddr,
ipc_lock_object(&shp->shm_perm);
 
/* check if shm_destroy() is tearing down shp */
-   if (shp->shm_file == NULL) {
+   if (!ipc_valid_object(&shp->shm_perm)) {
ipc_unlock_object(&shp->shm_perm);
err = -EIDRM;
goto out_unlock;
diff --git

[PATCH] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
After the locking semantics for the SysV IPC API got improved, a couple of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope with
IPC_RMID races by using the same abstraction across the API implementation.

Signed-off-by: Rafael Aquini aqu...@redhat.com
---
 ipc/msg.c  |  7 ---
 ipc/sem.c  |  8 
 ipc/shm.c  | 16 
 ipc/util.h | 13 +
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91..8983ea5 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_lock_object(&msq->q_perm);
 
ipc_rcu_putref(msq, ipc_rcu_free);
-   if (msq->q_perm.deleted) {
+   /* raced with RMID? */
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, 
long msgtyp, int msgfl
ipc_lock_object(&msq->q_perm);
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241..f4fad32 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int 
semid, int semnum,
 
sem_lock(sma, NULL, -1);
 
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
return -EIDRM;
@@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
int i;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
goto out_rcu_wakeup;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -2068,7 +2068,7 @@ void exit_sem(struct task_struct *tsk)
 
sem_lock(sma, NULL, -1);
/* exit_sem raced with IPC_RMID, nothing to do */
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
continue;
diff --git a/ipc/shm.c b/ipc/shm.c
index 7a51443..1bc68f1 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -975,6 +975,13 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
shmid_ds __user *, buf)
goto out_unlock1;
 
ipc_lock_object(&shp->shm_perm);
+
+   /* check if shm_destroy() is tearing down shp */
+   if (!ipc_valid_object(&shp->shm_perm)) {
+   err = -EIDRM;
+   goto out_unlock0;
+   }
+
if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
kuid_t euid = current_euid();
if (!uid_eq(euid, shp->shm_perm.uid) &&
@@ -989,13 +996,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
shmid_ds __user *, buf)
}
 
shm_file = shp->shm_file;
-
-   /* check if shm_destroy() is tearing down shp */
-   if (shm_file == NULL) {
-   err = -EIDRM;
-   goto out_unlock0;
-   }
-
if (is_file_hugepages(shm_file))
goto out_unlock0;
 
@@ -1116,7 +1116,7 @@ long do_shmat(int shmid, char __user *shmaddr, int 
shmflg, ulong *raddr,
ipc_lock_object(&shp->shm_perm);
 
/* check if shm_destroy() is tearing down shp */
-   if (shp->shm_file == NULL) {
+   if (!ipc_valid_object(&shp->shm_perm)) {
ipc_unlock_object(&shp->shm_perm);
err = -EIDRM;
goto out_unlock;
diff --git a/ipc/util.h b/ipc/util.h
index

Re: [PATCH] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
On Tue, Dec 17, 2013 at 01:27:49PM -0800, Davidlohr Bueso wrote:
 Ccing Manfred.
 
 On Tue, 2013-12-17 at 17:03 -0200, Rafael Aquini wrote:
  After the locking semantics for the SysV IPC API got improved, a couple of
  IPC_RMID race windows were opened because we ended up dropping the
  'kern_ipc_perm.deleted' check performed way down in ipc_lock().
  The spotted races got sorted out by re-introducing the old test within
  the racy critical sections.
  
  This patch introduces ipc_valid_object() to consolidate the way we cope with
  IPC_RMID races by using the same abstraction across the API implementation.
 
 This is certainly a good function to have. Some comments below.
 
  
  Signed-off-by: Rafael Aquini aqu...@redhat.com
  ---
   ipc/msg.c  |  7 ---
   ipc/sem.c  |  8 
   ipc/shm.c  | 16 
   ipc/util.h | 13 +
   4 files changed, 29 insertions(+), 15 deletions(-)
  
  diff --git a/ipc/msg.c b/ipc/msg.c
  index 558aa91..8983ea5 100644
  --- a/ipc/msg.c
  +++ b/ipc/msg.c
  @@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user 
  *mtext,
  goto out_unlock0;
   
  /* raced with RMID? */
  -   if (msq->q_perm.deleted) {
  +   if (!ipc_valid_object(&msq->q_perm)) {
  err = -EIDRM;
  goto out_unlock0;
  }
  @@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user 
  *mtext,
  ipc_lock_object(&msq->q_perm);
   
  ipc_rcu_putref(msq, ipc_rcu_free);
  -   if (msq->q_perm.deleted) {
  +   /* raced with RMID? */
  +   if (!ipc_valid_object(&msq->q_perm)) {
  err = -EIDRM;
  goto out_unlock0;
  }
  @@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t 
  bufsz, long msgtyp, int msgfl
  ipc_lock_object(&msq->q_perm);
   
  /* raced with RMID? */
  -   if (msq->q_perm.deleted) {
  +   if (!ipc_valid_object(&msq->q_perm)) {
  msg = ERR_PTR(-EIDRM);
  goto out_unlock0;
  }
  diff --git a/ipc/sem.c b/ipc/sem.c
  index db9d241..f4fad32 100644
  --- a/ipc/sem.c
  +++ b/ipc/sem.c
  @@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, 
  int semid, int semnum,
   
  sem_lock(sma, NULL, -1);
   
  -   if (sma->sem_perm.deleted) {
  +   if (!ipc_valid_object(&sma->sem_perm)) {
  sem_unlock(sma, -1);
  rcu_read_unlock();
  return -EIDRM;
  @@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
  semid, int semnum,
  int i;
   
  sem_lock(sma, NULL, -1);
  -   if (sma->sem_perm.deleted) {
  +   if (!ipc_valid_object(&sma->sem_perm)) {
  err = -EIDRM;
  goto out_unlock;
  }
  @@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
  semid, int semnum,
  goto out_rcu_wakeup;
   
  sem_lock(sma, NULL, -1);
  -   if (sma->sem_perm.deleted) {
  +   if (!ipc_valid_object(&sma->sem_perm)) {
  err = -EIDRM;
  goto out_unlock;
  }
  @@ -2068,7 +2068,7 @@ void exit_sem(struct task_struct *tsk)
   
  sem_lock(sma, NULL, -1);
  /* exit_sem raced with IPC_RMID, nothing to do */
  -   if (sma->sem_perm.deleted) {
  +   if (!ipc_valid_object(&sma->sem_perm)) {
  sem_unlock(sma, -1);
  rcu_read_unlock();
  continue;
  diff --git a/ipc/shm.c b/ipc/shm.c
  index 7a51443..1bc68f1 100644
  --- a/ipc/shm.c
  +++ b/ipc/shm.c
  @@ -975,6 +975,13 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
  shmid_ds __user *, buf)
  goto out_unlock1;
   
  ipc_lock_object(&shp->shm_perm);
  +
  +   /* check if shm_destroy() is tearing down shp */
  +   if (!ipc_valid_object(&shp->shm_perm)) {
  +   err = -EIDRM;
  +   goto out_unlock0;
  +   }
  +
  if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
  kuid_t euid = current_euid();
  if (!uid_eq(euid, shp->shm_perm.uid) &&
  @@ -989,13 +996,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct 
  shmid_ds __user *, buf)
  }
   
  shm_file = shp->shm_file;
  -
  -   /* check if shm_destroy() is tearing down shp */
  -   if (shm_file == NULL) {
  -   err = -EIDRM;
  -   goto out_unlock0;
  -   }
 
 Ok, this seems safe, we can always rely on .deleted for validity since
 shm_destroy() ends up calling shm_rmid() which sets .deleted. So this
 change is really moving what we're checking against just a few
 instructions later.


Yep, I did change it because it seems that there's no reason to delay the return
condition if we raced

Re: [PATCH] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
On Tue, Dec 17, 2013 at 02:18:02PM -0800, Davidlohr Bueso wrote:
 On Tue, 2013-12-17 at 19:46 -0200, Rafael Aquini wrote:
  On Tue, Dec 17, 2013 at 01:27:49PM -0800, Davidlohr Bueso wrote:
   Ccing Manfred.
   
   On Tue, 2013-12-17 at 17:03 -0200, Rafael Aquini wrote:
After the locking semantics for the SysV IPC API got improved, a couple 
of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope 
with
IPC_RMID races by using the same abstraction across the API 
implementation.
   
   This is certainly a good function to have. Some comments below.
   
 [...]

shm_file = shp->shm_file;
-
-   /* check if shm_destroy() is tearing down shp */
-   if (shm_file == NULL) {
-   err = -EIDRM;
-   goto out_unlock0;
-   }
   
   Ok, this seems safe, we can always rely on .deleted for validity since
   shm_destroy() ends up calling shm_rmid() which sets .deleted. So this
   change is really moving what we're checking against just a few
   instructions later.
  
  
  Yep, I did change it because it seems that there's no reason to delay the 
  return
  condition if we raced with shm_destroy(), anyways.
   
 
 Right, but I was referring to moving what we consider as valid.
 
 static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
 {
   struct file *shm_file;
 
   shm_file = shp->shm_file;
   shp->shm_file = NULL;   <--- we currently use this.
   ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
   shm_rmid(ns, shp); <--- with your patch we now use this.
   shm_unlock(shp);
   ...
 }
 
 ... and it makes sense, since shm was the only one not using .deleted for
 RMID racing checks.


Oh, I see. As a matter of fact, what made shm start using a different check
than kern_ipc_perm.deleted, as you point out, was the following commit:
---8<--
commit a399b29dfbaaaf91162b2dc5a5875dd51bbfa2a1
Author: Greg Thelen gthe...@google.com
Date:   Thu Nov 21 14:32:00 2013 -0800

ipc,shm: fix shm_file deletion races
--->8---

Although it closes the spotted race properly, Greg's commit also implies (by
the way it works around the race) that the race has always been there, which
is not true.
OTOH, I didn't propose reverting Greg's commit because I thought the changes
it introduced to shm_destroy() might come in handy to help one spot similar
races, if they eventually pop out in the future.
(a cleanup patch can be sent later, if that hunk is not regarded as useful
anymore)

Thanks!
-- Rafael


[PATCH v2] ipc: introduce ipc_valid_object() helper to sort out IPC_RMID races

2013-12-17 Thread Rafael Aquini
After the locking semantics for the SysV IPC API got improved, a couple of
IPC_RMID race windows were opened because we ended up dropping the
'kern_ipc_perm.deleted' check performed way down in ipc_lock().
The spotted races got sorted out by re-introducing the old test within
the racy critical sections.

This patch introduces ipc_valid_object() to consolidate the way we cope with
IPC_RMID races by using the same abstraction across the API implementation.

Signed-off-by: Rafael Aquini aqu...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Acked-by: Greg Thelen gthe...@google.com
---
Changelog:
* v2:
 - drop assert_spin_locked() from ipc_valid_object() for less overhead
 - extend ipc_valid_object() usage in sem.c (not spotted checkpoints)
 - keep the received ACKs

 ipc/msg.c  |  7 ---
 ipc/sem.c  | 16 
 ipc/shm.c  | 16 
 ipc/util.h | 12 
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index 558aa91..8983ea5 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -696,7 +696,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
goto out_unlock0;
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -731,7 +731,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_lock_object(&msq->q_perm);
 
ipc_rcu_putref(msq, ipc_rcu_free);
-   if (msq->q_perm.deleted) {
+   /* raced with RMID? */
+   if (!ipc_valid_object(&msq->q_perm)) {
err = -EIDRM;
goto out_unlock0;
}
@@ -909,7 +910,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, 
long msgtyp, int msgfl
ipc_lock_object(&msq->q_perm);
 
/* raced with RMID? */
-   if (msq->q_perm.deleted) {
+   if (!ipc_valid_object(&msq->q_perm)) {
msg = ERR_PTR(-EIDRM);
goto out_unlock0;
}
diff --git a/ipc/sem.c b/ipc/sem.c
index db9d241..ed0057a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1282,7 +1282,7 @@ static int semctl_setval(struct ipc_namespace *ns, int 
semid, int semnum,
 
sem_lock(sma, NULL, -1);
 
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
return -EIDRM;
@@ -1342,7 +1342,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
int i;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1361,7 +1361,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
 
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1409,7 +1409,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
}
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1435,7 +1435,7 @@ static int semctl_main(struct ipc_namespace *ns, int 
semid, int semnum,
goto out_rcu_wakeup;
 
sem_lock(sma, NULL, -1);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
err = -EIDRM;
goto out_unlock;
}
@@ -1699,7 +1699,7 @@ static struct sem_undo *find_alloc_undo(struct 
ipc_namespace *ns, int semid)
/* step 3: Acquire the lock on semaphore array */
rcu_read_lock();
sem_lock_and_putref(sma);
-   if (sma->sem_perm.deleted) {
+   if (!ipc_valid_object(&sma->sem_perm)) {
sem_unlock(sma, -1);
rcu_read_unlock();
kfree(new);
@@ -1846,7 +1846,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf 
__user *, tsops,
 
error = -EIDRM;
locknum = sem_lock(sma, sops, nsops);
-   if (sma->sem_perm.deleted)
+   if (!ipc_valid_object(&sma->sem_perm))
goto out_unlock_free;
/*
 * semid identifiers are not unique - find_alloc_undo may have
@@ -2068,7 +2068,7 @@ void exit_sem(struct task_struct *tsk)
 
sem_lock(sma, NULL, -1

Re: [PATCH v3 1/6] mm/migrate: add comment about permanent failure path

2013-12-16 Thread Rafael Aquini
On Fri, Dec 13, 2013 at 03:53:26PM +0900, Joonsoo Kim wrote:
> From: Naoya Horiguchi 
> 
> Let's add a comment about where the failed page goes to, which makes
> code more readable.
> 
> Acked-by: Christoph Lameter 
> Reviewed-by: Wanpeng Li 
> Signed-off-by: Naoya Horiguchi 
> Signed-off-by: Joonsoo Kim 
> 
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 3747fcd..c6ac87a 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1123,7 +1123,12 @@ int migrate_pages(struct list_head *from, new_page_t 
> get_new_page,
>   nr_succeeded++;
>   break;
>   default:
> - /* Permanent failure */
> + /*
> +  * Permanent failure (-EBUSY, -ENOSYS, etc.):
> +  * unlike -EAGAIN case, the failed page is
> +  * removed from migration page list and not
> +  * retried in the next outer loop.
> +  */
>   nr_failed++;
>   break;
>   }

Acked-by: Rafael Aquini 



<    1   2   3   4   5   6   7   >