[PATCH 3/4] kconfig: Regenerate parser with current Bison prior to making changes

2015-05-13 Thread Josh Triplett
Before making changes to the parser, regenerate the parser with current
Bison to avoid mixing those changes with those produced by updating
Bison.

Signed-off-by: Josh Triplett 
---
 scripts/kconfig/zconf.tab.c_shipped | 1244 ---
 1 file changed, 561 insertions(+), 683 deletions(-)

diff --git a/scripts/kconfig/zconf.tab.c_shipped 
b/scripts/kconfig/zconf.tab.c_shipped
index de5e84e..0864c7d 100644
--- a/scripts/kconfig/zconf.tab.c_shipped
+++ b/scripts/kconfig/zconf.tab.c_shipped
@@ -1,19 +1,19 @@
-/* A Bison parser, made by GNU Bison 2.5.  */
+/* A Bison parser, made by GNU Bison 3.0.2.  */
 
 /* Bison implementation for Yacc-like parsers in C
-   
-  Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
-   
+
+   Copyright (C) 1984, 1989-1990, 2000-2013 Free Software Foundation, Inc.
+
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
-   
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
-   
+
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
@@ -26,7 +26,7 @@
special exception, which will cause the skeleton and the resulting
Bison output files to be licensed under the GNU General Public
License without this special exception.
-   
+
This special exception was added by the Free Software Foundation in
version 2.2 of Bison.  */
 
@@ -44,7 +44,7 @@
 #define YYBISON 1
 
 /* Bison version.  */
-#define YYBISON_VERSION "2.5"
+#define YYBISON_VERSION "3.0.2"
 
 /* Skeleton name.  */
 #define YYSKELETON_NAME "yacc.c"
@@ -58,18 +58,16 @@
 /* Pull parsers.  */
 #define YYPULL 1
 
-/* Using locations.  */
-#define YYLSP_NEEDED 0
 
 /* Substitute the variable and function names.  */
 #define yyparse zconfparse
 #define yylex   zconflex
 #define yyerror zconferror
-#define yylval  zconflval
-#define yychar  zconfchar
 #define yydebug zconfdebug
 #define yynerrs zconfnerrs
 
+#define yylval  zconflval
+#define yychar  zconfchar
 
 /* Copy the first part of user declarations.  */
 
@@ -108,10 +106,13 @@ static struct menu *current_menu, *current_entry;
 
 
 
-/* Enabling traces.  */
-#ifndef YYDEBUG
-# define YYDEBUG 1
-#endif
+# ifndef YY_NULLPTR
+#  if defined __cplusplus && 201103L <= __cplusplus
+#   define YY_NULLPTR nullptr
+#  else
+#   define YY_NULLPTR 0
+#  endif
+# endif
 
 /* Enabling verbose error messages.  */
 #ifdef YYERROR_VERBOSE
@@ -121,58 +122,60 @@ static struct menu *current_menu, *current_entry;
 # define YYERROR_VERBOSE 0
 #endif
 
-/* Enabling the token table.  */
-#ifndef YYTOKEN_TABLE
-# define YYTOKEN_TABLE 0
-#endif
 
+/* Debug traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 1
+#endif
+#if YYDEBUG
+extern int zconfdebug;
+#endif
 
-/* Tokens.  */
+/* Token type.  */
 #ifndef YYTOKENTYPE
 # define YYTOKENTYPE
-   /* Put the tokens into the symbol table, so that GDB and other debuggers
-  know about them.  */
-   enum yytokentype {
- T_MAINMENU = 258,
- T_MENU = 259,
- T_ENDMENU = 260,
- T_SOURCE = 261,
- T_CHOICE = 262,
- T_ENDCHOICE = 263,
- T_COMMENT = 264,
- T_CONFIG = 265,
- T_MENUCONFIG = 266,
- T_HELP = 267,
- T_HELPTEXT = 268,
- T_IF = 269,
- T_ENDIF = 270,
- T_DEPENDS = 271,
- T_OPTIONAL = 272,
- T_PROMPT = 273,
- T_TYPE = 274,
- T_DEFAULT = 275,
- T_SELECT = 276,
- T_RANGE = 277,
- T_VISIBLE = 278,
- T_OPTION = 279,
- T_ON = 280,
- T_WORD = 281,
- T_WORD_QUOTE = 282,
- T_UNEQUAL = 283,
- T_CLOSE_PAREN = 284,
- T_OPEN_PAREN = 285,
- T_EOL = 286,
- T_OR = 287,
- T_AND = 288,
- T_EQUAL = 289,
- T_NOT = 290
-   };
+  enum yytokentype
+  {
+T_MAINMENU = 258,
+T_MENU = 259,
+T_ENDMENU = 260,
+T_SOURCE = 261,
+T_CHOICE = 262,
+T_ENDCHOICE = 263,
+T_COMMENT = 264,
+T_CONFIG = 265,
+T_MENUCONFIG = 266,
+T_HELP = 267,
+T_HELPTEXT = 268,
+T_IF = 269,
+T_ENDIF = 270,
+T_DEPENDS = 271,
+T_OPTIONAL = 272,
+T_PROMPT = 273,
+T_TYPE = 274,
+T_DEFAULT = 275,
+T_SELECT = 276,
+T_RANGE = 277,
+T_VISIBLE = 278,
+T_OPTION = 279,
+T_ON = 280,
+T_WORD = 281,
+T_WORD_QUOTE = 282,
+T_UNEQUAL = 283,
+T_CLOSE_PAREN = 284,
+T_OPEN_PAREN = 285,
+T_EOL = 286,
+T_OR = 287,
+T_AND = 288,
+T_EQUAL = 289,
+T_NOT = 290
+  };
 #endif
 
-
-
+/* Valu

[PATCH 2/4] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-13 Thread Josh Triplett
The expert menu frequently gets broken by a config item in the middle
that leaves off the "if EXPERT" from its prompt.  This results in the
remainder of the menu spilling out into the parent "General setup" menu.
Move the entire expert menu into a separate Kconfig file,
init/Kconfig.expert, to make this harder to do accidentally, and to
break up the exceedingly long init/Kconfig a bit.

Signed-off-by: Josh Triplett 
---
 init/Kconfig| 232 +---
 init/Kconfig.expert | 231 +++
 2 files changed, 232 insertions(+), 231 deletions(-)
 create mode 100644 init/Kconfig.expert

diff --git a/init/Kconfig b/init/Kconfig
index e2f16f1..a2de3f5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1351,237 +1351,7 @@ config BPF_SYSCALL
  Enable the bpf() system call that allows to manipulate eBPF
  programs and maps via file descriptors.
 
-menuconfig EXPERT
-   bool "Configure standard kernel features (expert users)"
-   # Unhide debug options, to make the on-by-default options visible
-   select DEBUG_KERNEL
-   help
- This option allows certain base kernel options and settings
-  to be disabled or tweaked. This is for specialized
-  environments which can tolerate a "non-standard" kernel.
-  Only use this if you really know what you are doing.
-
-config UID16
-   bool "Enable 16-bit UID system calls" if EXPERT
-   depends on HAVE_UID16 && MULTIUSER
-   default y
-   help
- This enables the legacy 16-bit UID syscall wrappers.
-
-config MULTIUSER
-   bool "Multiple users, groups and capabilities support" if EXPERT
-   default y
-   help
- This option enables support for non-root users, groups and
- capabilities.
-
- If you say N here, all processes will run with UID 0, GID 0, and all
- possible capabilities.  Saying N here also compiles out support for
- system calls related to UIDs, GIDs, and capabilities, such as setuid,
- setgid, and capset.
-
- If unsure, say Y here.
-
-config SGETMASK_SYSCALL
-   bool "sgetmask/ssetmask syscalls support" if EXPERT
-   def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || 
SPARC || CRIS || MICROBLAZE || SUPERH
-   ---help---
- sys_sgetmask and sys_ssetmask are obsolete system calls
- no longer supported in libc but still enabled by default in some
- architectures.
-
- If unsure, leave the default option here.
-
-config SYSFS_SYSCALL
-   bool "Sysfs syscall support" if EXPERT
-   default y
-   ---help---
- sys_sysfs is an obsolete system call no longer supported in libc.
- Note that disabling this option is more secure but might break
- compatibility with some systems.
-
- If unsure say Y here.
-
-config SYSCTL_SYSCALL
-   bool "Sysctl syscall support" if EXPERT
-   depends on PROC_SYSCTL
-   default n
-   select SYSCTL
-   ---help---
- sys_sysctl uses binary paths that have been found challenging
- to properly maintain and use.  The interface in /proc/sys
- using paths with ascii names is now the primary path to this
- information.
-
- Almost nothing using the binary sysctl interface so if you are
- trying to save some space it is probably safe to disable this,
- making your kernel marginally smaller.
-
- If unsure say N here.
-
-config KALLSYMS
-bool "Load all symbols for debugging/ksymoops" if EXPERT
-default y
-help
-  Say Y here to let the kernel print out symbolic crash information and
-  symbolic stack backtraces. This increases the size of the kernel
-  somewhat, as all symbols have to be loaded into the kernel image.
-
-config KALLSYMS_ALL
-   bool "Include all symbols in kallsyms"
-   depends on DEBUG_KERNEL && KALLSYMS
-   help
-  Normally kallsyms only contains the symbols of functions for nicer
-  OOPS messages and backtraces (i.e., symbols from the text and 
inittext
-  sections). This is sufficient for most cases. And only in very rare
-  cases (e.g., when a debugger is used) all symbols are required (e.g.,
-  names of variables from the data sections, etc).
-
-  This option makes sure that all symbols are loaded into the kernel
-  image (i.e., symbols from all sections) in cost of increased kernel
-  size (depending on the kernel configuration, it may be 300KiB or
-  something like this).
-
-  Say N unless you really need all symbols.
-
-config PRINTK
-   default y
-   bool "Enable support for printk" if EXPERT
-   select IRQ_WORK
-   help

[PATCH 1/4] init/Kconfig: Fix break in middle of EXPERT menu

2015-05-13 Thread Josh Triplett
Commit e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y &&
!CONFIG_TRACING kernels, make it more configurable") made BPF_SYSCALL no
longer hidden with !EXPERT, but left it in the middle of the EXPERT
menu.  menuconfig stops putting config items under a submenu once it
encounters an item that doesn't depend on the menu's config item, so
this caused the remainder of the EXPERT menu to spill out into the
containing menu around it.  Fix by moving BPF_SYSCALL before the EXPERT
menu, next to BPF.

Fixes: e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING 
kernels, make it more configurable")
Signed-off-by: Josh Triplett 
---
 init/Kconfig | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index dc24dec..e2f16f1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM
 config BPF
bool
 
+# syscall, maps, verifier
+config BPF_SYSCALL
+   bool "Enable bpf() system call"
+   select ANON_INODES
+   select BPF
+   default n
+   help
+ Enable the bpf() system call that allows to manipulate eBPF
+ programs and maps via file descriptors.
+
 menuconfig EXPERT
bool "Configure standard kernel features (expert users)"
# Unhide debug options, to make the on-by-default options visible
@@ -1535,16 +1545,6 @@ config EVENTFD
 
  If unsure, say Y.
 
-# syscall, maps, verifier
-config BPF_SYSCALL
-   bool "Enable bpf() system call"
-   select ANON_INODES
-   select BPF
-   default n
-   help
- Enable the bpf() system call that allows to manipulate eBPF
- programs and maps via file descriptors.
-
 config SHMEM
bool "Use full shmem filesystem" if EXPERT
default y
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt

2015-05-13 Thread Josh Triplett
On Wed, May 13, 2015 at 09:27:57AM -0700, Josh Triplett wrote:
> How likely is this to get out of date?  Are people going to remember to
> patch this when they add a feature to their architecture?  If
> they found out they had work to do by reading this file, which is the
> goal, then they'll likely remember to edit the file; however, if they
> find the feature and fix it without knowing about the file, will someone
> notice?
> 
> Is there any way we can *generate* this file from Kconfig?  Can we
> extract the necessary "this is possible to enable" or "this arch selects
> this symbol" information from Kconfig, and together with the list of
> symbols for features needing architecture support, generate the table?

Just tried this.  Looks like it's pretty trivial for most of these
features: just make ARCH=thearch allyesconfig, then look for the config
symbol in the result.

For instance, after doing "make ARCH=alpha allyesconfig":

~/src/linux$ grep -x 'CONFIG_ARCH_HAS_ELF_RANDOMIZE=y' .config || echo TODO
TODO

~/src/linux$ grep -x 'CONFIG_GENERIC_CLOCKEVENTS=y' .config || echo TODO
CONFIG_GENERIC_CLOCKEVENTS=y

This seems easy to turn into a shell script or Python script, given a list of
feature descriptions and corresponding config symbols, as well as a list of
architectures.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 2/3] rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU

2015-05-13 Thread Josh Triplett
On Wed, May 13, 2015 at 06:07:28AM -0700, Paul E. McKenney wrote:
> On Tue, May 12, 2015 at 05:59:29PM -0700, j...@joshtriplett.org wrote:
> > On Tue, May 12, 2015 at 03:49:12PM -0700, Paul E. McKenney wrote:
> > > From: "Paul E. McKenney" 
> > > 
> > > Reported-by: "Ahmed, Iftekhar" 
> > > Signed-off-by: Paul E. McKenney 
> > 
> > Could you elaborate a bit more on this patch (ideally in its commit
> > message)?  I see an addition of a command-line parameter to test rcu_bh;
> > is rcu-sched already tested elsewhere by some other config, or does this
> > parameter somehow enable testing both?
> 
> The commit log now reads as follows, does that help?
> 
>   Thanx, Paul
> 
> 
> 
> rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU
> 
> Tiny RCU supports both RCU-sched and RCU-bh, but only RCU-sched is
> currently tested by the rcutorture scripts.  This commit therefore
> changes the TINY02 configuration to test RCU-bh, with TINY01 continuing
> to test RCU-sched.
> 
> This shortcoming of the current rcutorture tests was located by mutation
> testing by Iftekhar.  The idea behind mutation testing is to automatically
> mutate the code under test.  If a given mutant is not caught by testing,
> this is a hint that the testing might need to be improved, as was the
> case here.  Note that this is only a hint because it is possible to mutate
> the code into something else that still works.  For example, a mutation
> that removes (say) a WARN_ON() will not normally result in a test failure.
> 
> This change resulted in the test failure caused by list mishandling,
> which is fixed by the next commit.
> 
> Reported-by: "Ahmed, Iftekhar" 
> Signed-off-by: Paul E. McKenney 
> 
Much better, thanks.  In particular, the information about TINY01 and
TINY02 was not obvious from the patch.
Reviewed-by: Josh Triplett 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt

2015-05-13 Thread Josh Triplett
On Wed, May 13, 2015 at 03:48:42PM +0200, Ingo Molnar wrote:
> From 93f6bd67b4348bf4bf27cbac8ffa9f1def4fa6aa Mon Sep 17 00:00:00 2001
> From: Ingo Molnar 
> Date: Wed, 13 May 2015 10:30:11 +0200
> Subject: [PATCH] Documentation/arch: Add Documentation/arch-features.txt
> 
> Add a support matrix for various generic kernel features that need
> architecture support.
> 
> Signed-off-by: Ingo Molnar 

Could you add a column for the bpf JIT?

Should this file track syscalls not wired up on architectures?

How likely is this to get out of date?  Are people going to remember to
patch this when they add a feature to their architecture?  If
they found out they had work to do by reading this file, which is the
goal, then they'll likely remember to edit the file; however, if they
find the feature and fix it without knowing about the file, will someone
notice?

Is there any way we can *generate* this file from Kconfig?  Can we
extract the necessary "this is possible to enable" or "this arch selects
this symbol" information from Kconfig, and together with the list of
symbols for features needing architecture support, generate the table?

If we can't generate this, then the ASCII-art style and right-aligned
feature names seems *really* likely to produce spurious conflicts,
especially when adding a feature to the list.  Even though it would
produce a much longer file, would you consider dropping the tables and
just having a section per feature?  That also avoids the need to
abbreviate as much.  For instance:

Transparent huge pages
HAVE_ARCH_TRANSPARENT_HUGEPAGE && 64BIT

alpha   TODO
arc ..
arm ok
arm64   ok
avr32   ..
blackfin..
c6x ..
cris..
frv ..
hexagon ..
ia64TODO
m32r..
m68k..
metag   ..
microblaze  ..
mipsok
mn10300 ..
nios2   ..
openrisc..
parisc  TODO
powerpc ok
s390ok
score   ..
sh  ..
sparc   ok
tileTODO
um  ..
unicore32   ..
x86 ok
xtensa  ..

Or alternatively:

Transparent huge pages
HAVE_ARCH_TRANSPARENT_HUGEPAGE && 64BIT

TODO:
alpha
ia64
parisc
tile

Done:
arm
arm64
mips
powerpc
s390
sparc
x86

Not possible:
arc
avr32
blackfin
c6x
cris
frv
hexagon
m32r
m68k
metag
microblaze
mn10300
nios2
openrisc
score
sh
um
unicore32
xtensa


After all, the point of this document isn't to look at, it's to use as a
TODO list and actively edit.  So anything that makes it prettier but
more painful to edit seems like a bug.

>  Documentation/arch-features.txt | 222 
> 
>  1 file changed, 222 insertions(+)
> 
> diff --git a/Documentation/arch-features.txt b/Documentation/arch-features.txt
> new file mode 100644
> index ..4f6430bc552b
> --- /dev/null
> +++ b/Documentation/arch-features.txt
> @@ -0,0 +1,222 @@
> +
> +For generic kernel features that need architecture support, this is
> +the feature support matrix, for all upstream Linux architectures.
> +
> +Meaning of entries in the tables:
> +
> +' ok ': feature supported by the architecture
> +'TODO': feature not yet supported by the architecture
> +' .. ': feature cannot be supported by the hardware
> +
> +
> +
> +lockdep: LOCKDEP_SUPPORT
> + stackprotector: HAVE_CC_STACKPROTECTOR
> +jump labels: HAVE_ARCH_JUMP_LABEL
> + seccomp filter: HAVE_ARCH_SECCOMP_FILTER
> +   context tracking: HAVE_CONTEXT_TRACKING
> +   kgdb: HAVE_ARCH_KGDB
> + modern timekeeping: !ARCH_USES_GETTIMEOFFSET
> +clockevents: GENERIC_CLOCKEVENTS
> +   ELF ASLR: ARCH_HAS_ELF_RANDOMIZE
> +
> +
> lockdep:-.
> + stackprotector:--.  
> |
> +jump labels:---.  |  
> |
> + seccomp filter:.  |  |  
> |
> +   context tracking:-.  |  |  |  
> |
> +   kgdb:--.  |  |  |  |  
> |
> + modern timekeeping:---.  |  |  |  |  |  
> |
> +clockevents:.  |  |  |  |  |  |  
> |
> +   ELF ASLR:-.  |  |  |  |  |  |  |  
> |
> + |  |  |  |  |  |  |  |  
> |
> +--
> +  alpha   | TODO |  ok  |  ok  | TODO | TODO | TODO | TODO | TODO | 
> TODO |
> +  arc | TODO |  ok  |  ok  |  ok  | TODO | TODO | TODO | TODO |  
> ok  |
> +  arm |  ok  |  ok  | TODO |  ok  |  ok  |  ok  |  ok  |  ok  |  
> ok  |
> +  arm64   |  ok  |  ok  

[PATCH 3/4] kconfig: Regenerate parser with current Bison prior to making changes

2015-05-13 Thread Josh Triplett
Before making changes to the parser, regenerate the parser with current
Bison to avoid mixing those changes with those produced by updating
Bison.

Signed-off-by: Josh Triplett j...@joshtriplett.org
---
 scripts/kconfig/zconf.tab.c_shipped | 1244 ---
 1 file changed, 561 insertions(+), 683 deletions(-)

diff --git a/scripts/kconfig/zconf.tab.c_shipped 
b/scripts/kconfig/zconf.tab.c_shipped
index de5e84e..0864c7d 100644
--- a/scripts/kconfig/zconf.tab.c_shipped
+++ b/scripts/kconfig/zconf.tab.c_shipped
@@ -1,19 +1,19 @@
-/* A Bison parser, made by GNU Bison 2.5.  */
+/* A Bison parser, made by GNU Bison 3.0.2.  */
 
 /* Bison implementation for Yacc-like parsers in C
-   
-  Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
-   
+
+   Copyright (C) 1984, 1989-1990, 2000-2013 Free Software Foundation, Inc.
+
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
-   
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
-   
+
You should have received a copy of the GNU General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.  */
 
@@ -26,7 +26,7 @@
special exception, which will cause the skeleton and the resulting
Bison output files to be licensed under the GNU General Public
License without this special exception.
-   
+
This special exception was added by the Free Software Foundation in
version 2.2 of Bison.  */
 
@@ -44,7 +44,7 @@
 #define YYBISON 1
 
 /* Bison version.  */
-#define YYBISON_VERSION 2.5
+#define YYBISON_VERSION 3.0.2
 
 /* Skeleton name.  */
 #define YYSKELETON_NAME yacc.c
@@ -58,18 +58,16 @@
 /* Pull parsers.  */
 #define YYPULL 1
 
-/* Using locations.  */
-#define YYLSP_NEEDED 0
 
 /* Substitute the variable and function names.  */
 #define yyparse zconfparse
 #define yylex   zconflex
 #define yyerror zconferror
-#define yylval  zconflval
-#define yychar  zconfchar
 #define yydebug zconfdebug
 #define yynerrs zconfnerrs
 
+#define yylval  zconflval
+#define yychar  zconfchar
 
 /* Copy the first part of user declarations.  */
 
@@ -108,10 +106,13 @@ static struct menu *current_menu, *current_entry;
 
 
 
-/* Enabling traces.  */
-#ifndef YYDEBUG
-# define YYDEBUG 1
-#endif
+# ifndef YY_NULLPTR
+#  if defined __cplusplus  201103L = __cplusplus
+#   define YY_NULLPTR nullptr
+#  else
+#   define YY_NULLPTR 0
+#  endif
+# endif
 
 /* Enabling verbose error messages.  */
 #ifdef YYERROR_VERBOSE
@@ -121,58 +122,60 @@ static struct menu *current_menu, *current_entry;
 # define YYERROR_VERBOSE 0
 #endif
 
-/* Enabling the token table.  */
-#ifndef YYTOKEN_TABLE
-# define YYTOKEN_TABLE 0
-#endif
 
+/* Debug traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 1
+#endif
+#if YYDEBUG
+extern int zconfdebug;
+#endif
 
-/* Tokens.  */
+/* Token type.  */
 #ifndef YYTOKENTYPE
 # define YYTOKENTYPE
-   /* Put the tokens into the symbol table, so that GDB and other debuggers
-  know about them.  */
-   enum yytokentype {
- T_MAINMENU = 258,
- T_MENU = 259,
- T_ENDMENU = 260,
- T_SOURCE = 261,
- T_CHOICE = 262,
- T_ENDCHOICE = 263,
- T_COMMENT = 264,
- T_CONFIG = 265,
- T_MENUCONFIG = 266,
- T_HELP = 267,
- T_HELPTEXT = 268,
- T_IF = 269,
- T_ENDIF = 270,
- T_DEPENDS = 271,
- T_OPTIONAL = 272,
- T_PROMPT = 273,
- T_TYPE = 274,
- T_DEFAULT = 275,
- T_SELECT = 276,
- T_RANGE = 277,
- T_VISIBLE = 278,
- T_OPTION = 279,
- T_ON = 280,
- T_WORD = 281,
- T_WORD_QUOTE = 282,
- T_UNEQUAL = 283,
- T_CLOSE_PAREN = 284,
- T_OPEN_PAREN = 285,
- T_EOL = 286,
- T_OR = 287,
- T_AND = 288,
- T_EQUAL = 289,
- T_NOT = 290
-   };
+  enum yytokentype
+  {
+T_MAINMENU = 258,
+T_MENU = 259,
+T_ENDMENU = 260,
+T_SOURCE = 261,
+T_CHOICE = 262,
+T_ENDCHOICE = 263,
+T_COMMENT = 264,
+T_CONFIG = 265,
+T_MENUCONFIG = 266,
+T_HELP = 267,
+T_HELPTEXT = 268,
+T_IF = 269,
+T_ENDIF = 270,
+T_DEPENDS = 271,
+T_OPTIONAL = 272,
+T_PROMPT = 273,
+T_TYPE = 274,
+T_DEFAULT = 275,
+T_SELECT = 276,
+T_RANGE = 277,
+T_VISIBLE = 278,
+T_OPTION = 279,
+T_ON = 280,
+T_WORD = 281,
+T_WORD_QUOTE = 282,
+T_UNEQUAL = 283,
+T_CLOSE_PAREN = 284,
+T_OPEN_PAREN = 285,
+T_EOL = 286,
+T_OR = 287,
+T_AND = 288,
+T_EQUAL = 289,
+T_NOT = 290
+  };
 #endif
 
-
-
+/* Value type.  */
 #if ! defined YYSTYPE  ! defined

[PATCH 2/4] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-13 Thread Josh Triplett
The expert menu frequently gets broken by a config item in the middle
that leaves off the if EXPERT from its prompt.  This results in the
remainder of the menu spilling out into the parent General setup menu.
Move the entire expert menu into a separate Kconfig file,
init/Kconfig.expert, to make this harder to do accidentally, and to
break up the exceedingly long init/Kconfig a bit.

Signed-off-by: Josh Triplett j...@joshtriplett.org
---
 init/Kconfig| 232 +---
 init/Kconfig.expert | 231 +++
 2 files changed, 232 insertions(+), 231 deletions(-)
 create mode 100644 init/Kconfig.expert

diff --git a/init/Kconfig b/init/Kconfig
index e2f16f1..a2de3f5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1351,237 +1351,7 @@ config BPF_SYSCALL
  Enable the bpf() system call that allows to manipulate eBPF
  programs and maps via file descriptors.
 
-menuconfig EXPERT
-   bool Configure standard kernel features (expert users)
-   # Unhide debug options, to make the on-by-default options visible
-   select DEBUG_KERNEL
-   help
- This option allows certain base kernel options and settings
-  to be disabled or tweaked. This is for specialized
-  environments which can tolerate a non-standard kernel.
-  Only use this if you really know what you are doing.
-
-config UID16
-   bool Enable 16-bit UID system calls if EXPERT
-   depends on HAVE_UID16  MULTIUSER
-   default y
-   help
- This enables the legacy 16-bit UID syscall wrappers.
-
-config MULTIUSER
-   bool Multiple users, groups and capabilities support if EXPERT
-   default y
-   help
- This option enables support for non-root users, groups and
- capabilities.
-
- If you say N here, all processes will run with UID 0, GID 0, and all
- possible capabilities.  Saying N here also compiles out support for
- system calls related to UIDs, GIDs, and capabilities, such as setuid,
- setgid, and capset.
-
- If unsure, say Y here.
-
-config SGETMASK_SYSCALL
-   bool sgetmask/ssetmask syscalls support if EXPERT
-   def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || 
SPARC || CRIS || MICROBLAZE || SUPERH
-   ---help---
- sys_sgetmask and sys_ssetmask are obsolete system calls
- no longer supported in libc but still enabled by default in some
- architectures.
-
- If unsure, leave the default option here.
-
-config SYSFS_SYSCALL
-   bool Sysfs syscall support if EXPERT
-   default y
-   ---help---
- sys_sysfs is an obsolete system call no longer supported in libc.
- Note that disabling this option is more secure but might break
- compatibility with some systems.
-
- If unsure say Y here.
-
-config SYSCTL_SYSCALL
-   bool Sysctl syscall support if EXPERT
-   depends on PROC_SYSCTL
-   default n
-   select SYSCTL
-   ---help---
- sys_sysctl uses binary paths that have been found challenging
- to properly maintain and use.  The interface in /proc/sys
- using paths with ascii names is now the primary path to this
- information.
-
- Almost nothing using the binary sysctl interface so if you are
- trying to save some space it is probably safe to disable this,
- making your kernel marginally smaller.
-
- If unsure say N here.
-
-config KALLSYMS
-bool Load all symbols for debugging/ksymoops if EXPERT
-default y
-help
-  Say Y here to let the kernel print out symbolic crash information and
-  symbolic stack backtraces. This increases the size of the kernel
-  somewhat, as all symbols have to be loaded into the kernel image.
-
-config KALLSYMS_ALL
-   bool Include all symbols in kallsyms
-   depends on DEBUG_KERNEL  KALLSYMS
-   help
-  Normally kallsyms only contains the symbols of functions for nicer
-  OOPS messages and backtraces (i.e., symbols from the text and 
inittext
-  sections). This is sufficient for most cases. And only in very rare
-  cases (e.g., when a debugger is used) all symbols are required (e.g.,
-  names of variables from the data sections, etc).
-
-  This option makes sure that all symbols are loaded into the kernel
-  image (i.e., symbols from all sections) in cost of increased kernel
-  size (depending on the kernel configuration, it may be 300KiB or
-  something like this).
-
-  Say N unless you really need all symbols.
-
-config PRINTK
-   default y
-   bool Enable support for printk if EXPERT
-   select IRQ_WORK
-   help
- This option enables normal printk support. Removing it
- eliminates most of the message strings from the kernel image

[PATCH 1/4] init/Kconfig: Fix break in middle of EXPERT menu

2015-05-13 Thread Josh Triplett
Commit e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y 
!CONFIG_TRACING kernels, make it more configurable) made BPF_SYSCALL no
longer hidden with !EXPERT, but left it in the middle of the EXPERT
menu.  menuconfig stops putting config items under a submenu once it
encounters an item that doesn't depend on the menu's config item, so
this caused the remainder of the EXPERT menu to spill out into the
containing menu around it.  Fix by moving BPF_SYSCALL before the EXPERT
menu, next to BPF.

Fixes: e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y  !CONFIG_TRACING 
kernels, make it more configurable)
Signed-off-by: Josh Triplett j...@joshtriplett.org
---
 init/Kconfig | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index dc24dec..e2f16f1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM
 config BPF
bool
 
+# syscall, maps, verifier
+config BPF_SYSCALL
+   bool Enable bpf() system call
+   select ANON_INODES
+   select BPF
+   default n
+   help
+ Enable the bpf() system call that allows to manipulate eBPF
+ programs and maps via file descriptors.
+
 menuconfig EXPERT
bool Configure standard kernel features (expert users)
# Unhide debug options, to make the on-by-default options visible
@@ -1535,16 +1545,6 @@ config EVENTFD
 
  If unsure, say Y.
 
-# syscall, maps, verifier
-config BPF_SYSCALL
-   bool Enable bpf() system call
-   select ANON_INODES
-   select BPF
-   default n
-   help
- Enable the bpf() system call that allows to manipulate eBPF
- programs and maps via file descriptors.
-
 config SHMEM
bool Use full shmem filesystem if EXPERT
default y
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] kconfig: Introduce showif to factor out conditions on visibility

2015-05-13 Thread Josh Triplett
kconfig implicitly creates a submenu whenever a series of symbols all
have dependencies or prompt-visibility expressions that all depend on a
preceeding symbol.  For instance, the series of symbols following
menuconfig EXPERT that all have if EXPERT on their prompt will all
appear as a submenu of EXPERT.

However, this implicit submenuing will break if any intervening symbol
loses its if EXPERT; doing so causes the subsequent symbols to appear
in the parent menu (General setup).  This has happened many times, and
it's easy to miss that the entire block should have that same
expression.

For submenus created from depends dependencies, these can be converted
to a single wrapping if expr ... endif block around all the submenu
items.  However, there's no equivalent for invisible items, for which
the prompt is hidden but the symbol may potentially be enabled.  For
instance, many items in the EXPERT menu are hidden if EXPERT is
disabled, but they have default y or are determined by some other
expression.

To handle this case, introduce a new kconfig construct, showif, which
does the same thing as if but for visibility expressions rather than
dependencies.  Every symbol in a showif expr ... endif block
effectively has if expr added to its prompt.

Use that new construct to simplify the expert menu.  Now, making a
symbol no longer invisible when !EXPERT requires moving it out of the
EXPERT menu, which makes it more difficult to break the EXPERT menu.

Signed-off-by: Josh Triplett j...@joshtriplett.org
---
 init/Kconfig.expert  |  44 +--
 scripts/kconfig/menu.c   |   4 +-
 scripts/kconfig/zconf.gperf  |   1 +
 scripts/kconfig/zconf.hash.c_shipped | 258 
 scripts/kconfig/zconf.tab.c_shipped  | 561 +++
 scripts/kconfig/zconf.y  |  28 +-
 6 files changed, 483 insertions(+), 413 deletions(-)

diff --git a/init/Kconfig.expert b/init/Kconfig.expert
index c84a372..fb3c3aa 100644
--- a/init/Kconfig.expert
+++ b/init/Kconfig.expert
@@ -8,15 +8,17 @@ menuconfig EXPERT
   environments which can tolerate a non-standard kernel.
   Only use this if you really know what you are doing.
 
+showif EXPERT
+
 config UID16
-   bool Enable 16-bit UID system calls if EXPERT
+   bool Enable 16-bit UID system calls
depends on HAVE_UID16  MULTIUSER
default y
help
  This enables the legacy 16-bit UID syscall wrappers.
 
 config MULTIUSER
-   bool Multiple users, groups and capabilities support if EXPERT
+   bool Multiple users, groups and capabilities support
default y
help
  This option enables support for non-root users, groups and
@@ -30,7 +32,7 @@ config MULTIUSER
  If unsure, say Y here.
 
 config SGETMASK_SYSCALL
-   bool sgetmask/ssetmask syscalls support if EXPERT
+   bool sgetmask/ssetmask syscalls support
def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || 
SPARC || CRIS || MICROBLAZE || SUPERH
---help---
  sys_sgetmask and sys_ssetmask are obsolete system calls
@@ -40,7 +42,7 @@ config SGETMASK_SYSCALL
  If unsure, leave the default option here.
 
 config SYSFS_SYSCALL
-   bool Sysfs syscall support if EXPERT
+   bool Sysfs syscall support
default y
---help---
  sys_sysfs is an obsolete system call no longer supported in libc.
@@ -50,7 +52,7 @@ config SYSFS_SYSCALL
  If unsure say Y here.
 
 config SYSCTL_SYSCALL
-   bool Sysctl syscall support if EXPERT
+   bool Sysctl syscall support
depends on PROC_SYSCTL
default n
select SYSCTL
@@ -67,7 +69,7 @@ config SYSCTL_SYSCALL
  If unsure say N here.
 
 config KALLSYMS
-bool Load all symbols for debugging/ksymoops if EXPERT
+bool Load all symbols for debugging/ksymoops
 default y
 help
   Say Y here to let the kernel print out symbolic crash information and
@@ -93,7 +95,7 @@ config KALLSYMS_ALL
 
 config PRINTK
default y
-   bool Enable support for printk if EXPERT
+   bool Enable support for printk
select IRQ_WORK
help
  This option enables normal printk support. Removing it
@@ -103,7 +105,7 @@ config PRINTK
  strongly discouraged.
 
 config BUG
-   bool BUG() support if EXPERT
+   bool BUG() support
default y
help
   Disabling this option eliminates support for BUG and WARN, reducing
@@ -115,13 +117,13 @@ config BUG
 config ELF_CORE
depends on COREDUMP
default y
-   bool Enable ELF core dumps if EXPERT
+   bool Enable ELF core dumps
help
  Enable support for generating core dumps. Disabling saves about 4k.
 
 
 config PCSPKR_PLATFORM
-   bool Enable PC-Speaker support if EXPERT
+   bool Enable PC-Speaker support
depends on HAVE_PCSPKR_PLATFORM
select I8253_LOCK
default

Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt

2015-05-13 Thread Josh Triplett
On Wed, May 13, 2015 at 03:48:42PM +0200, Ingo Molnar wrote:
 From 93f6bd67b4348bf4bf27cbac8ffa9f1def4fa6aa Mon Sep 17 00:00:00 2001
 From: Ingo Molnar mi...@kernel.org
 Date: Wed, 13 May 2015 10:30:11 +0200
 Subject: [PATCH] Documentation/arch: Add Documentation/arch-features.txt
 
 Add a support matrix for various generic kernel features that need
 architecture support.
 
 Signed-off-by: Ingo Molnar mi...@kernel.org

Could you add a column for the bpf JIT?

Should this file track syscalls not wired up on architectures?

How likely is this to get out of date?  Are people going to remember to
patch this when they add a feature to their architecture?  If
they found out they had work to do by reading this file, which is the
goal, then they'll likely remember to edit the file; however, if they
find the feature and fix it without knowing about the file, will someone
notice?

Is there any way we can *generate* this file from Kconfig?  Can we
extract the necessary this is possible to enable or this arch selects
this symbol information from Kconfig, and together with the list of
symbols for features needing architecture support, generate the table?

If we can't generate this, then the ASCII-art style and right-aligned
feature names seems *really* likely to produce spurious conflicts,
especially when adding a feature to the list.  Even though it would
produce a much longer file, would you consider dropping the tables and
just having a section per feature?  That also avoids the need to
abbreviate as much.  For instance:

Transparent huge pages
HAVE_ARCH_TRANSPARENT_HUGEPAGE  64BIT

alpha   TODO
arc ..
arm ok
arm64   ok
avr32   ..
blackfin..
c6x ..
cris..
frv ..
hexagon ..
ia64TODO
m32r..
m68k..
metag   ..
microblaze  ..
mipsok
mn10300 ..
nios2   ..
openrisc..
parisc  TODO
powerpc ok
s390ok
score   ..
sh  ..
sparc   ok
tileTODO
um  ..
unicore32   ..
x86 ok
xtensa  ..

Or alternatively:

Transparent huge pages
HAVE_ARCH_TRANSPARENT_HUGEPAGE  64BIT

TODO:
alpha
ia64
parisc
tile

Done:
arm
arm64
mips
powerpc
s390
sparc
x86

Not possible:
arc
avr32
blackfin
c6x
cris
frv
hexagon
m32r
m68k
metag
microblaze
mn10300
nios2
openrisc
score
sh
um
unicore32
xtensa


After all, the point of this document isn't to look at, it's to use as a
TODO list and actively edit.  So anything that makes it prettier but
more painful to edit seems like a bug.

  Documentation/arch-features.txt | 222 
 
  1 file changed, 222 insertions(+)
 
 diff --git a/Documentation/arch-features.txt b/Documentation/arch-features.txt
 new file mode 100644
 index ..4f6430bc552b
 --- /dev/null
 +++ b/Documentation/arch-features.txt
 @@ -0,0 +1,222 @@
 +
 +For generic kernel features that need architecture support, this is
 +the feature support matrix, for all upstream Linux architectures.
 +
 +Meaning of entries in the tables:
 +
 +' ok ': feature supported by the architecture
 +'TODO': feature not yet supported by the architecture
 +' .. ': feature cannot be supported by the hardware
 +
 +
 +
 +lockdep: LOCKDEP_SUPPORT
 + stackprotector: HAVE_CC_STACKPROTECTOR
 +jump labels: HAVE_ARCH_JUMP_LABEL
 + seccomp filter: HAVE_ARCH_SECCOMP_FILTER
 +   context tracking: HAVE_CONTEXT_TRACKING
 +   kgdb: HAVE_ARCH_KGDB
 + modern timekeeping: !ARCH_USES_GETTIMEOFFSET
 +clockevents: GENERIC_CLOCKEVENTS
 +   ELF ASLR: ARCH_HAS_ELF_RANDOMIZE
 +
 +
 lockdep:-.
 + stackprotector:--.  
 |
 +jump labels:---.  |  
 |
 + seccomp filter:.  |  |  
 |
 +   context tracking:-.  |  |  |  
 |
 +   kgdb:--.  |  |  |  |  
 |
 + modern timekeeping:---.  |  |  |  |  |  
 |
 +clockevents:.  |  |  |  |  |  |  
 |
 +   ELF ASLR:-.  |  |  |  |  |  |  |  
 |
 + |  |  |  |  |  |  |  |  
 |
 +--
 +  alpha   | TODO |  ok  |  ok  | TODO | TODO | TODO | TODO | TODO | 
 TODO |
 +  arc | TODO |  ok  |  ok  |  ok  | TODO | TODO | TODO | TODO |  
 ok  |
 +  arm |  ok  |  ok  | TODO |  ok  |  ok  |  ok  |  ok  |  ok  |  
 ok  |
 +  arm64   |  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |  

Re: [PATCH tip/core/rcu 2/3] rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU

2015-05-13 Thread Josh Triplett
On Wed, May 13, 2015 at 06:07:28AM -0700, Paul E. McKenney wrote:
 On Tue, May 12, 2015 at 05:59:29PM -0700, j...@joshtriplett.org wrote:
  On Tue, May 12, 2015 at 03:49:12PM -0700, Paul E. McKenney wrote:
   From: Paul E. McKenney paul...@linux.vnet.ibm.com
   
   Reported-by: Ahmed, Iftekhar ahm...@onid.oregonstate.edu
   Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com
  
  Could you elaborate a bit more on this patch (ideally in its commit
  message)?  I see an addition of a command-line parameter to test rcu_bh;
  is rcu-sched already tested elsewhere by some other config, or does this
  parameter somehow enable testing both?
 
 The commit log now reads as follows, does that help?
 
   Thanx, Paul
 
 
 
 rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU
 
 Tiny RCU supports both RCU-sched and RCU-bh, but only RCU-sched is
 currently tested by the rcutorture scripts.  This commit therefore
 changes the TINY02 configuration to test RCU-bh, with TINY01 continuing
 to test RCU-sched.
 
 This shortcoming of the current rcutorture tests was located by mutation
 testing by Iftekhar.  The idea behind mutation testing is to automatically
 mutate the code under test.  If a given mutant is not caught by testing,
 this is a hint that the testing might need to be improved, as was the
 case here.  Note that this is only a hint because it is possible to mutate
 the code into something else that still works.  For example, a mutation
 that removes (say) a WARN_ON() will not normally result in a test failure.
 
 This change resulted in the test failure caused by list mishandling,
 which is fixed by the next commit.
 
 Reported-by: Ahmed, Iftekhar ahm...@onid.oregonstate.edu
 Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com
 
Much better, thanks.  In particular, the information about TINY01 and
TINY02 was not obvious from the patch.
Reviewed-by: Josh Triplett j...@joshtriplett.org
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt

2015-05-13 Thread Josh Triplett
On Wed, May 13, 2015 at 09:27:57AM -0700, Josh Triplett wrote:
 How likely is this to get out of date?  Are people going to remember to
 patch this when they add a feature to their architecture?  If
 they found out they had work to do by reading this file, which is the
 goal, then they'll likely remember to edit the file; however, if they
 find the feature and fix it without knowing about the file, will someone
 notice?
 
 Is there any way we can *generate* this file from Kconfig?  Can we
 extract the necessary this is possible to enable or this arch selects
 this symbol information from Kconfig, and together with the list of
 symbols for features needing architecture support, generate the table?

Just tried this.  Looks like it's pretty trivial for most of these
features: just make ARCH=thearch allyesconfig, then look for the config
symbol in the result.

For instance, after doing make ARCH=alpha allyesconfig:

~/src/linux$ grep -x 'CONFIG_ARCH_HAS_ELF_RANDOMIZE=y' .config || echo TODO
TODO

~/src/linux$ grep -x 'CONFIG_GENERIC_CLOCKEVENTS=y' .config || echo TODO
CONFIG_GENERIC_CLOCKEVENTS=y

This seems easy to turn into a shell script or Python script, given a list of
feature descriptions and corresponding config symbols, as well as a list of
architectures.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-12 Thread Josh Triplett
On Tue, May 12, 2015 at 10:17:28AM +0200, Ingo Molnar wrote:
> * Josh Triplett  wrote:
> > > Looks good to me, but I have not looked very deeply ...
> > 
> > I sent out a v2 with the co-author information moved from the 
> > signoffs to the commit message.  If it looks reasonable to you, can 
> > you take it through the tip tree please?
> 
> So since this is multi-arch, and changes kernel/fork.c, I'd say -mm is 
> a more appropriate home for it? (Assuming Linus does not object.)

Works for me.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] init/Kconfig: Fix break in middle of EXPERT menu

2015-05-12 Thread Josh Triplett
On Tue, May 12, 2015 at 09:04:55AM +0200, Ingo Molnar wrote:
> 
> * Josh Triplett  wrote:
> 
> > Commit e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y &&
> > !CONFIG_TRACING kernels, make it more configurable") made BPF_SYSCALL no
> > longer hidden with !EXPERT, but left it in the middle of the EXPERT
> > menu.  menuconfig stops putting config items under a submenu once it
> > encounters an item that doesn't depend on the menu's config item, so
> > this caused the remainder of the EXPERT menu to spill out into the
> > containing menu around it.  Fix by moving BPF_SYSCALL before the EXPERT
> > menu, next to BPF.
> > 
> > Fixes: e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING 
> > kernels, make it more configurable")
> > Signed-off-by: Josh Triplett 
> > ---
> > 
> > Ingo, do you want to take this through -tip?  Or should this go 
> > through some other tree?
> 
> I can pick it up, but -mm might be better suited for this, if you do:

Fair enough.

> > I'm also thinking about splitting the entire EXPERT menu into a 
> > separate Kconfig.expert and including it from init/Kconfig, to make 
> > it clear that everything in that menu should only be visible if 
> > EXPERT.  Right now, the long EXPERT menu blends into the longer 
> > init/Kconfig, and issues like this happen every few kernel releases.
> 
> That's a good idea as well.

I did so, in a patch later in this thread.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-12 Thread Josh Triplett
On Tue, May 12, 2015 at 09:56:58AM +0530, Vineet Gupta wrote:
> On Monday 11 May 2015 08:17 PM, Josh Triplett wrote:
> > On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote:
> >> On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote:
> >>> clone with CLONE_SETTLS accepts an argument to set the thread-local
> >>> storage area for the new thread.  sys_clone declares an int argument
> >>> tls_val in the appropriate point in the argument list (based on the
> >>> various CLONE_BACKWARDS variants), but doesn't actually use or pass
> >>> along that argument.  Instead, sys_clone calls do_fork, which calls
> >>> copy_process, which calls the arch-specific copy_thread, and copy_thread
> >>> pulls the corresponding syscall argument out of the pt_regs captured at
> >>> kernel entry (knowing what argument of clone that architecture passes
> >>> tls in).
> >>>
> >>> Apart from being awful and inscrutable, that also only works because
> >>> only one code path into copy_thread can pass the CLONE_SETTLS flag, and
> >>> that code path comes from sys_clone with its architecture-specific
> >>> argument-passing order.  This prevents introducing a new version of the
> >>> clone system call without propagating the same architecture-specific
> >>> position of the tls argument.
> >>>
> >>> However, there's no reason to pull the argument out of pt_regs when
> >>> sys_clone could just pass it down via C function call arguments.
> >>>
> >>> Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
> >>> into, and a new copy_thread_tls that accepts the tls parameter as an
> >>> additional unsigned long (syscall-argument-sized) argument.
> >>> Change sys_clone's tls argument to an unsigned long (which does
> >>> not change the ABI), and pass that down to copy_thread_tls.
> >>>
> >>> Architectures that don't opt into copy_thread_tls will continue to
> >>> ignore the C argument to sys_clone in favor of the pt_regs captured at
> >>> kernel entry, and thus will be unable to introduce new versions of the
> >>> clone syscall.
> >>>
> >>> Signed-off-by: Josh Triplett 
> >>> Signed-off-by: Thiago Macieira 
> >>> Acked-by: Andy Lutomirski 
> >>> ---
> >>>  arch/Kconfig |  7 ++
> >>>  include/linux/sched.h| 14 
> >>>  include/linux/syscalls.h |  6 +++---
> >>>  kernel/fork.c| 55 
> >>> +++-
> >>>  4 files changed, 60 insertions(+), 22 deletions(-)
> >>>
> >>> diff --git a/arch/Kconfig b/arch/Kconfig
> >>> index 05d7a8a..4834a58 100644
> >>> --- a/arch/Kconfig
> >>> +++ b/arch/Kconfig
> >>> @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
> >>> This spares a stack switch and improves cache usage on softirq
> >>> processing.
> >>>  
> >>> +config HAVE_COPY_THREAD_TLS
> >>> + bool
> >>> + help
> >>> +   Architecture provides copy_thread_tls to accept tls argument via
> >>> +   normal C parameter passing, rather than extracting the syscall
> >>> +   argument from pt_regs.
> >>> +
> >>>  #
> >>>  # ABI hall of shame
> >>>  #
> >>> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >>> index a419b65..2cc88c6 100644
> >>> --- a/include/linux/sched.h
> >>> +++ b/include/linux/sched.h
> >>> @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct 
> >>> task_struct *task, unsigned int mode);
> >>>  /* Remove the current tasks stale references to the old mm_struct */
> >>>  extern void mm_release(struct task_struct *, struct mm_struct *);
> >>>  
> >>> +#ifdef CONFIG_HAVE_COPY_THREAD_TLS
> >>> +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
> >>> + struct task_struct *, unsigned long);
> >>> +#else
> >>>  extern int copy_thread(unsigned long, unsigned long, unsigned long,
> >>>   struct task_struct *);
> >>> +
> >>> +/* Architectures that haven't opted into copy_thread_tls get the tls 
> >>> argument
> >>> + * via pt_regs, so ignore the tls argument passed via C. */
> >>> +static inline int 

Re: [PATCH] init/Kconfig: Fix break in middle of EXPERT menu

2015-05-12 Thread Josh Triplett
On Tue, May 12, 2015 at 09:04:55AM +0200, Ingo Molnar wrote:
 
 * Josh Triplett j...@joshtriplett.org wrote:
 
  Commit e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y 
  !CONFIG_TRACING kernels, make it more configurable) made BPF_SYSCALL no
  longer hidden with !EXPERT, but left it in the middle of the EXPERT
  menu.  menuconfig stops putting config items under a submenu once it
  encounters an item that doesn't depend on the menu's config item, so
  this caused the remainder of the EXPERT menu to spill out into the
  containing menu around it.  Fix by moving BPF_SYSCALL before the EXPERT
  menu, next to BPF.
  
  Fixes: e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y  !CONFIG_TRACING 
  kernels, make it more configurable)
  Signed-off-by: Josh Triplett j...@joshtriplett.org
  ---
  
  Ingo, do you want to take this through -tip?  Or should this go 
  through some other tree?
 
 I can pick it up, but -mm might be better suited for this, if you do:

Fair enough.

  I'm also thinking about splitting the entire EXPERT menu into a 
  separate Kconfig.expert and including it from init/Kconfig, to make 
  it clear that everything in that menu should only be visible if 
  EXPERT.  Right now, the long EXPERT menu blends into the longer 
  init/Kconfig, and issues like this happen every few kernel releases.
 
 That's a good idea as well.

I did so, in a patch later in this thread.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-12 Thread Josh Triplett
On Tue, May 12, 2015 at 09:56:58AM +0530, Vineet Gupta wrote:
 On Monday 11 May 2015 08:17 PM, Josh Triplett wrote:
  On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote:
  On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote:
  clone with CLONE_SETTLS accepts an argument to set the thread-local
  storage area for the new thread.  sys_clone declares an int argument
  tls_val in the appropriate point in the argument list (based on the
  various CLONE_BACKWARDS variants), but doesn't actually use or pass
  along that argument.  Instead, sys_clone calls do_fork, which calls
  copy_process, which calls the arch-specific copy_thread, and copy_thread
  pulls the corresponding syscall argument out of the pt_regs captured at
  kernel entry (knowing what argument of clone that architecture passes
  tls in).
 
  Apart from being awful and inscrutable, that also only works because
  only one code path into copy_thread can pass the CLONE_SETTLS flag, and
  that code path comes from sys_clone with its architecture-specific
  argument-passing order.  This prevents introducing a new version of the
  clone system call without propagating the same architecture-specific
  position of the tls argument.
 
  However, there's no reason to pull the argument out of pt_regs when
  sys_clone could just pass it down via C function call arguments.
 
  Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
  into, and a new copy_thread_tls that accepts the tls parameter as an
  additional unsigned long (syscall-argument-sized) argument.
  Change sys_clone's tls argument to an unsigned long (which does
  not change the ABI), and pass that down to copy_thread_tls.
 
  Architectures that don't opt into copy_thread_tls will continue to
  ignore the C argument to sys_clone in favor of the pt_regs captured at
  kernel entry, and thus will be unable to introduce new versions of the
  clone syscall.
 
  Signed-off-by: Josh Triplett j...@joshtriplett.org
  Signed-off-by: Thiago Macieira thiago.macie...@intel.com
  Acked-by: Andy Lutomirski l...@kernel.org
  ---
   arch/Kconfig |  7 ++
   include/linux/sched.h| 14 
   include/linux/syscalls.h |  6 +++---
   kernel/fork.c| 55 
  +++-
   4 files changed, 60 insertions(+), 22 deletions(-)
 
  diff --git a/arch/Kconfig b/arch/Kconfig
  index 05d7a8a..4834a58 100644
  --- a/arch/Kconfig
  +++ b/arch/Kconfig
  @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
  This spares a stack switch and improves cache usage on softirq
  processing.
   
  +config HAVE_COPY_THREAD_TLS
  + bool
  + help
  +   Architecture provides copy_thread_tls to accept tls argument via
  +   normal C parameter passing, rather than extracting the syscall
  +   argument from pt_regs.
  +
   #
   # ABI hall of shame
   #
  diff --git a/include/linux/sched.h b/include/linux/sched.h
  index a419b65..2cc88c6 100644
  --- a/include/linux/sched.h
  +++ b/include/linux/sched.h
  @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct 
  task_struct *task, unsigned int mode);
   /* Remove the current tasks stale references to the old mm_struct */
   extern void mm_release(struct task_struct *, struct mm_struct *);
   
  +#ifdef CONFIG_HAVE_COPY_THREAD_TLS
  +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
  + struct task_struct *, unsigned long);
  +#else
   extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
  +
  +/* Architectures that haven't opted into copy_thread_tls get the tls 
  argument
  + * via pt_regs, so ignore the tls argument passed via C. */
  +static inline int copy_thread_tls(
  + unsigned long clone_flags, unsigned long sp, unsigned long arg,
  + struct task_struct *p, unsigned long tls)
  +{
  + return copy_thread(clone_flags, sp, arg, p);
  +}
  +#endif
 
  Is this detour really needed. Can we not update copy_thread() of all 
  arches in one
  go and add the tls arg, w/o using it.
 
  And then arch maintainers can micro-optimize their code to use that arg vs.
  pt_regs-rxx version at their own leisure. The only downside I see with 
  that is
  bigger churn (touches all arches), and a interim unused arg warning ?
  
  In addition to the cleanup and simplification, the purpose of this patch
  is specifically to make sure that any architecture opting into
  HAVE_COPY_THREAD_TLS does *not* care how tls is passed in, and in
  particular doesn't depend on it arriving in a specific syscall argument.
 
 Sorry for sounding dense, but as I see here, in the end even for non opting
 arches, copy_thread_tls() calling convention expects tls arg passed to it from
 sys_clone call stack, but simply drops it. So that arg is always available, 
 has to
 be, otherwise even the pt_regs approach won't get to it.

That just avoids the need for preprocessor conditionals in the middle

Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-12 Thread Josh Triplett
On Tue, May 12, 2015 at 10:17:28AM +0200, Ingo Molnar wrote:
 * Josh Triplett j...@joshtriplett.org wrote:
   Looks good to me, but I have not looked very deeply ...
  
  I sent out a v2 with the co-author information moved from the 
  signoffs to the commit message.  If it looks reasonable to you, can 
  you take it through the tip tree please?
 
 So since this is multi-arch, and changes kernel/fork.c, I'd say -mm is 
 a more appropriate home for it? (Assuming Linus does not object.)

Works for me.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Tue, May 12, 2015 at 12:36:49AM +0200, Luis R. Rodriguez wrote:
> On Mon, May 11, 2015 at 01:23:02PM -0700, Josh Triplett wrote:
> >  init/Kconfig| 232 
> > +---
> >  init/Kconfig.expert | 231 
> > +++
> >  2 files changed, 232 insertions(+), 231 deletions(-)
> >  create mode 100644 init/Kconfig.expert
> 
> I'm blinded by this patch, can you use git format-patch -M and also add:
> 
> [diff]
>   
> renamelimit=0
> 
> To your .gitconfig

>From a quick test, doing what you suggest doesn't have any effect; if I
also use -C1 instead of -M, then the patch to init/Kconfig looks exactly
the same (deleting 231 lines and adding a "source" line), while the
patch to init/Kconfig.expert goes from adding 231 lines to copying
init/Kconfig and deleting 1800 lines.

That does not seem like an improvement.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Tue, May 12, 2015 at 12:01:27AM +0200, Paul Bolle wrote:
> On Mon, 2015-05-11 at 14:47 -0700, Josh Triplett wrote:
> > On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote:
> > > Is squashing those two lines worth a new kconfig mechanism?
> > 
> > In my opinion, yes.  If you use the implicit (and error-prone)
> > menuconfig submenuing, you get a single entry with the '[ ]' and the
> > submenu.  There are currently 272 instances of "menuconfig" in Kconfig
> > files.
> 
> How many of those use the subtle trick EXPERT uses?

A rough search suggests about 35, with the rest using an "if" right
after the menu (presumably because they want dependencies rather than
visibility).  (A couple of which currently have the same broken-menu
problem today's patch 1 fixed for EXPERT, and a couple of which are
using that trick *across input files*.)

> > I'd like to have a less error-prone mechanism for people to use,
> > with an explicit "endmenu" at the end, and I don't want to leave any
> > incentive for people to need the more error-prone version.
> > 
> > I would be tempted to just make "menuconfig" require an endmenu, and
> > convert all users, but that would almost certainly break many
> > third-party users of kconfig.  So instead, I'm currently extending
> > "menu" (which already expects "endmenu") to allow the syntax
> > "menu config SYMBOL", which acts like a combination of "config SYMBOL"
> > and a menu with "visible if SYMBOL".
> 
> Bikeshedding (before I'm even convinced of the need of this extension):
> "menu config" is far too similar to "menuconfig".

Yeah, agreed; given that people will still end up using "menuconfig" in
light of the above, I'll use another syntax.

> > Diffstat for the patch I'm testing
> > right now:
> > 
> >  scripts/kconfig/zconf.y | 18 ++
> >   1 file changed, 18 insertions(+)
> > 
> > That seems worthwhile to have a less error-prone menu mechanism.
> > 
> > (The actual patch would also need to updated zconf.tab.c_shipped.)
> 
> And some lines in Documentation/kbuild/kconfig-language.txt (speaking
> from memory).

Good point; this does need documentation.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:50:21PM +0200, Paul Bolle wrote:
> On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote:
> > --- /dev/null
> > +++ b/init/Kconfig.expert
> > @@ -0,0 +1,231 @@
> > +menuconfig EXPERT
> > +   bool "Configure standard kernel features (expert users)"
> > +   # Unhide debug options, to make the on-by-default options visible
> > +   select DEBUG_KERNEL
> > +   help
> > + This option allows certain base kernel options and settings
> > +  to be disabled or tweaked. This is for specialized
> > +  environments which can tolerate a "non-standard" kernel.
> > +  Only use this if you really know what you are doing.
> 
> Comment here saying
>   # All entries in this file must have "if EXPERT" after their prompt
> 
> or something to that effect (pending you patch, that is)?

Yeah, I should add such a comment.  I was hoping to make it obsolete via
kconfig changes, but in the interim, sure.

> > +config KALLSYMS
> > +bool "Load all symbols for debugging/ksymoops" if EXPERT
> > +default y
> > +help
> > +  Say Y here to let the kernel print out symbolic crash information and
> > +  symbolic stack backtraces. This increases the size of the kernel
> > +  somewhat, as all symbols have to be loaded into the kernel image.
> > +
> > +config KALLSYMS_ALL
> > +   bool "Include all symbols in kallsyms"
> 
> (For some reason this entry doesn't have if EXPERT but it seems to
> behave as expected. Odd.)

Because it depends on KALLSYMS.  Magic!

> > +   depends on DEBUG_KERNEL && KALLSYMS
> > +   help
> > +  Normally kallsyms only contains the symbols of functions for nicer
> > +  OOPS messages and backtraces (i.e., symbols from the text and 
> > inittext
> > +  sections). This is sufficient for most cases. And only in very rare
> > +  cases (e.g., when a debugger is used) all symbols are required (e.g.,
> > +  names of variables from the data sections, etc).
> > +
> > +  This option makes sure that all symbols are loaded into the kernel
> > +  image (i.e., symbols from all sections) in cost of increased kernel
> > +  size (depending on the kernel configuration, it may be 300KiB or
> > +  something like this).
> > +
> > +  Say N unless you really need all symbols.
> > +
> > +config PRINTK
> > +   default y
> > +   bool "Enable support for printk" if EXPERT
> 
> Now you're touching this: bool [...] as the first line, please.

I'd like the commit moving these to their own file to very obviously
look like it's moving these lines unmodified, rather than making changes
in the process.  If you want this (and all the subsequent instances you
flagged) cleaned up, let's do that as a subsequent patch separate from
the move.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote:
> On Mon, 2015-05-11 at 14:18 -0700, Josh Triplett wrote:
> > However, that would produce *two* entries under the "General setup"
> > menu: a yes/no entry "Configure standard kernel features (expert users)"
> > with no submenu, and a "Some separate menu prompt here" entry with a
> > submenu but no '[ ]' for a yes/no option.  Integrating the two (without
> > using menuconfig's implicit "add stuff to submenu until an option's
> > prompt doesn't depend on this symbol" magic) requires new a kconfig
> > mechanism.
> 
> The diff pasted at the end of this message, which I quickly cobbled
> together an applies on top of this 2/1, generates these two lines in
> menuconfig (for EXPERT = 'y')
> [*] Configure standard kernel features (expert users)
> Standard kernel features  --->   
> 
> Is squashing those two lines worth a new kconfig mechanism?

In my opinion, yes.  If you use the implicit (and error-prone)
menuconfig submenuing, you get a single entry with the '[ ]' and the
submenu.  There are currently 272 instances of "menuconfig" in Kconfig
files.  I'd like to have a less error-prone mechanism for people to use,
with an explicit "endmenu" at the end, and I don't want to leave any
incentive for people to need the more error-prone version.

I would be tempted to just make "menuconfig" require an endmenu, and
convert all users, but that would almost certainly break many
third-party users of kconfig.  So instead, I'm currently extending
"menu" (which already expects "endmenu") to allow the syntax
"menu config SYMBOL", which acts like a combination of "config SYMBOL"
and a menu with "visible if SYMBOL".  Diffstat for the patch I'm testing
right now:

 scripts/kconfig/zconf.y | 18 ++
  1 file changed, 18 insertions(+)

That seems worthwhile to have a less error-prone menu mechanism.

(The actual patch would also need to updated zconf.tab.c_shipped.)

(Also, the diff you posted would be smaller if you left "config EXPERT"
at the top of init/Kconfig.expert; why the move?)

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:01:22PM +0200, Paul Bolle wrote:
> On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote:
> > I'd also like to factor the "if EXPERT" off of all the prompts and into a
> > single scoped item wrapped around all of them, but kconfig doesn't have any 
> > way
> > to do that.  "menuconfig" is just a hint, with no matching "endmenu" and no
> > implicit visibility; "menu" is scoped and has "visible if", but that would
> > create a separate option containing a menu, rather than a menu under 
> > EXPERT's
> > "Configure standard kernel features (expert users)".  And "if EXPERT ... 
> > endif"
> > produces a dependency, not a prompt-visibility condition.  So I think this
> > would require changes to the Kconfig language, to introduce either a scoped
> > "visible if EXPERT ... endvisible" or similar, or a scoped version of
> > menuconfig with a matching "endmenu" and implicit visibility (effectively a
> > "menu" statement with attached "config" rather than a "config" with a hint
> > "this might be a menu").  I'm leaning towards the latter.
> 
> The behavior of menuconfig in this case is rather subtle. I must admit I
> never noticed it.
> 
> The "visible" option to menus is little used, and I'm not really
> familiar with it. So, for what it's worth: would adding a new menu with
>   visible if EXPERT
> 
> attached to it, and putting all current
>   prompt "Foo" if EXPERT
> 
> entries in that menu roughly do what you want?

Not quite.  menu has a prompt but no associated symbol, so with current
kconfig that would look like:

config EXPERT
bool "Configure standard kernel features (expert users)"
...

menu "Some separate menu prompt here"
visible if EXPERT

... various options that currently have if EXPERT on their prompts ...

endmenu


However, that would produce *two* entries under the "General setup"
menu: a yes/no entry "Configure standard kernel features (expert users)"
with no submenu, and a "Some separate menu prompt here" entry with a
submenu but no '[ ]' for a yes/no option.  Integrating the two (without
using menuconfig's implicit "add stuff to submenu until an option's
prompt doesn't depend on this symbol" magic) requires new a kconfig
mechanism.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
The expert menu frequently gets broken by a config item in the middle
that leaves off the "if EXPERT" from its prompt.  This results in the
remainder of the menu spilling out into the parent "General setup" menu.
Move the entire expert menu into a separate Kconfig file,
init/Kconfig.expert, to make this harder to do accidentally, and to
break up the exceedingly long init/Kconfig a bit.

Signed-off-by: Josh Triplett 
---

This applies on top of "init/Kconfig: Fix break in middle of EXPERT
menu".  Please apply both.

I'd also like to factor the "if EXPERT" off of all the prompts and into a
single scoped item wrapped around all of them, but kconfig doesn't have any way
to do that.  "menuconfig" is just a hint, with no matching "endmenu" and no
implicit visibility; "menu" is scoped and has "visible if", but that would
create a separate option containing a menu, rather than a menu under EXPERT's
"Configure standard kernel features (expert users)".  And "if EXPERT ... endif"
produces a dependency, not a prompt-visibility condition.  So I think this
would require changes to the Kconfig language, to introduce either a scoped
"visible if EXPERT ... endvisible" or similar, or a scoped version of
menuconfig with a matching "endmenu" and implicit visibility (effectively a
"menu" statement with attached "config" rather than a "config" with a hint
"this might be a menu").  I'm leaning towards the latter.

So I'll send a followup patch enhancing kconfig to improve this case,
but I think splitting this into a separate file is still worth it even
without that.

 init/Kconfig| 232 +---
 init/Kconfig.expert | 231 +++
 2 files changed, 232 insertions(+), 231 deletions(-)
 create mode 100644 init/Kconfig.expert

diff --git a/init/Kconfig b/init/Kconfig
index e2f16f1..a2de3f5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1351,237 +1351,7 @@ config BPF_SYSCALL
  Enable the bpf() system call that allows to manipulate eBPF
  programs and maps via file descriptors.
 
-menuconfig EXPERT
-   bool "Configure standard kernel features (expert users)"
-   # Unhide debug options, to make the on-by-default options visible
-   select DEBUG_KERNEL
-   help
- This option allows certain base kernel options and settings
-  to be disabled or tweaked. This is for specialized
-  environments which can tolerate a "non-standard" kernel.
-  Only use this if you really know what you are doing.
-
-config UID16
-   bool "Enable 16-bit UID system calls" if EXPERT
-   depends on HAVE_UID16 && MULTIUSER
-   default y
-   help
- This enables the legacy 16-bit UID syscall wrappers.
-
-config MULTIUSER
-   bool "Multiple users, groups and capabilities support" if EXPERT
-   default y
-   help
- This option enables support for non-root users, groups and
- capabilities.
-
- If you say N here, all processes will run with UID 0, GID 0, and all
- possible capabilities.  Saying N here also compiles out support for
- system calls related to UIDs, GIDs, and capabilities, such as setuid,
- setgid, and capset.
-
- If unsure, say Y here.
-
-config SGETMASK_SYSCALL
-   bool "sgetmask/ssetmask syscalls support" if EXPERT
-   def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || 
SPARC || CRIS || MICROBLAZE || SUPERH
-   ---help---
- sys_sgetmask and sys_ssetmask are obsolete system calls
- no longer supported in libc but still enabled by default in some
- architectures.
-
- If unsure, leave the default option here.
-
-config SYSFS_SYSCALL
-   bool "Sysfs syscall support" if EXPERT
-   default y
-   ---help---
- sys_sysfs is an obsolete system call no longer supported in libc.
- Note that disabling this option is more secure but might break
- compatibility with some systems.
-
- If unsure say Y here.
-
-config SYSCTL_SYSCALL
-   bool "Sysctl syscall support" if EXPERT
-   depends on PROC_SYSCTL
-   default n
-   select SYSCTL
-   ---help---
- sys_sysctl uses binary paths that have been found challenging
- to properly maintain and use.  The interface in /proc/sys
- using paths with ascii names is now the primary path to this
- information.
-
- Almost nothing using the binary sysctl interface so if you are
- trying to save some space it is probably safe to disable this,
- making your kernel marginally smaller.
-
- If unsure say N here.
-
-config KALLSYMS
-bool "Load all symbols 

[PATCHv2 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

2015-05-11 Thread Josh Triplett
For 32-bit userspace on a 64-bit kernel, this requires modifying
stub32_clone to actually swap the appropriate arguments to match
CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls
broken.

Patch co-authored by Josh Triplett and Thiago Macieira.

Signed-off-by: Josh Triplett 
Acked-by: Andy Lutomirski 
---
 arch/x86/Kconfig | 1 +
 arch/x86/ia32/ia32entry.S| 2 +-
 arch/x86/kernel/process_32.c | 6 +++---
 arch/x86/kernel/process_64.c | 8 
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..4960b0d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,6 +124,7 @@ config X86
select MODULES_USE_ELF_REL if X86_32
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
+   select HAVE_COPY_THREAD_TLS
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebca..0286735 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -487,7 +487,7 @@ GLOBAL(\label)
ALIGN
 GLOBAL(stub32_clone)
leaq sys_clone(%rip),%rax
-   mov %r8, %rcx
+   xchg %r8, %rcx
jmp  ia32_ptregs_common 
 
ALIGN
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f9..ead28ff 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 */
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs->si, 0);
+   (struct user_desc __user *)tls, 0);
 
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43..c69cabc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int 
tls)
return get_desc_base(>thread.tls_array[tls]);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
int err;
struct pt_regs *childregs;
@@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp,
 #ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs->si, 0);
+   (struct user_desc __user *)tls, 0);
else
 #endif
-   err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+   err = do_arch_prctl(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 04:00:43PM +0200, Ingo Molnar wrote:
> 
> * Josh Triplett  wrote:
> 
> > On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote:
> > > 
> > > * j...@joshtriplett.org  wrote:
> > > 
> > > > On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote:
> > > > > On Tue, 21 Apr 2015, Josh Triplett wrote:
> > > > > > 
> > > > > > Signed-off-by: Josh Triplett 
> > > > > > Signed-off-by: Thiago Macieira 
> > > > > 
> > > > > Can you please clarify that SOB chain? It does not make any sense.
> > > > 
> > > > Co-authored patch.  We both worked on it together, and sadly git 
> > > > doesn't support a commit with multiple authors, so this is the next 
> > > > best thing.
> > > 
> > > No, this is not a valid SOB chain.
> > > 
> > > For 'co authored' patches you can add credits either to the file, 
> > > as two copyright lines, or via the changelog, no need to mess up 
> > > the SOB chain for that.
> > 
> > Fine, I'll write it another way.
> > 
> > Do you see any other issues with the patch other than the signoffs?
> 
> Looks good to me, but I have not looked very deeply ...

I sent out a v2 with the co-author information moved from the signoffs
to the commit message.  If it looks reasonable to you, can you take it
through the tip tree please?

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread.  sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass
along that argument.  Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes
tls in).

Apart from being awful and inscrutable, that also only works because
only one code path into copy_thread can pass the CLONE_SETTLS flag, and
that code path comes from sys_clone with its architecture-specific
argument-passing order.  This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.

However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.

Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
into, and a new copy_thread_tls that accepts the tls parameter as an
additional unsigned long (syscall-argument-sized) argument.
Change sys_clone's tls argument to an unsigned long (which does
not change the ABI), and pass that down to copy_thread_tls.

Architectures that don't opt into copy_thread_tls will continue to
ignore the C argument to sys_clone in favor of the pt_regs captured at
kernel entry, and thus will be unable to introduce new versions of the
clone syscall.

Patch co-authored by Josh Triplett and Thiago Macieira.

Signed-off-by: Josh Triplett 
Acked-by: Andy Lutomirski 
---
 arch/Kconfig |  7 ++
 include/linux/sched.h| 14 
 include/linux/syscalls.h |  6 +++---
 kernel/fork.c| 55 +++-
 4 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a..4834a58 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
  This spares a stack switch and improves cache usage on softirq
  processing.
 
+config HAVE_COPY_THREAD_TLS
+   bool
+   help
+ Architecture provides copy_thread_tls to accept tls argument via
+ normal C parameter passing, rather than extracting the syscall
+ argument from pt_regs.
+
 #
 # ABI hall of shame
 #
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a419b65..2cc88c6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct 
*task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
 
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+   struct task_struct *, unsigned long);
+#else
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+   unsigned long clone_flags, unsigned long sp, unsigned long arg,
+   struct task_struct *p, unsigned long tls)
+{
+   return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
 extern void flush_thread(void);
 extern void exit_thread(void);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38..bb51bec 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_fork(void);
 asmlinkage long sys_vfork(void);
 #ifdef CONFIG_CLONE_BACKWARDS
-asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
+asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned 
long,
   int __user *);
 #else
 #ifdef CONFIG_CLONE_BACKWARDS3
 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
- int __user *, int);
+ int __user *, unsigned long);
 #else
 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
-  int __user *, int);
+  int __user *, unsigned long);
 #endif
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..b3dadf4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
-   int trace

[PATCHv2 0/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
clone has some of the quirkiest syscall handling in the kernel, with a pile of
special cases, historical curiosities, and architecture-specific calling
conventions.  In particular, clone with CLONE_SETTLS accepts a parameter "tls"
that the C entry point completely ignores and some assembly entry points
overwrite; instead, the low-level arch-specific code pulls the tls parameter
out of the arch-specific register captured as part of pt_regs on entry to the
kernel.  That's a massive hack, and it makes the arch-specific code only work
when called via the specific existing syscall entry points; because of this
hack, any new clone-like system call would have to accept an identical tls
argument in exactly the same arch-specific position, rather than providing a
unified system call entry point across architectures.

The first patch allows architectures to handle the tls argument via normal C
parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS.  The
second patch makes 32-bit and 64-bit x86 opt into this.

These two patches came out of the clone4 series, which isn't ready for this
merge window, but these first two cleanup patches were entirely uncontroversial
and have acks.  I'd like to go ahead and submit these two so that other
architectures can begin building on top of this and opting into
HAVE_COPY_THREAD_TLS.  However, I'm also happy to wait and send these through
the next merge window (along with v3 of clone4) if anyone would prefer that.

v2: Move co-author from signoffs to a note in the commit message, as
required by Ingo Molnar.

Josh Triplett (2):
  clone: Support passing tls argument via C rather than pt_regs magic
  x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

 arch/Kconfig |  7 ++
 arch/x86/Kconfig |  1 +
 arch/x86/ia32/ia32entry.S|  2 +-
 arch/x86/kernel/process_32.c |  6 ++---
 arch/x86/kernel/process_64.c |  8 +++
 include/linux/sched.h| 14 +++
 include/linux/syscalls.h |  6 ++---
 kernel/fork.c| 55 +---
 8 files changed, 69 insertions(+), 30 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] force inlining of spinlock ops

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 07:57:22PM +0200, Denys Vlasenko wrote:
> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:
> 
> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort 
> -rn | grep ' spin'
> 473 000b t spin_unlock_irqrestore
> 292 000b t spin_unlock
> 215 000b t spin_lock
> 134 000b t spin_unlock_irq
> 130 000b t spin_unlock_bh
> 120 000b t spin_lock_irq
> 106 000b t spin_lock_bh
> 
> Disassembly:
> 
> 81004720 :
> 81004720:   55  push   %rbp
> 81004721:   48 89 e5mov%rsp,%rbp
> 81004724:   e8 f8 4e e2 02  callq  <_raw_spin_lock>
> 81004729:   5d  pop%rbp
> 8100472a:   c3  retq

Frame pointers make this even more awful, since without them this could
just become a single jmp.  (Assuming _raw_spin_lock shouldn't be
inlined too.)

> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
> This decreases vmlinux by about 30k:
> 
> text data  bss   dec hex filename
> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
> 82335059 22255416 20627456 125217931 776ac8b vmlinux

Nice improvement.  Given that this actually makes the kernel *smaller*,
presumably in addition to faster, this forced inlining seems completely
reasonable.

> Signed-off-by: Denys Vlasenko 
> Cc: Thomas Graf 
> Cc: David S. Miller 
> Cc: Bart Van Assche 
> Cc: Peter Zijlstra 
> Cc: David Rientjes 
> Cc: David S. Miller 
> Cc: Andrew Morton 
> Cc: Linus Torvalds 
> Cc: Oleg Nesterov 
> Cc: Paul E. McKenney 
> Cc: Ingo Molnar 
> Cc: Paul E. McKenney 
> CC: linux-kernel@vger.kernel.org

Reviewed-by: Josh Triplett 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] init/Kconfig: Fix break in middle of EXPERT menu

2015-05-11 Thread Josh Triplett
Commit e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y &&
!CONFIG_TRACING kernels, make it more configurable") made BPF_SYSCALL no
longer hidden with !EXPERT, but left it in the middle of the EXPERT
menu.  menuconfig stops putting config items under a submenu once it
encounters an item that doesn't depend on the menu's config item, so
this caused the remainder of the EXPERT menu to spill out into the
containing menu around it.  Fix by moving BPF_SYSCALL before the EXPERT
menu, next to BPF.

Fixes: e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING 
kernels, make it more configurable")
Signed-off-by: Josh Triplett 
---

Ingo, do you want to take this through -tip?  Or should this go through some
other tree?

I'm also thinking about splitting the entire EXPERT menu into a separate
Kconfig.expert and including it from init/Kconfig, to make it clear that
everything in that menu should only be visible if EXPERT.  Right now, the long
EXPERT menu blends into the longer init/Kconfig, and issues like this happen
every few kernel releases.

 init/Kconfig | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index dc24dec..e2f16f1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM
 config BPF
bool
 
+# syscall, maps, verifier
+config BPF_SYSCALL
+   bool "Enable bpf() system call"
+   select ANON_INODES
+   select BPF
+   default n
+   help
+ Enable the bpf() system call that allows to manipulate eBPF
+ programs and maps via file descriptors.
+
 menuconfig EXPERT
bool "Configure standard kernel features (expert users)"
# Unhide debug options, to make the on-by-default options visible
@@ -1535,16 +1545,6 @@ config EVENTFD
 
  If unsure, say Y.
 
-# syscall, maps, verifier
-config BPF_SYSCALL
-   bool "Enable bpf() system call"
-   select ANON_INODES
-   select BPF
-   default n
-   help
- Enable the bpf() system call that allows to manipulate eBPF
- programs and maps via file descriptors.
-
 config SHMEM
bool "Use full shmem filesystem" if EXPERT
default y
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V4] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 06:12:27PM +0200, Nicholas Mc Guire wrote:
> rcu_cpu_has_callbacks() is declared int. The current declaration was 
> introduced
> in commit c0f4dfd4f90f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered
> callbacks). But it is actually returning bool and as the function description 
> states " * Return true if the specified CPU has any callback", this 
> probably 
> should be a bool as all (3) call-sites currently treat it as bool.
> 
> Type-checking coccinelle spatches are being used to locate type mismatches
> between function signatures and return values in this case this produced:
> ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
> int != bool, 
> 
> Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)
> 
> Patch is against 4.1-rc3 (localversion-next is -next-20150511) and fixes
> 
> Signed-off-by: Nicholas Mc Guire 

Seems like a reasonable addition.

Reviewed-by: Josh Triplett 

> ---
> 
> V4: fix-up to include the origin of the issue being fixed as requeseted by
> Steven Rostedt .
> 
>  kernel/rcu/tree.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index bcc5943..599550c 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3516,7 +3516,7 @@ static int rcu_pending(void)
>   * non-NULL, store an indication of whether all callbacks are lazy.
>   * (If there are no callbacks, all of them are deemed to be lazy.)
>   */
> -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
> +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
>  {
>   bool al = true;
>   bool hc = false;
> -- 
> 1.7.10.4
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 05:46:12PM +0200, Nicholas Mc Guire wrote:
> rcu_cpu_has_callbacks() is declared int but is actually returning bool
> and as the function description states " * Return true if the specified 
> CPU has any callback", this probably should be a bool. All (3) 
> call-sites currently treat it as bool.
> 
> Type-checking coccinelle spatches are being used to locate type mismatches
> between function signatures and return values in this case this produced:
> ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
> int != bool, 
> 
> Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)
> 
> Patch is against 4.1-rc3 (localversion-next is -next-20150511)
> 
> Signed-off-by: Nicholas Mc Guire 

Reviewed-by: Josh Triplett 

Thanks!

> ---
> 
> V3: fix-up of commit message again (hope I got it right this time) as 
> requested by Josh Triplett /Steven Rostedt
> 
> 
>  kernel/rcu/tree.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index bcc5943..599550c 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3516,7 +3516,7 @@ static int rcu_pending(void)
>   * non-NULL, store an indication of whether all callbacks are lazy.
>   * (If there are no callbacks, all of them are deemed to be lazy.)
>   */
> -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
> +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
>  {
>   bool al = true;
>   bool hc = false;
> -- 
> 1.7.10.4
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:28:30AM -0400, Steven Rostedt wrote:
> On Mon, 11 May 2015 17:10:59 +0200
> Nicholas Mc Guire  wrote:
> 
> > rcu_cpu_has_callbacks() is declared int but is actually returning bool and
> > and as the function description states " * Return true if the specified 
> > CPU has any callback", this probably should be a bool. All (3) 
> > call-sites currently treat it as bool so the declaration.
> > 
> > 
> > Signed-off-by: Nicholas Mc Guire 
> > ---
> > 
> > V2: fixed up commit message and tool infos as requested by 
> > Josh Triplett 
> > 
> > Type-checking coccinelle spatches are being used to locate type 
> > mismatches between function signatures and return values. 
> > ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
> > int != bool, 
> > 
> > Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)
> > 
> > Patch is against 4.1-rc3 (localversion-next is -next-20150511)
> 
> I think what Josh was saying is that all the above except for the "V2"
> should be above the signature. Everything between the "---" and the
> patch gets tossed out when committed into git.
> 
> Giving credit to coccinelle and even what branch and config was used
> for testing is something we want in the git change log history.

Yes, exactly.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote:
> On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote:
> > clone with CLONE_SETTLS accepts an argument to set the thread-local
> > storage area for the new thread.  sys_clone declares an int argument
> > tls_val in the appropriate point in the argument list (based on the
> > various CLONE_BACKWARDS variants), but doesn't actually use or pass
> > along that argument.  Instead, sys_clone calls do_fork, which calls
> > copy_process, which calls the arch-specific copy_thread, and copy_thread
> > pulls the corresponding syscall argument out of the pt_regs captured at
> > kernel entry (knowing what argument of clone that architecture passes
> > tls in).
> > 
> > Apart from being awful and inscrutable, that also only works because
> > only one code path into copy_thread can pass the CLONE_SETTLS flag, and
> > that code path comes from sys_clone with its architecture-specific
> > argument-passing order.  This prevents introducing a new version of the
> > clone system call without propagating the same architecture-specific
> > position of the tls argument.
> > 
> > However, there's no reason to pull the argument out of pt_regs when
> > sys_clone could just pass it down via C function call arguments.
> > 
> > Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
> > into, and a new copy_thread_tls that accepts the tls parameter as an
> > additional unsigned long (syscall-argument-sized) argument.
> > Change sys_clone's tls argument to an unsigned long (which does
> > not change the ABI), and pass that down to copy_thread_tls.
> > 
> > Architectures that don't opt into copy_thread_tls will continue to
> > ignore the C argument to sys_clone in favor of the pt_regs captured at
> > kernel entry, and thus will be unable to introduce new versions of the
> > clone syscall.
> > 
> > Signed-off-by: Josh Triplett 
> > Signed-off-by: Thiago Macieira 
> > Acked-by: Andy Lutomirski 
> > ---
> >  arch/Kconfig |  7 ++
> >  include/linux/sched.h| 14 
> >  include/linux/syscalls.h |  6 +++---
> >  kernel/fork.c| 55 
> > +++-
> >  4 files changed, 60 insertions(+), 22 deletions(-)
> > 
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index 05d7a8a..4834a58 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
> >   This spares a stack switch and improves cache usage on softirq
> >   processing.
> >  
> > +config HAVE_COPY_THREAD_TLS
> > +   bool
> > +   help
> > + Architecture provides copy_thread_tls to accept tls argument via
> > + normal C parameter passing, rather than extracting the syscall
> > + argument from pt_regs.
> > +
> >  #
> >  # ABI hall of shame
> >  #
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index a419b65..2cc88c6 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct 
> > task_struct *task, unsigned int mode);
> >  /* Remove the current tasks stale references to the old mm_struct */
> >  extern void mm_release(struct task_struct *, struct mm_struct *);
> >  
> > +#ifdef CONFIG_HAVE_COPY_THREAD_TLS
> > +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
> > +   struct task_struct *, unsigned long);
> > +#else
> >  extern int copy_thread(unsigned long, unsigned long, unsigned long,
> > struct task_struct *);
> > +
> > +/* Architectures that haven't opted into copy_thread_tls get the tls 
> > argument
> > + * via pt_regs, so ignore the tls argument passed via C. */
> > +static inline int copy_thread_tls(
> > +   unsigned long clone_flags, unsigned long sp, unsigned long arg,
> > +   struct task_struct *p, unsigned long tls)
> > +{
> > +   return copy_thread(clone_flags, sp, arg, p);
> > +}
> > +#endif
> 
> Is this detour really needed. Can we not update copy_thread() of all arches 
> in one
> go and add the tls arg, w/o using it.
> 
> And then arch maintainers can micro-optimize their code to use that arg vs.
> pt_regs->rxx version at their own leisure. The only downside I see with that 
> is
> bigger churn (touches all arches), and a interim unused arg warning ?

In addition to the cleanup and simplification, the purpose of this patch
is specifically to ma

Re: [PATCH] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 10:51:55AM +0200, Nicholas Mc Guire wrote:
> rcu_cpu_has_callbacks() is declared int but is actually returning bool and
> all call-sites currently use it as bool so the declaration should be bool 
> as well.
> 
> Signed-off-by: Nicholas Mc Guire 

The patch seems reasonable to me.  However...

> ---
> 
> ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
> int != bool, 
> 
> as the description of rcu_cpu_has_callbacks() states:
> " * Return true if the specified CPU has any callback"
> this probably should be a bool 
> All (3) call sites are conditions and are treating it as boolean.
> 
> Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)

...some of this information should be in the commit message, as well as
a description of what tool produced this warning.

With that changed,
Reviewed-by: Josh Triplett 

> Patch is against 4.1-rc3 (localversion-next is -next-20150511)
> 
>  kernel/rcu/tree.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index bcc5943..599550c 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3516,7 +3516,7 @@ static int rcu_pending(void)
>   * non-NULL, store an indication of whether all callbacks are lazy.
>   * (If there are no callbacks, all of them are deemed to be lazy.)
>   */
> -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
> +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
>  {
>   bool al = true;
>   bool hc = false;
> -- 
> 1.7.10.4
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote:
> 
> * j...@joshtriplett.org  wrote:
> 
> > On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote:
> > > On Tue, 21 Apr 2015, Josh Triplett wrote:
> > > > 
> > > > Signed-off-by: Josh Triplett 
> > > > Signed-off-by: Thiago Macieira 
> > > 
> > > Can you please clarify that SOB chain? It does not make any sense.
> > 
> > Co-authored patch.  We both worked on it together, and sadly git 
> > doesn't support a commit with multiple authors, so this is the next 
> > best thing.
> 
> No, this is not a valid SOB chain.
> 
> For 'co authored' patches you can add credits either to the file, as 
> two copyright lines, or via the changelog, no need to mess up the SOB 
> chain for that.

Fine, I'll write it another way.

Do you see any other issues with the patch other than the signoffs?

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote:
 
 * j...@joshtriplett.org j...@joshtriplett.org wrote:
 
  On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote:
   On Tue, 21 Apr 2015, Josh Triplett wrote:

Signed-off-by: Josh Triplett j...@joshtriplett.org
Signed-off-by: Thiago Macieira thiago.macie...@intel.com
   
   Can you please clarify that SOB chain? It does not make any sense.
  
  Co-authored patch.  We both worked on it together, and sadly git 
  doesn't support a commit with multiple authors, so this is the next 
  best thing.
 
 No, this is not a valid SOB chain.
 
 For 'co authored' patches you can add credits either to the file, as 
 two copyright lines, or via the changelog, no need to mess up the SOB 
 chain for that.

Fine, I'll write it another way.

Do you see any other issues with the patch other than the signoffs?

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 10:51:55AM +0200, Nicholas Mc Guire wrote:
 rcu_cpu_has_callbacks() is declared int but is actually returning bool and
 all call-sites currently use it as bool so the declaration should be bool 
 as well.
 
 Signed-off-by: Nicholas Mc Guire hof...@osadl.org

The patch seems reasonable to me.  However...

 ---
 
 ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
 int != bool, 
 
 as the description of rcu_cpu_has_callbacks() states:
  * Return true if the specified CPU has any callback
 this probably should be a bool 
 All (3) call sites are conditions and are treating it as boolean.
 
 Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)

...some of this information should be in the commit message, as well as
a description of what tool produced this warning.

With that changed,
Reviewed-by: Josh Triplett j...@joshtriplett.org

 Patch is against 4.1-rc3 (localversion-next is -next-20150511)
 
  kernel/rcu/tree.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
 index bcc5943..599550c 100644
 --- a/kernel/rcu/tree.c
 +++ b/kernel/rcu/tree.c
 @@ -3516,7 +3516,7 @@ static int rcu_pending(void)
   * non-NULL, store an indication of whether all callbacks are lazy.
   * (If there are no callbacks, all of them are deemed to be lazy.)
   */
 -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
 +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
  {
   bool al = true;
   bool hc = false;
 -- 
 1.7.10.4
 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote:
 On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote:
  clone with CLONE_SETTLS accepts an argument to set the thread-local
  storage area for the new thread.  sys_clone declares an int argument
  tls_val in the appropriate point in the argument list (based on the
  various CLONE_BACKWARDS variants), but doesn't actually use or pass
  along that argument.  Instead, sys_clone calls do_fork, which calls
  copy_process, which calls the arch-specific copy_thread, and copy_thread
  pulls the corresponding syscall argument out of the pt_regs captured at
  kernel entry (knowing what argument of clone that architecture passes
  tls in).
  
  Apart from being awful and inscrutable, that also only works because
  only one code path into copy_thread can pass the CLONE_SETTLS flag, and
  that code path comes from sys_clone with its architecture-specific
  argument-passing order.  This prevents introducing a new version of the
  clone system call without propagating the same architecture-specific
  position of the tls argument.
  
  However, there's no reason to pull the argument out of pt_regs when
  sys_clone could just pass it down via C function call arguments.
  
  Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
  into, and a new copy_thread_tls that accepts the tls parameter as an
  additional unsigned long (syscall-argument-sized) argument.
  Change sys_clone's tls argument to an unsigned long (which does
  not change the ABI), and pass that down to copy_thread_tls.
  
  Architectures that don't opt into copy_thread_tls will continue to
  ignore the C argument to sys_clone in favor of the pt_regs captured at
  kernel entry, and thus will be unable to introduce new versions of the
  clone syscall.
  
  Signed-off-by: Josh Triplett j...@joshtriplett.org
  Signed-off-by: Thiago Macieira thiago.macie...@intel.com
  Acked-by: Andy Lutomirski l...@kernel.org
  ---
   arch/Kconfig |  7 ++
   include/linux/sched.h| 14 
   include/linux/syscalls.h |  6 +++---
   kernel/fork.c| 55 
  +++-
   4 files changed, 60 insertions(+), 22 deletions(-)
  
  diff --git a/arch/Kconfig b/arch/Kconfig
  index 05d7a8a..4834a58 100644
  --- a/arch/Kconfig
  +++ b/arch/Kconfig
  @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
This spares a stack switch and improves cache usage on softirq
processing.
   
  +config HAVE_COPY_THREAD_TLS
  +   bool
  +   help
  + Architecture provides copy_thread_tls to accept tls argument via
  + normal C parameter passing, rather than extracting the syscall
  + argument from pt_regs.
  +
   #
   # ABI hall of shame
   #
  diff --git a/include/linux/sched.h b/include/linux/sched.h
  index a419b65..2cc88c6 100644
  --- a/include/linux/sched.h
  +++ b/include/linux/sched.h
  @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct 
  task_struct *task, unsigned int mode);
   /* Remove the current tasks stale references to the old mm_struct */
   extern void mm_release(struct task_struct *, struct mm_struct *);
   
  +#ifdef CONFIG_HAVE_COPY_THREAD_TLS
  +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
  +   struct task_struct *, unsigned long);
  +#else
   extern int copy_thread(unsigned long, unsigned long, unsigned long,
  struct task_struct *);
  +
  +/* Architectures that haven't opted into copy_thread_tls get the tls 
  argument
  + * via pt_regs, so ignore the tls argument passed via C. */
  +static inline int copy_thread_tls(
  +   unsigned long clone_flags, unsigned long sp, unsigned long arg,
  +   struct task_struct *p, unsigned long tls)
  +{
  +   return copy_thread(clone_flags, sp, arg, p);
  +}
  +#endif
 
 Is this detour really needed. Can we not update copy_thread() of all arches 
 in one
 go and add the tls arg, w/o using it.
 
 And then arch maintainers can micro-optimize their code to use that arg vs.
 pt_regs-rxx version at their own leisure. The only downside I see with that 
 is
 bigger churn (touches all arches), and a interim unused arg warning ?

In addition to the cleanup and simplification, the purpose of this patch
is specifically to make sure that any architecture opting into
HAVE_COPY_THREAD_TLS does *not* care how tls is passed in, and in
particular doesn't depend on it arriving in a specific syscall argument.
I have patches in flight (for CLONE_FD and clone4) that depend on that
assumption, by introducing additional syscalls (with tls passed
differently) that call down through these same code paths.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:28:30AM -0400, Steven Rostedt wrote:
 On Mon, 11 May 2015 17:10:59 +0200
 Nicholas Mc Guire hof...@osadl.org wrote:
 
  rcu_cpu_has_callbacks() is declared int but is actually returning bool and
  and as the function description states  * Return true if the specified 
  CPU has any callback, this probably should be a bool. All (3) 
  call-sites currently treat it as bool so the declaration.
  
  
  Signed-off-by: Nicholas Mc Guire hof...@osadl.org
  ---
  
  V2: fixed up commit message and tool infos as requested by 
  Josh Triplett j...@joshtriplett.org
  
  Type-checking coccinelle spatches are being used to locate type 
  mismatches between function signatures and return values. 
  ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
  int != bool, 
  
  Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)
  
  Patch is against 4.1-rc3 (localversion-next is -next-20150511)
 
 I think what Josh was saying is that all the above except for the V2
 should be above the signature. Everything between the --- and the
 patch gets tossed out when committed into git.
 
 Giving credit to coccinelle and even what branch and config was used
 for testing is something we want in the git change log history.

Yes, exactly.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] init/Kconfig: Fix break in middle of EXPERT menu

2015-05-11 Thread Josh Triplett
Commit e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y 
!CONFIG_TRACING kernels, make it more configurable) made BPF_SYSCALL no
longer hidden with !EXPERT, but left it in the middle of the EXPERT
menu.  menuconfig stops putting config items under a submenu once it
encounters an item that doesn't depend on the menu's config item, so
this caused the remainder of the EXPERT menu to spill out into the
containing menu around it.  Fix by moving BPF_SYSCALL before the EXPERT
menu, next to BPF.

Fixes: e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y  !CONFIG_TRACING 
kernels, make it more configurable)
Signed-off-by: Josh Triplett j...@joshtriplett.org
---

Ingo, do you want to take this through -tip?  Or should this go through some
other tree?

I'm also thinking about splitting the entire EXPERT menu into a separate
Kconfig.expert and including it from init/Kconfig, to make it clear that
everything in that menu should only be visible if EXPERT.  Right now, the long
EXPERT menu blends into the longer init/Kconfig, and issues like this happen
every few kernel releases.

 init/Kconfig | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index dc24dec..e2f16f1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM
 config BPF
bool
 
+# syscall, maps, verifier
+config BPF_SYSCALL
+   bool Enable bpf() system call
+   select ANON_INODES
+   select BPF
+   default n
+   help
+ Enable the bpf() system call that allows to manipulate eBPF
+ programs and maps via file descriptors.
+
 menuconfig EXPERT
bool Configure standard kernel features (expert users)
# Unhide debug options, to make the on-by-default options visible
@@ -1535,16 +1545,6 @@ config EVENTFD
 
  If unsure, say Y.
 
-# syscall, maps, verifier
-config BPF_SYSCALL
-   bool Enable bpf() system call
-   select ANON_INODES
-   select BPF
-   default n
-   help
- Enable the bpf() system call that allows to manipulate eBPF
- programs and maps via file descriptors.
-
 config SHMEM
bool Use full shmem filesystem if EXPERT
default y
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] force inlining of spinlock ops

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 07:57:22PM +0200, Denys Vlasenko wrote:
 With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
 very small functions we expect to be inlined. In particular,
 with this config: http://busybox.net/~vda/kernel_config
 there are more than a thousand copies of tiny spinlock-related functions:
 
 $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort 
 -rn | grep ' spin'
 473 000b t spin_unlock_irqrestore
 292 000b t spin_unlock
 215 000b t spin_lock
 134 000b t spin_unlock_irq
 130 000b t spin_unlock_bh
 120 000b t spin_lock_irq
 106 000b t spin_lock_bh
 
 Disassembly:
 
 81004720 spin_lock:
 81004720:   55  push   %rbp
 81004721:   48 89 e5mov%rsp,%rbp
 81004724:   e8 f8 4e e2 02  callq  _raw_spin_lock
 81004729:   5d  pop%rbp
 8100472a:   c3  retq

Frame pointers make this even more awful, since without them this could
just become a single jmp.  (Assuming _raw_spin_lock shouldn't be
inlined too.)

 This patch fixes this via s/inline/__always_inline/ in spinlock.h.
 This decreases vmlinux by about 30k:
 
 text data  bss   dec hex filename
 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
 82335059 22255416 20627456 125217931 776ac8b vmlinux

Nice improvement.  Given that this actually makes the kernel *smaller*,
presumably in addition to faster, this forced inlining seems completely
reasonable.

 Signed-off-by: Denys Vlasenko dvlas...@redhat.com
 Cc: Thomas Graf tg...@suug.ch
 Cc: David S. Miller da...@davemloft.net
 Cc: Bart Van Assche bvanass...@acm.org
 Cc: Peter Zijlstra pet...@infradead.org
 Cc: David Rientjes rient...@google.com
 Cc: David S. Miller da...@davemloft.net
 Cc: Andrew Morton a...@linux-foundation.org
 Cc: Linus Torvalds torva...@linux-foundation.org
 Cc: Oleg Nesterov o...@redhat.com
 Cc: Paul E. McKenney paul...@linux.vnet.ibm.com
 Cc: Ingo Molnar mi...@kernel.org
 Cc: Paul E. McKenney paul...@linux.vnet.ibm.com
 CC: linux-kernel@vger.kernel.org

Reviewed-by: Josh Triplett j...@joshtriplett.org
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread.  sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass
along that argument.  Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes
tls in).

Apart from being awful and inscrutable, that also only works because
only one code path into copy_thread can pass the CLONE_SETTLS flag, and
that code path comes from sys_clone with its architecture-specific
argument-passing order.  This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.

However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.

Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
into, and a new copy_thread_tls that accepts the tls parameter as an
additional unsigned long (syscall-argument-sized) argument.
Change sys_clone's tls argument to an unsigned long (which does
not change the ABI), and pass that down to copy_thread_tls.

Architectures that don't opt into copy_thread_tls will continue to
ignore the C argument to sys_clone in favor of the pt_regs captured at
kernel entry, and thus will be unable to introduce new versions of the
clone syscall.

Patch co-authored by Josh Triplett and Thiago Macieira.

Signed-off-by: Josh Triplett j...@joshtriplett.org
Acked-by: Andy Lutomirski l...@kernel.org
---
 arch/Kconfig |  7 ++
 include/linux/sched.h| 14 
 include/linux/syscalls.h |  6 +++---
 kernel/fork.c| 55 +++-
 4 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a..4834a58 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
  This spares a stack switch and improves cache usage on softirq
  processing.
 
+config HAVE_COPY_THREAD_TLS
+   bool
+   help
+ Architecture provides copy_thread_tls to accept tls argument via
+ normal C parameter passing, rather than extracting the syscall
+ argument from pt_regs.
+
 #
 # ABI hall of shame
 #
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a419b65..2cc88c6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct 
*task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
 
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+   struct task_struct *, unsigned long);
+#else
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+   unsigned long clone_flags, unsigned long sp, unsigned long arg,
+   struct task_struct *p, unsigned long tls)
+{
+   return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
 extern void flush_thread(void);
 extern void exit_thread(void);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38..bb51bec 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_fork(void);
 asmlinkage long sys_vfork(void);
 #ifdef CONFIG_CLONE_BACKWARDS
-asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
+asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned 
long,
   int __user *);
 #else
 #ifdef CONFIG_CLONE_BACKWARDS3
 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
- int __user *, int);
+ int __user *, unsigned long);
 #else
 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
-  int __user *, int);
+  int __user *, unsigned long);
 #endif
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..b3dadf4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid

Re: [PATCH V4] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 06:12:27PM +0200, Nicholas Mc Guire wrote:
 rcu_cpu_has_callbacks() is declared int. The current declaration was 
 introduced
 in commit c0f4dfd4f90f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered
 callbacks). But it is actually returning bool and as the function description 
 states  * Return true if the specified CPU has any callback, this 
 probably 
 should be a bool as all (3) call-sites currently treat it as bool.
 
 Type-checking coccinelle spatches are being used to locate type mismatches
 between function signatures and return values in this case this produced:
 ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
 int != bool, 
 
 Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)
 
 Patch is against 4.1-rc3 (localversion-next is -next-20150511) and fixes
 
 Signed-off-by: Nicholas Mc Guire hof...@osadl.org

Seems like a reasonable addition.

Reviewed-by: Josh Triplett j...@joshtriplett.org

 ---
 
 V4: fix-up to include the origin of the issue being fixed as requeseted by
 Steven Rostedt rost...@goodmis.org.
 
  kernel/rcu/tree.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
 index bcc5943..599550c 100644
 --- a/kernel/rcu/tree.c
 +++ b/kernel/rcu/tree.c
 @@ -3516,7 +3516,7 @@ static int rcu_pending(void)
   * non-NULL, store an indication of whether all callbacks are lazy.
   * (If there are no callbacks, all of them are deemed to be lazy.)
   */
 -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
 +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
  {
   bool al = true;
   bool hc = false;
 -- 
 1.7.10.4
 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

2015-05-11 Thread Josh Triplett
For 32-bit userspace on a 64-bit kernel, this requires modifying
stub32_clone to actually swap the appropriate arguments to match
CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls
broken.

Patch co-authored by Josh Triplett and Thiago Macieira.

Signed-off-by: Josh Triplett j...@joshtriplett.org
Acked-by: Andy Lutomirski l...@kernel.org
---
 arch/x86/Kconfig | 1 +
 arch/x86/ia32/ia32entry.S| 2 +-
 arch/x86/kernel/process_32.c | 6 +++---
 arch/x86/kernel/process_64.c | 8 
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..4960b0d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,6 +124,7 @@ config X86
select MODULES_USE_ELF_REL if X86_32
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
+   select HAVE_COPY_THREAD_TLS
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebca..0286735 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -487,7 +487,7 @@ GLOBAL(\label)
ALIGN
 GLOBAL(stub32_clone)
leaq sys_clone(%rip),%rax
-   mov %r8, %rcx
+   xchg %r8, %rcx
jmp  ia32_ptregs_common 
 
ALIGN
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f9..ead28ff 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 */
if (clone_flags  CLONE_SETTLS)
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs-si, 0);
+   (struct user_desc __user *)tls, 0);
 
if (err  p-thread.io_bitmap_ptr) {
kfree(p-thread.io_bitmap_ptr);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43..c69cabc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int 
tls)
return get_desc_base(t-thread.tls_array[tls]);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
int err;
struct pt_regs *childregs;
@@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp,
 #ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs-si, 0);
+   (struct user_desc __user *)tls, 0);
else
 #endif
-   err = do_arch_prctl(p, ARCH_SET_FS, childregs-r8);
+   err = do_arch_prctl(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 04:00:43PM +0200, Ingo Molnar wrote:
 
 * Josh Triplett j...@joshtriplett.org wrote:
 
  On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote:
   
   * j...@joshtriplett.org j...@joshtriplett.org wrote:
   
On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote:
 On Tue, 21 Apr 2015, Josh Triplett wrote:
  
  Signed-off-by: Josh Triplett j...@joshtriplett.org
  Signed-off-by: Thiago Macieira thiago.macie...@intel.com
 
 Can you please clarify that SOB chain? It does not make any sense.

Co-authored patch.  We both worked on it together, and sadly git 
doesn't support a commit with multiple authors, so this is the next 
best thing.
   
   No, this is not a valid SOB chain.
   
   For 'co authored' patches you can add credits either to the file, 
   as two copyright lines, or via the changelog, no need to mess up 
   the SOB chain for that.
  
  Fine, I'll write it another way.
  
  Do you see any other issues with the patch other than the signoffs?
 
 Looks good to me, but I have not looked very deeply ...

I sent out a v2 with the co-author information moved from the signoffs
to the commit message.  If it looks reasonable to you, can you take it
through the tip tree please?

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 0/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-05-11 Thread Josh Triplett
clone has some of the quirkiest syscall handling in the kernel, with a pile of
special cases, historical curiosities, and architecture-specific calling
conventions.  In particular, clone with CLONE_SETTLS accepts a parameter tls
that the C entry point completely ignores and some assembly entry points
overwrite; instead, the low-level arch-specific code pulls the tls parameter
out of the arch-specific register captured as part of pt_regs on entry to the
kernel.  That's a massive hack, and it makes the arch-specific code only work
when called via the specific existing syscall entry points; because of this
hack, any new clone-like system call would have to accept an identical tls
argument in exactly the same arch-specific position, rather than providing a
unified system call entry point across architectures.

The first patch allows architectures to handle the tls argument via normal C
parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS.  The
second patch makes 32-bit and 64-bit x86 opt into this.

These two patches came out of the clone4 series, which isn't ready for this
merge window, but these first two cleanup patches were entirely uncontroversial
and have acks.  I'd like to go ahead and submit these two so that other
architectures can begin building on top of this and opting into
HAVE_COPY_THREAD_TLS.  However, I'm also happy to wait and send these through
the next merge window (along with v3 of clone4) if anyone would prefer that.

v2: Move co-author from signoffs to a note in the commit message, as
required by Ingo Molnar.

Josh Triplett (2):
  clone: Support passing tls argument via C rather than pt_regs magic
  x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

 arch/Kconfig |  7 ++
 arch/x86/Kconfig |  1 +
 arch/x86/ia32/ia32entry.S|  2 +-
 arch/x86/kernel/process_32.c |  6 ++---
 arch/x86/kernel/process_64.c |  8 +++
 include/linux/sched.h| 14 +++
 include/linux/syscalls.h |  6 ++---
 kernel/fork.c| 55 +---
 8 files changed, 69 insertions(+), 30 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote:
 On Mon, 2015-05-11 at 14:18 -0700, Josh Triplett wrote:
  However, that would produce *two* entries under the General setup
  menu: a yes/no entry Configure standard kernel features (expert users)
  with no submenu, and a Some separate menu prompt here entry with a
  submenu but no '[ ]' for a yes/no option.  Integrating the two (without
  using menuconfig's implicit add stuff to submenu until an option's
  prompt doesn't depend on this symbol magic) requires new a kconfig
  mechanism.
 
 The diff pasted at the end of this message, which I quickly cobbled
 together an applies on top of this 2/1, generates these two lines in
 menuconfig (for EXPERT = 'y')
 [*] Configure standard kernel features (expert users)
 Standard kernel features  ---   
 
 Is squashing those two lines worth a new kconfig mechanism?

In my opinion, yes.  If you use the implicit (and error-prone)
menuconfig submenuing, you get a single entry with the '[ ]' and the
submenu.  There are currently 272 instances of menuconfig in Kconfig
files.  I'd like to have a less error-prone mechanism for people to use,
with an explicit endmenu at the end, and I don't want to leave any
incentive for people to need the more error-prone version.

I would be tempted to just make menuconfig require an endmenu, and
convert all users, but that would almost certainly break many
third-party users of kconfig.  So instead, I'm currently extending
menu (which already expects endmenu) to allow the syntax
menu config SYMBOL, which acts like a combination of config SYMBOL
and a menu with visible if SYMBOL.  Diffstat for the patch I'm testing
right now:

 scripts/kconfig/zconf.y | 18 ++
  1 file changed, 18 insertions(+)

That seems worthwhile to have a less error-prone menu mechanism.

(The actual patch would also need to updated zconf.tab.c_shipped.)

(Also, the diff you posted would be smaller if you left config EXPERT
at the top of init/Kconfig.expert; why the move?)

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
The expert menu frequently gets broken by a config item in the middle
that leaves off the if EXPERT from its prompt.  This results in the
remainder of the menu spilling out into the parent General setup menu.
Move the entire expert menu into a separate Kconfig file,
init/Kconfig.expert, to make this harder to do accidentally, and to
break up the exceedingly long init/Kconfig a bit.

Signed-off-by: Josh Triplett j...@joshtriplett.org
---

This applies on top of init/Kconfig: Fix break in middle of EXPERT
menu.  Please apply both.

I'd also like to factor the if EXPERT off of all the prompts and into a
single scoped item wrapped around all of them, but kconfig doesn't have any way
to do that.  menuconfig is just a hint, with no matching endmenu and no
implicit visibility; menu is scoped and has visible if, but that would
create a separate option containing a menu, rather than a menu under EXPERT's
Configure standard kernel features (expert users).  And if EXPERT ... endif
produces a dependency, not a prompt-visibility condition.  So I think this
would require changes to the Kconfig language, to introduce either a scoped
visible if EXPERT ... endvisible or similar, or a scoped version of
menuconfig with a matching endmenu and implicit visibility (effectively a
menu statement with attached config rather than a config with a hint
this might be a menu).  I'm leaning towards the latter.

So I'll send a followup patch enhancing kconfig to improve this case,
but I think splitting this into a separate file is still worth it even
without that.

 init/Kconfig| 232 +---
 init/Kconfig.expert | 231 +++
 2 files changed, 232 insertions(+), 231 deletions(-)
 create mode 100644 init/Kconfig.expert

diff --git a/init/Kconfig b/init/Kconfig
index e2f16f1..a2de3f5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1351,237 +1351,7 @@ config BPF_SYSCALL
  Enable the bpf() system call that allows to manipulate eBPF
  programs and maps via file descriptors.
 
-menuconfig EXPERT
-   bool Configure standard kernel features (expert users)
-   # Unhide debug options, to make the on-by-default options visible
-   select DEBUG_KERNEL
-   help
- This option allows certain base kernel options and settings
-  to be disabled or tweaked. This is for specialized
-  environments which can tolerate a non-standard kernel.
-  Only use this if you really know what you are doing.
-
-config UID16
-   bool Enable 16-bit UID system calls if EXPERT
-   depends on HAVE_UID16  MULTIUSER
-   default y
-   help
- This enables the legacy 16-bit UID syscall wrappers.
-
-config MULTIUSER
-   bool Multiple users, groups and capabilities support if EXPERT
-   default y
-   help
- This option enables support for non-root users, groups and
- capabilities.
-
- If you say N here, all processes will run with UID 0, GID 0, and all
- possible capabilities.  Saying N here also compiles out support for
- system calls related to UIDs, GIDs, and capabilities, such as setuid,
- setgid, and capset.
-
- If unsure, say Y here.
-
-config SGETMASK_SYSCALL
-   bool sgetmask/ssetmask syscalls support if EXPERT
-   def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || 
SPARC || CRIS || MICROBLAZE || SUPERH
-   ---help---
- sys_sgetmask and sys_ssetmask are obsolete system calls
- no longer supported in libc but still enabled by default in some
- architectures.
-
- If unsure, leave the default option here.
-
-config SYSFS_SYSCALL
-   bool Sysfs syscall support if EXPERT
-   default y
-   ---help---
- sys_sysfs is an obsolete system call no longer supported in libc.
- Note that disabling this option is more secure but might break
- compatibility with some systems.
-
- If unsure say Y here.
-
-config SYSCTL_SYSCALL
-   bool Sysctl syscall support if EXPERT
-   depends on PROC_SYSCTL
-   default n
-   select SYSCTL
-   ---help---
- sys_sysctl uses binary paths that have been found challenging
- to properly maintain and use.  The interface in /proc/sys
- using paths with ascii names is now the primary path to this
- information.
-
- Almost nothing using the binary sysctl interface so if you are
- trying to save some space it is probably safe to disable this,
- making your kernel marginally smaller.
-
- If unsure say N here.
-
-config KALLSYMS
-bool Load all symbols for debugging/ksymoops if EXPERT
-default y
-help
-  Say Y here to let the kernel print out symbolic crash information and
-  symbolic stack backtraces. This increases the size of the kernel
-  somewhat, as all symbols have to be loaded

Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:01:22PM +0200, Paul Bolle wrote:
 On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote:
  I'd also like to factor the if EXPERT off of all the prompts and into a
  single scoped item wrapped around all of them, but kconfig doesn't have any 
  way
  to do that.  menuconfig is just a hint, with no matching endmenu and no
  implicit visibility; menu is scoped and has visible if, but that would
  create a separate option containing a menu, rather than a menu under 
  EXPERT's
  Configure standard kernel features (expert users).  And if EXPERT ... 
  endif
  produces a dependency, not a prompt-visibility condition.  So I think this
  would require changes to the Kconfig language, to introduce either a scoped
  visible if EXPERT ... endvisible or similar, or a scoped version of
  menuconfig with a matching endmenu and implicit visibility (effectively a
  menu statement with attached config rather than a config with a hint
  this might be a menu).  I'm leaning towards the latter.
 
 The behavior of menuconfig in this case is rather subtle. I must admit I
 never noticed it.
 
 The visible option to menus is little used, and I'm not really
 familiar with it. So, for what it's worth: would adding a new menu with
   visible if EXPERT
 
 attached to it, and putting all current
   prompt Foo if EXPERT
 
 entries in that menu roughly do what you want?

Not quite.  menu has a prompt but no associated symbol, so with current
kconfig that would look like:

config EXPERT
bool Configure standard kernel features (expert users)
...

menu Some separate menu prompt here
visible if EXPERT

... various options that currently have if EXPERT on their prompts ...

endmenu


However, that would produce *two* entries under the General setup
menu: a yes/no entry Configure standard kernel features (expert users)
with no submenu, and a Some separate menu prompt here entry with a
submenu but no '[ ]' for a yes/no option.  Integrating the two (without
using menuconfig's implicit add stuff to submenu until an option's
prompt doesn't depend on this symbol magic) requires new a kconfig
mechanism.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 11:50:21PM +0200, Paul Bolle wrote:
 On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote:
  --- /dev/null
  +++ b/init/Kconfig.expert
  @@ -0,0 +1,231 @@
  +menuconfig EXPERT
  +   bool Configure standard kernel features (expert users)
  +   # Unhide debug options, to make the on-by-default options visible
  +   select DEBUG_KERNEL
  +   help
  + This option allows certain base kernel options and settings
  +  to be disabled or tweaked. This is for specialized
  +  environments which can tolerate a non-standard kernel.
  +  Only use this if you really know what you are doing.
 
 Comment here saying
   # All entries in this file must have if EXPERT after their prompt
 
 or something to that effect (pending you patch, that is)?

Yeah, I should add such a comment.  I was hoping to make it obsolete via
kconfig changes, but in the interim, sure.

  +config KALLSYMS
  +bool Load all symbols for debugging/ksymoops if EXPERT
  +default y
  +help
  +  Say Y here to let the kernel print out symbolic crash information and
  +  symbolic stack backtraces. This increases the size of the kernel
  +  somewhat, as all symbols have to be loaded into the kernel image.
  +
  +config KALLSYMS_ALL
  +   bool Include all symbols in kallsyms
 
 (For some reason this entry doesn't have if EXPERT but it seems to
 behave as expected. Odd.)

Because it depends on KALLSYMS.  Magic!

  +   depends on DEBUG_KERNEL  KALLSYMS
  +   help
  +  Normally kallsyms only contains the symbols of functions for nicer
  +  OOPS messages and backtraces (i.e., symbols from the text and 
  inittext
  +  sections). This is sufficient for most cases. And only in very rare
  +  cases (e.g., when a debugger is used) all symbols are required (e.g.,
  +  names of variables from the data sections, etc).
  +
  +  This option makes sure that all symbols are loaded into the kernel
  +  image (i.e., symbols from all sections) in cost of increased kernel
  +  size (depending on the kernel configuration, it may be 300KiB or
  +  something like this).
  +
  +  Say N unless you really need all symbols.
  +
  +config PRINTK
  +   default y
  +   bool Enable support for printk if EXPERT
 
 Now you're touching this: bool [...] as the first line, please.

I'd like the commit moving these to their own file to very obviously
look like it's moving these lines unmodified, rather than making changes
in the process.  If you want this (and all the subsequent instances you
flagged) cleaned up, let's do that as a subsequent patch separate from
the move.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Tue, May 12, 2015 at 12:36:49AM +0200, Luis R. Rodriguez wrote:
 On Mon, May 11, 2015 at 01:23:02PM -0700, Josh Triplett wrote:
   init/Kconfig| 232 
  +---
   init/Kconfig.expert | 231 
  +++
   2 files changed, 232 insertions(+), 231 deletions(-)
   create mode 100644 init/Kconfig.expert
 
 I'm blinded by this patch, can you use git format-patch -M and also add:
 
 [diff]
   
 renamelimit=0
 
 To your .gitconfig

From a quick test, doing what you suggest doesn't have any effect; if I
also use -C1 instead of -M, then the patch to init/Kconfig looks exactly
the same (deleting 231 lines and adding a source line), while the
patch to init/Kconfig.expert goes from adding 231 lines to copying
init/Kconfig and deleting 1800 lines.

That does not seem like an improvement.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert

2015-05-11 Thread Josh Triplett
On Tue, May 12, 2015 at 12:01:27AM +0200, Paul Bolle wrote:
 On Mon, 2015-05-11 at 14:47 -0700, Josh Triplett wrote:
  On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote:
   Is squashing those two lines worth a new kconfig mechanism?
  
  In my opinion, yes.  If you use the implicit (and error-prone)
  menuconfig submenuing, you get a single entry with the '[ ]' and the
  submenu.  There are currently 272 instances of menuconfig in Kconfig
  files.
 
 How many of those use the subtle trick EXPERT uses?

A rough search suggests about 35, with the rest using an if right
after the menu (presumably because they want dependencies rather than
visibility).  (A couple of which currently have the same broken-menu
problem today's patch 1 fixed for EXPERT, and a couple of which are
using that trick *across input files*.)

  I'd like to have a less error-prone mechanism for people to use,
  with an explicit endmenu at the end, and I don't want to leave any
  incentive for people to need the more error-prone version.
  
  I would be tempted to just make menuconfig require an endmenu, and
  convert all users, but that would almost certainly break many
  third-party users of kconfig.  So instead, I'm currently extending
  menu (which already expects endmenu) to allow the syntax
  menu config SYMBOL, which acts like a combination of config SYMBOL
  and a menu with visible if SYMBOL.
 
 Bikeshedding (before I'm even convinced of the need of this extension):
 menu config is far too similar to menuconfig.

Yeah, agreed; given that people will still end up using menuconfig in
light of the above, I'll use another syntax.

  Diffstat for the patch I'm testing
  right now:
  
   scripts/kconfig/zconf.y | 18 ++
1 file changed, 18 insertions(+)
  
  That seems worthwhile to have a less error-prone menu mechanism.
  
  (The actual patch would also need to updated zconf.tab.c_shipped.)
 
 And some lines in Documentation/kbuild/kconfig-language.txt (speaking
 from memory).

Good point; this does need documentation.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2] rcu: change function declaration to bool

2015-05-11 Thread Josh Triplett
On Mon, May 11, 2015 at 05:46:12PM +0200, Nicholas Mc Guire wrote:
 rcu_cpu_has_callbacks() is declared int but is actually returning bool
 and as the function description states  * Return true if the specified 
 CPU has any callback, this probably should be a bool. All (3) 
 call-sites currently treat it as bool.
 
 Type-checking coccinelle spatches are being used to locate type mismatches
 between function signatures and return values in this case this produced:
 ./kernel/rcu/tree.c:3538 WARNING: return of wrong type
 int != bool, 
 
 Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y)
 
 Patch is against 4.1-rc3 (localversion-next is -next-20150511)
 
 Signed-off-by: Nicholas Mc Guire hof...@osadl.org

Reviewed-by: Josh Triplett j...@joshtriplett.org

Thanks!

 ---
 
 V3: fix-up of commit message again (hope I got it right this time) as 
 requested by Josh Triplett j...@joshtriplett.org/Steven Rostedt
 rost...@goodmis.org
 
  kernel/rcu/tree.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
 index bcc5943..599550c 100644
 --- a/kernel/rcu/tree.c
 +++ b/kernel/rcu/tree.c
 @@ -3516,7 +3516,7 @@ static int rcu_pending(void)
   * non-NULL, store an indication of whether all callbacks are lazy.
   * (If there are no callbacks, all of them are deemed to be lazy.)
   */
 -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
 +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
  {
   bool al = true;
   bool hc = false;
 -- 
 1.7.10.4
 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] init.h: mark init functions hot instead of cold

2015-05-09 Thread Josh Triplett
On Sat, May 09, 2015 at 02:17:39AM -0700, Andi Kleen wrote:
> On Sat, May 09, 2015 at 12:45:01AM +0200, Rasmus Villemoes wrote:
> > attribute((cold)) causes gcc to optimize the function for size rather
> > than speed. But since __init functions will be discarded anyway, I
> > don't see why memory should be a concern. On the contrary, everybody
> 
> It makes the bzImage smaller.
> A lot of people on smaller systems are interested in flash size.

True, but people interested in flash size can set
CONFIG_CC_OPTIMIZE_FOR_SIZE.

I would propose dropping "cold" and *not* adding "hot"; just let __init
functions get default optimizations.  People who set
CONFIG_CC_OPTIMIZE_FOR_SIZE should get similar size optimizations, and
people who don't will get GCC's normal optimizations, which should
provide much of the improved boot performance Rasmus observed.

Rasmus, can you confirm that just dropping cold 1) doesn't make the
kernel larger when building with CONFIG_CC_OPTIMIZE_FOR_SIZE, and 2)
gives approximately the same 2% performance benefit you observed?

> > wants their box to boot faster. Using the opposite attribute, hot,
> > causes gcc to optimize the functions more aggressively, possibly at
> > the expense of larger (.init).text. A completely unscientific test
> > showed about 2% faster boot time: I booted a kernel in qemu with and
> > without this patch five times each; the boot times were very stable in
> > each case, so I think the 2% is ok, but of course only applies to that
> > specific .config running in a virtual machine on my hardware.
> 
> 2% on boot is basically noise.

I disagree; there are people working on shaving milliseconds from boot.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] init.h: mark init functions hot instead of cold

2015-05-09 Thread Josh Triplett
On Sat, May 09, 2015 at 02:17:39AM -0700, Andi Kleen wrote:
 On Sat, May 09, 2015 at 12:45:01AM +0200, Rasmus Villemoes wrote:
  attribute((cold)) causes gcc to optimize the function for size rather
  than speed. But since __init functions will be discarded anyway, I
  don't see why memory should be a concern. On the contrary, everybody
 
 It makes the bzImage smaller.
 A lot of people on smaller systems are interested in flash size.

True, but people interested in flash size can set
CONFIG_CC_OPTIMIZE_FOR_SIZE.

I would propose dropping cold and *not* adding hot; just let __init
functions get default optimizations.  People who set
CONFIG_CC_OPTIMIZE_FOR_SIZE should get similar size optimizations, and
people who don't will get GCC's normal optimizations, which should
provide much of the improved boot performance Rasmus observed.

Rasmus, can you confirm that just dropping cold 1) doesn't make the
kernel larger when building with CONFIG_CC_OPTIMIZE_FOR_SIZE, and 2)
gives approximately the same 2% performance benefit you observed?

  wants their box to boot faster. Using the opposite attribute, hot,
  causes gcc to optimize the functions more aggressively, possibly at
  the expense of larger (.init).text. A completely unscientific test
  showed about 2% faster boot time: I booted a kernel in qemu with and
  without this patch five times each; the boot times were very stable in
  each case, so I think the 2% is ok, but of course only applies to that
  specific .config running in a virtual machine on my hardware.
 
 2% on boot is basically noise.

I disagree; there are people working on shaving milliseconds from boot.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] uidgid: Make uid_valid and gid_valid work with !CONFIG_MULTIUSER

2015-05-07 Thread Josh Triplett
{u,g}id_valid call {u,g}id_eq, which calls __k{u,g}id_val on both
arguments and compares.  With !CONFIG_MULTIUSER, __k{u,g}id_val return a
constant 0, which makes {u,g}id_valid always return false.  Change
{u,g}id_valid to compare their argument against -1 instead.  That
produces identical results in the normal CONFIG_MULTIUSER=y case, but
with !CONFIG_MULTIUSER will make {u,g}id_valid constant-fold into
"return true;" rather than "return false;".

This fixes uses of devpts without CONFIG_MULTIUSER.

Signed-off-by: Josh Triplett 
---
 include/linux/uidgid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index 0ee05da..0383552 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -109,12 +109,12 @@ static inline bool gid_lte(kgid_t left, kgid_t right)
 
 static inline bool uid_valid(kuid_t uid)
 {
-   return !uid_eq(uid, INVALID_UID);
+   return __kuid_val(uid) != (uid_t) -1;
 }
 
 static inline bool gid_valid(kgid_t gid)
 {
-   return !gid_eq(gid, INVALID_GID);
+   return __kgid_val(gid) != (gid_t) -1;
 }
 
 #ifdef CONFIG_USER_NS
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2] devpts: If initialization failed, don't crash when opening /dev/ptmx

2015-05-07 Thread Josh Triplett
If devpts failed to initialize, it would store an ERR_PTR in the global
devpts_mnt.  A subsequent open of /dev/ptmx would call devpts_new_index,
which would dereference devpts_mnt and crash.

Avoid storing invalid values in devpts_mnt; leave it NULL instead.
Make both devpts_new_index and devpts_pty_new fail gracefully with
ENODEV in that case, which then becomes the return value to the
userspace open call on /dev/ptmx.

Signed-off-by: Josh Triplett 
Reviewed-by: Peter Hurley 
---

v2: Fix copy-paste error caught by Peter Hurley.

As mentioned in v1, this is separate from fixing the bug that makes devpts fail
to initialize in the first place, but this makes devpts fail gracefully rather
than crashing.

 fs/devpts/inode.c | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cfe8466..016687b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct 
inode *inode)
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
 #endif
+   if (!devpts_mnt)
+   return NULL;
return devpts_mnt->mnt_sb;
 }
 
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
+   struct pts_fs_info *fsi;
int index;
int ida_ret;
 
+   if (!sb)
+   return -ENODEV;
+
+   fsi = DEVPTS_SB(sb);
 retry:
if (!ida_pre_get(>allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, 
dev_t device, int index,
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
struct inode *inode;
-   struct dentry *root = sb->s_root;
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
-   struct pts_mount_opts *opts = >mount_opts;
+   struct dentry *root;
+   struct pts_fs_info *fsi;
+   struct pts_mount_opts *opts;
char s[12];
 
+   if (!sb)
+   return ERR_PTR(-ENODEV);
+
+   root = sb->s_root;
+   fsi = DEVPTS_SB(sb);
+   opts = >mount_opts;
+
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -676,12 +689,15 @@ static int __init init_devpts_fs(void)
struct ctl_table_header *table;
 
if (!err) {
+   struct vfsmount *mnt;
table = register_sysctl_table(pty_root_table);
-   devpts_mnt = kern_mount(_fs_type);
-   if (IS_ERR(devpts_mnt)) {
-   err = PTR_ERR(devpts_mnt);
+   mnt = kern_mount(_fs_type);
+   if (IS_ERR(mnt)) {
+   err = PTR_ERR(mnt);
unregister_filesystem(_fs_type);
unregister_sysctl_table(table);
+   } else {
+   devpts_mnt = mnt;
}
}
return err;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] uidgid: Make uid_valid and gid_valid work with !CONFIG_MULTIUSER

2015-05-07 Thread Josh Triplett
{u,g}id_valid call {u,g}id_eq, which calls __k{u,g}id_val on both
arguments and compares.  With !CONFIG_MULTIUSER, __k{u,g}id_val return a
constant 0, which makes {u,g}id_valid always return false.  Change
{u,g}id_valid to compare their argument against -1 instead.  That
produces identical results in the normal CONFIG_MULTIUSER=y case, but
with !CONFIG_MULTIUSER will make {u,g}id_valid constant-fold into
return true; rather than return false;.

This fixes uses of devpts without CONFIG_MULTIUSER.

Signed-off-by: Josh Triplett j...@joshtriplett.org
---
 include/linux/uidgid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index 0ee05da..0383552 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -109,12 +109,12 @@ static inline bool gid_lte(kgid_t left, kgid_t right)
 
 static inline bool uid_valid(kuid_t uid)
 {
-   return !uid_eq(uid, INVALID_UID);
+   return __kuid_val(uid) != (uid_t) -1;
 }
 
 static inline bool gid_valid(kgid_t gid)
 {
-   return !gid_eq(gid, INVALID_GID);
+   return __kgid_val(gid) != (gid_t) -1;
 }
 
 #ifdef CONFIG_USER_NS
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2] devpts: If initialization failed, don't crash when opening /dev/ptmx

2015-05-07 Thread Josh Triplett
If devpts failed to initialize, it would store an ERR_PTR in the global
devpts_mnt.  A subsequent open of /dev/ptmx would call devpts_new_index,
which would dereference devpts_mnt and crash.

Avoid storing invalid values in devpts_mnt; leave it NULL instead.
Make both devpts_new_index and devpts_pty_new fail gracefully with
ENODEV in that case, which then becomes the return value to the
userspace open call on /dev/ptmx.

Signed-off-by: Josh Triplett j...@joshtriplett.org
Reviewed-by: Peter Hurley pe...@hurleysoftware.com
---

v2: Fix copy-paste error caught by Peter Hurley.

As mentioned in v1, this is separate from fixing the bug that makes devpts fail
to initialize in the first place, but this makes devpts fail gracefully rather
than crashing.

 fs/devpts/inode.c | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cfe8466..016687b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct 
inode *inode)
if (inode-i_sb-s_magic == DEVPTS_SUPER_MAGIC)
return inode-i_sb;
 #endif
+   if (!devpts_mnt)
+   return NULL;
return devpts_mnt-mnt_sb;
 }
 
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
+   struct pts_fs_info *fsi;
int index;
int ida_ret;
 
+   if (!sb)
+   return -ENODEV;
+
+   fsi = DEVPTS_SB(sb);
 retry:
if (!ida_pre_get(fsi-allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, 
dev_t device, int index,
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
struct inode *inode;
-   struct dentry *root = sb-s_root;
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
-   struct pts_mount_opts *opts = fsi-mount_opts;
+   struct dentry *root;
+   struct pts_fs_info *fsi;
+   struct pts_mount_opts *opts;
char s[12];
 
+   if (!sb)
+   return ERR_PTR(-ENODEV);
+
+   root = sb-s_root;
+   fsi = DEVPTS_SB(sb);
+   opts = fsi-mount_opts;
+
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -676,12 +689,15 @@ static int __init init_devpts_fs(void)
struct ctl_table_header *table;
 
if (!err) {
+   struct vfsmount *mnt;
table = register_sysctl_table(pty_root_table);
-   devpts_mnt = kern_mount(devpts_fs_type);
-   if (IS_ERR(devpts_mnt)) {
-   err = PTR_ERR(devpts_mnt);
+   mnt = kern_mount(devpts_fs_type);
+   if (IS_ERR(mnt)) {
+   err = PTR_ERR(mnt);
unregister_filesystem(devpts_fs_type);
unregister_sysctl_table(table);
+   } else {
+   devpts_mnt = mnt;
}
}
return err;
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] devpts: If initialization failed, don't crash when opening /dev/ptmx

2015-05-06 Thread Josh Triplett
If devpts failed to initialize, it would store an ERR_PTR in the global
devpts_mnt.  A subsequent open of /dev/ptmx would call devpts_new_index,
which would dereference devpts_mnt and crash.

Avoid storing invalid values in devpts_mnt; leave it NULL instead.
Make both devpts_new_index and devpts_pty_new fail gracefully with
ENODEV in that case, which then becomes the return value to the
userspace open call on /dev/ptmx.

Signed-off-by: Josh Triplett 
---

This fixes a crash found by Fengguang Wu's 0-day service ("BUG: unable to
handle kernel paging request at ffee").  It doesn't yet fix the underlying
initialization failure in init_devpts_fs, but it stops that failure from
becoming a kernel crash.  I'm working on the initialization failure now.

 fs/devpts/inode.c | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cfe8466..03e9076 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct 
inode *inode)
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
 #endif
+   if (!devpts_mnt)
+   return NULL;
return devpts_mnt->mnt_sb;
 }
 
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
+   struct pts_fs_info *fsi;
int index;
int ida_ret;
 
+   if (!sb)
+   return -ENODEV;
+
+   fsi = DEVPTS_SB(sb);
 retry:
if (!ida_pre_get(>allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, 
dev_t device, int index,
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
struct inode *inode;
-   struct dentry *root = sb->s_root;
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
-   struct pts_mount_opts *opts = >mount_opts;
+   struct dentry *root;
+   struct pts_fs_info *fsi;
+   struct pts_mount_opts *opts;
char s[12];
 
+   if (!sb)
+   return ERR_PTR(-ENODEV);
+
+   root = sb->s_root;
+   fsi = DEVPTS_SB(sb);
+   opts = >mount_opts;
+
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -676,12 +689,15 @@ static int __init init_devpts_fs(void)
struct ctl_table_header *table;
 
if (!err) {
+   static struct vfsmount *mnt;
table = register_sysctl_table(pty_root_table);
-   devpts_mnt = kern_mount(_fs_type);
-   if (IS_ERR(devpts_mnt)) {
-   err = PTR_ERR(devpts_mnt);
+   mnt = kern_mount(_fs_type);
+   if (IS_ERR(mnt)) {
+   err = PTR_ERR(mnt);
unregister_filesystem(_fs_type);
unregister_sysctl_table(table);
+   } else {
+   devpts_mnt = mnt;
}
}
return err;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] devpts: If initialization failed, don't crash when opening /dev/ptmx

2015-05-06 Thread Josh Triplett
If devpts failed to initialize, it would store an ERR_PTR in the global
devpts_mnt.  A subsequent open of /dev/ptmx would call devpts_new_index,
which would dereference devpts_mnt and crash.

Avoid storing invalid values in devpts_mnt; leave it NULL instead.
Make both devpts_new_index and devpts_pty_new fail gracefully with
ENODEV in that case, which then becomes the return value to the
userspace open call on /dev/ptmx.

Signed-off-by: Josh Triplett j...@joshtriplett.org
---

This fixes a crash found by Fengguang Wu's 0-day service (BUG: unable to
handle kernel paging request at ffee).  It doesn't yet fix the underlying
initialization failure in init_devpts_fs, but it stops that failure from
becoming a kernel crash.  I'm working on the initialization failure now.

 fs/devpts/inode.c | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cfe8466..03e9076 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct 
inode *inode)
if (inode-i_sb-s_magic == DEVPTS_SUPER_MAGIC)
return inode-i_sb;
 #endif
+   if (!devpts_mnt)
+   return NULL;
return devpts_mnt-mnt_sb;
 }
 
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
+   struct pts_fs_info *fsi;
int index;
int ida_ret;
 
+   if (!sb)
+   return -ENODEV;
+
+   fsi = DEVPTS_SB(sb);
 retry:
if (!ida_pre_get(fsi-allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, 
dev_t device, int index,
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
struct inode *inode;
-   struct dentry *root = sb-s_root;
-   struct pts_fs_info *fsi = DEVPTS_SB(sb);
-   struct pts_mount_opts *opts = fsi-mount_opts;
+   struct dentry *root;
+   struct pts_fs_info *fsi;
+   struct pts_mount_opts *opts;
char s[12];
 
+   if (!sb)
+   return ERR_PTR(-ENODEV);
+
+   root = sb-s_root;
+   fsi = DEVPTS_SB(sb);
+   opts = fsi-mount_opts;
+
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -676,12 +689,15 @@ static int __init init_devpts_fs(void)
struct ctl_table_header *table;
 
if (!err) {
+   static struct vfsmount *mnt;
table = register_sysctl_table(pty_root_table);
-   devpts_mnt = kern_mount(devpts_fs_type);
-   if (IS_ERR(devpts_mnt)) {
-   err = PTR_ERR(devpts_mnt);
+   mnt = kern_mount(devpts_fs_type);
+   if (IS_ERR(mnt)) {
+   err = PTR_ERR(mnt);
unregister_filesystem(devpts_fs_type);
unregister_sysctl_table(table);
+   } else {
+   devpts_mnt = mnt;
}
}
return err;
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rcu: declare rcu_data variables in the section they are defined in

2015-05-03 Thread Josh Triplett
On Sun, May 03, 2015 at 05:57:53PM +0800, Nicolas Iooss wrote:
> Commit 11bbb235c26f ("rcu: Use DEFINE_PER_CPU_SHARED_ALIGNED for
> rcu_data") replaced DEFINE_PER_CPU by DEFINE_PER_CPU_SHARED_ALIGNED in
> the definition of rcu_sched and rcu_bh without updating
> kernel/rcu/tree.h.
> 
> This makes clang report a section mismatch (-Wsection warning) when
> building LLVMLinux because the variables are declared in .data..percpu
> but defined in .data..percpu..shared_aligned.
> 
> Signed-off-by: Nicolas Iooss 

Good catch.
Reviewed-by: Josh Triplett 

> ---
>  kernel/rcu/tree.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index a69d3dab2ec4..c5e85b27a79f 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -519,10 +519,10 @@ extern struct list_head rcu_struct_flavors;
>   * RCU implementation internal declarations:
>   */
>  extern struct rcu_state rcu_sched_state;
> -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
> +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_sched_data);
>  
>  extern struct rcu_state rcu_bh_state;
> -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
> +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_bh_data);
>  
>  #ifdef CONFIG_PREEMPT_RCU
>  extern struct rcu_state rcu_preempt_state;
> -- 
> 2.3.6
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rcu: declare rcu_data variables in the section they are defined in

2015-05-03 Thread Josh Triplett
On Sun, May 03, 2015 at 05:57:53PM +0800, Nicolas Iooss wrote:
 Commit 11bbb235c26f (rcu: Use DEFINE_PER_CPU_SHARED_ALIGNED for
 rcu_data) replaced DEFINE_PER_CPU by DEFINE_PER_CPU_SHARED_ALIGNED in
 the definition of rcu_sched and rcu_bh without updating
 kernel/rcu/tree.h.
 
 This makes clang report a section mismatch (-Wsection warning) when
 building LLVMLinux because the variables are declared in .data..percpu
 but defined in .data..percpu..shared_aligned.
 
 Signed-off-by: Nicolas Iooss nicolas.iooss_li...@m4x.org

Good catch.
Reviewed-by: Josh Triplett j...@joshtriplett.org

 ---
  kernel/rcu/tree.h | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
 index a69d3dab2ec4..c5e85b27a79f 100644
 --- a/kernel/rcu/tree.h
 +++ b/kernel/rcu/tree.h
 @@ -519,10 +519,10 @@ extern struct list_head rcu_struct_flavors;
   * RCU implementation internal declarations:
   */
  extern struct rcu_state rcu_sched_state;
 -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
 +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_sched_data);
  
  extern struct rcu_state rcu_bh_state;
 -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_bh_data);
  
  #ifdef CONFIG_PREEMPT_RCU
  extern struct rcu_state rcu_preempt_state;
 -- 
 2.3.6
 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [CONFIG_MULTIUSER] init: error.c:320: Assertion failed in nih_error_get: CURRENT_CONTEXT->error != NULL

2015-05-02 Thread Josh Triplett
On Sun, May 03, 2015 at 07:18:28AM +0800, Fengguang Wu wrote:
> Hi Iulia,
> 
> FYI, there are Ubuntu init error messages when CONFIG_MULTIUSER=n.
> Since it's not embedded system and hence the target user of
> CONFIG_MULTIUSER=n, it might be fine..

I would expect a non-trivial amount of work required to make a standard
distribution boot with CONFIG_MULTIUSER=n.  Anything attempting to set
the user ID or group ID will fail, such as su, start-stop-daemon
--chuid, or systemd's daemon launching code.

So I'd suggest that this is an expected failure; allnoconfig or
tinyconfig would already not be expected to boot unmodified Ubuntu.

- Josh triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [CONFIG_MULTIUSER] init: error.c:320: Assertion failed in nih_error_get: CURRENT_CONTEXT-error != NULL

2015-05-02 Thread Josh Triplett
On Sun, May 03, 2015 at 07:18:28AM +0800, Fengguang Wu wrote:
 Hi Iulia,
 
 FYI, there are Ubuntu init error messages when CONFIG_MULTIUSER=n.
 Since it's not embedded system and hence the target user of
 CONFIG_MULTIUSER=n, it might be fine..

I would expect a non-trivial amount of work required to make a standard
distribution boot with CONFIG_MULTIUSER=n.  Anything attempting to set
the user ID or group ID will fail, such as su, start-stop-daemon
--chuid, or systemd's daemon launching code.

So I'd suggest that this is an expected failure; allnoconfig or
tinyconfig would already not be expected to boot unmodified Ubuntu.

- Josh triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone

2015-04-23 Thread Josh Triplett
On Thu, Apr 23, 2015 at 08:24:38AM +0200, Ingo Molnar wrote:
> * Josh Triplett  wrote:
> > On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote:
> > > On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett  
> > > wrote:
> > > >
> > > > I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing
> > > > this
> > > 
> > > Ugh, I absolutely detesrt that patch.
> > > 
> > > Don't make random crazy function signatures that depend on some config
> > > option. That's just evil. The patch is a mess of #ifdef's and should
> > > be shot in the head and staked with a silver stake to  make sure it
> > > never re-appears.
> > > 
> > > Either:
> > > 
> > >  (a) make the change for every architecture
> > > 
> > >  (b) have side-by-side interfaces. With different names!
> > 
> > ...that's exactly what I did.  They're called copy_thread and 
> > copy_thread_tls; I very intentionally did not conditionally change 
> > the signature of copy_thread, for exactly that reason.  Those 
> > functions are implemented in architecture-specific code, so the 
> > config option just specifies which of the two functions the 
> > architecture provides.
> > 
> > *sys_clone* has different function signatures based on config 
> > options, but I didn't touch that other than fixing the type of the 
> > tls argument. That's historical baggage that we can't throw away 
> > without breaking userspace.
> 
> So you want to add a new clone() parameter. I strongly suspect that 
> you won't be the last person who wants to do this.
> 
> So why not leave the compatibility baggage alone and introduce a new 
> clean clone syscall with a flexible, backwards and forwards compatible 
> parameter ABI?

...that's also exactly what I did, in the clone4 patch series, which
uses exactly the arg-structure approach you're describing. :)

Take a look at the clone4 patch series (v2).  We'll be updating it to v3
to address some feedback, and we're hoping to aim for the 4.2 merge
window.

> Something like:
> 
>   SYSCALL_DEFINE1(clone_params, struct clone_params __user *, params)

clone4 passes the size outside the params structure, since otherwise
you'd need to copy the first few bytes to know how much more to copy.

clone4 also passes flags outside the params structure, to allow for a
potential future flag that might indicate a completely different
interpretation of the params structure.

But otherwise, yes, clone4 moves all the parameters into a structure.

(Differences between clone4_args and your clone_params structure: size
and flags passed outside, type of the tls parameter is "unsigned long"
since it's pointer-sized, and the structure includes the stack_size
argument needed on some architectures.)

> The only real cost of this approach is that this parameter structure 
> has to be copied (once): should be fast as it fits into a single cache 
> line and is a constant size copy. Also note how this parameter block 
> can be passed down by const reference from that point on, instead of 
> copying/shuffling 5-6 parameters into increasingly long function 
> parameter lists. So it might in fact turn out to be slightly faster as 
> well.

Agreed.

> Note how easily extensible it is: a new field can be added by 
> appending to the structure, and compatibility is achieved in the 
> kernel without explicit versioning, by checking params->size:
> 
>   params->size == sizeof(*params):
> 
>  Good, kernel and userspace ABI version matches. This is a simple
>  check and the most typical situation - it will be the fastest as
>  well.
> 
> 
>   params->size < sizeof(*params):
> 
>  Old binary calls into new kernel. Missing fields are set to 0.
>  Binaries will be forward compatible without any versioning hacks.

I combined these two cases by just copying the userspace struct over a
pre-zeroed params structure; then any fields not copied over will remain
zero.

>   params->size > sizeof(*params):
> 
>  New user-space calls into old kernel. System call can still be
>  serviced if the new field(s) are all zero. We return -ENOSYS if 
>  any field is non-zero. (i.e. if new user-space tries to make use
>  of a new syscall feature that this kernel has not implemented
>  yet.) This way user-space can figure out whether a particular
>  new parameter is supported by a kernel, without having to add new
>  system calls or versioning.

In theory clone4 could have had this special case for "userspace passed
extra parameters this kernel doesn't understand, but they're all zero",
but since this is a brand new ABI, i

Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone

2015-04-23 Thread Josh Triplett
On Thu, Apr 23, 2015 at 08:24:38AM +0200, Ingo Molnar wrote:
 * Josh Triplett j...@joshtriplett.org wrote:
  On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote:
   On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett j...@joshtriplett.org 
   wrote:
   
I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing
this
   
   Ugh, I absolutely detesrt that patch.
   
   Don't make random crazy function signatures that depend on some config
   option. That's just evil. The patch is a mess of #ifdef's and should
   be shot in the head and staked with a silver stake to  make sure it
   never re-appears.
   
   Either:
   
(a) make the change for every architecture
   
(b) have side-by-side interfaces. With different names!
  
  ...that's exactly what I did.  They're called copy_thread and 
  copy_thread_tls; I very intentionally did not conditionally change 
  the signature of copy_thread, for exactly that reason.  Those 
  functions are implemented in architecture-specific code, so the 
  config option just specifies which of the two functions the 
  architecture provides.
  
  *sys_clone* has different function signatures based on config 
  options, but I didn't touch that other than fixing the type of the 
  tls argument. That's historical baggage that we can't throw away 
  without breaking userspace.
 
 So you want to add a new clone() parameter. I strongly suspect that 
 you won't be the last person who wants to do this.
 
 So why not leave the compatibility baggage alone and introduce a new 
 clean clone syscall with a flexible, backwards and forwards compatible 
 parameter ABI?

...that's also exactly what I did, in the clone4 patch series, which
uses exactly the arg-structure approach you're describing. :)

Take a look at the clone4 patch series (v2).  We'll be updating it to v3
to address some feedback, and we're hoping to aim for the 4.2 merge
window.

 Something like:
 
   SYSCALL_DEFINE1(clone_params, struct clone_params __user *, params)

clone4 passes the size outside the params structure, since otherwise
you'd need to copy the first few bytes to know how much more to copy.

clone4 also passes flags outside the params structure, to allow for a
potential future flag that might indicate a completely different
interpretation of the params structure.

But otherwise, yes, clone4 moves all the parameters into a structure.

(Differences between clone4_args and your clone_params structure: size
and flags passed outside, type of the tls parameter is unsigned long
since it's pointer-sized, and the structure includes the stack_size
argument needed on some architectures.)

 The only real cost of this approach is that this parameter structure 
 has to be copied (once): should be fast as it fits into a single cache 
 line and is a constant size copy. Also note how this parameter block 
 can be passed down by const reference from that point on, instead of 
 copying/shuffling 5-6 parameters into increasingly long function 
 parameter lists. So it might in fact turn out to be slightly faster as 
 well.

Agreed.

 Note how easily extensible it is: a new field can be added by 
 appending to the structure, and compatibility is achieved in the 
 kernel without explicit versioning, by checking params-size:
 
   params-size == sizeof(*params):
 
  Good, kernel and userspace ABI version matches. This is a simple
  check and the most typical situation - it will be the fastest as
  well.
 
 
   params-size  sizeof(*params):
 
  Old binary calls into new kernel. Missing fields are set to 0.
  Binaries will be forward compatible without any versioning hacks.

I combined these two cases by just copying the userspace struct over a
pre-zeroed params structure; then any fields not copied over will remain
zero.

   params-size  sizeof(*params):
 
  New user-space calls into old kernel. System call can still be
  serviced if the new field(s) are all zero. We return -ENOSYS if 
  any field is non-zero. (i.e. if new user-space tries to make use
  of a new syscall feature that this kernel has not implemented
  yet.) This way user-space can figure out whether a particular
  new parameter is supported by a kernel, without having to add new
  system calls or versioning.

In theory clone4 could have had this special case for userspace passed
extra parameters this kernel doesn't understand, but they're all zero,
but since this is a brand new ABI, it seems far easier for userspace to
simply pass only the size needed for its non-zero parameters, and then
the kernel can reject structures larger than it expects.  Only pass the
new version of the args structure if you need to pass non-zero values
for the new fields in that version.

Since clone4 doesn't even copy the structure if the size exceeds what it
understands, that also avoids additional special cases such as the user
passing in an obscenely large size.

 Also note how 'fool proof' this kind

Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone

2015-04-22 Thread Josh Triplett
On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote:
> On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett  wrote:
> >
> > I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing
> > this
> 
> Ugh, I absolutely detesrt that patch.
> 
> Don't make random crazy function signatures that depend on some config
> option. That's just evil. The patch is a mess of #ifdef's and should
> be shot in the head and staked with a silver stake to  make sure it
> never re-appears.
> 
> Either:
> 
>  (a) make the change for every architecture
> 
>  (b) have side-by-side interfaces. With different names!

...that's exactly what I did.  They're called copy_thread and
copy_thread_tls; I very intentionally did not conditionally change the
signature of copy_thread, for exactly that reason.  Those functions are
implemented in architecture-specific code, so the config option just
specifies which of the two functions the architecture provides.

*sys_clone* has different function signatures based on config options,
but I didn't touch that other than fixing the type of the tls argument.
That's historical baggage that we can't throw away without breaking
userspace.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone

2015-04-22 Thread Josh Triplett
On Wed, Apr 22, 2015 at 09:54:24AM -0700, Andy Lutomirski wrote:
> On Wed, Apr 22, 2015 at 9:40 AM, Denys Vlasenko  wrote:
> > Really swap arguments #4 and #5 in stub32_clone instead of "optimizing"
> > it into a move.
> >
> > Yes, tls_val is currently unused. Yes, on some CPUs XCHG is a little bit
> > more expensive than MOV. But a cycle or two on an expensive syscall like
> > clone() is way below noise floor, and obfuscation of logic introduced
> > by this optimization is simply not worth it.
> 
> Ditto re: Josh's patch.

I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing
this, but I'd like to see the final version of Denys' comment added on
top of it (with an update to the type and name of the tls argument to
match the changes to sys_clone).

Denys, would you consider submitting a patch adding your comment on top
of the two-patch series I just sent?

Thanks,
Josh Triplett

> --Andy
> 
> >
> > Signed-off-by: Denys Vlasenko 
> > CC: Linus Torvalds 
> > CC: Steven Rostedt 
> > CC: Ingo Molnar 
> > CC: Borislav Petkov 
> > CC: "H. Peter Anvin" 
> > CC: Andy Lutomirski 
> > CC: Oleg Nesterov 
> > CC: Frederic Weisbecker 
> > CC: Alexei Starovoitov 
> > CC: Will Drewry 
> > CC: Kees Cook 
> > CC: x...@kernel.org
> > CC: linux-kernel@vger.kernel.org
> > ---
> >  arch/x86/ia32/ia32entry.S | 6 ++
> >  1 file changed, 2 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> > index 8e72256..0c302d0 100644
> > --- a/arch/x86/ia32/ia32entry.S
> > +++ b/arch/x86/ia32/ia32entry.S
> > @@ -567,11 +567,9 @@ GLOBAL(stub32_clone)
> >  * 32-bit clone API is clone(..., int tls_val, int *child_tidptr).
> >  * 64-bit clone API is clone(..., int *child_tidptr, int tls_val).
> >  * Native 64-bit kernel's sys_clone() implements the latter.
> > -* We need to swap args here. But since tls_val is in fact ignored
> > -* by sys_clone(), we can get away with an assignment
> > -* (arg4 = arg5) instead of a full swap:
> > +* We need to swap args here:
> >  */
> > -   mov %r8, %rcx
> > +   xchg%r8, %rcx
> > jmp ia32_ptregs_common
> >
> > ALIGN
> > --
> > 1.8.1.4
> >
> 
> 
> 
> -- 
> Andy Lutomirski
> AMA Capital Management, LLC
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone

2015-04-22 Thread Josh Triplett
On Wed, Apr 22, 2015 at 09:54:24AM -0700, Andy Lutomirski wrote:
 On Wed, Apr 22, 2015 at 9:40 AM, Denys Vlasenko dvlas...@redhat.com wrote:
  Really swap arguments #4 and #5 in stub32_clone instead of optimizing
  it into a move.
 
  Yes, tls_val is currently unused. Yes, on some CPUs XCHG is a little bit
  more expensive than MOV. But a cycle or two on an expensive syscall like
  clone() is way below noise floor, and obfuscation of logic introduced
  by this optimization is simply not worth it.
 
 Ditto re: Josh's patch.

I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing
this, but I'd like to see the final version of Denys' comment added on
top of it (with an update to the type and name of the tls argument to
match the changes to sys_clone).

Denys, would you consider submitting a patch adding your comment on top
of the two-patch series I just sent?

Thanks,
Josh Triplett

 --Andy
 
 
  Signed-off-by: Denys Vlasenko dvlas...@redhat.com
  CC: Linus Torvalds torva...@linux-foundation.org
  CC: Steven Rostedt rost...@goodmis.org
  CC: Ingo Molnar mi...@kernel.org
  CC: Borislav Petkov b...@alien8.de
  CC: H. Peter Anvin h...@zytor.com
  CC: Andy Lutomirski l...@amacapital.net
  CC: Oleg Nesterov o...@redhat.com
  CC: Frederic Weisbecker fweis...@gmail.com
  CC: Alexei Starovoitov a...@plumgrid.com
  CC: Will Drewry w...@chromium.org
  CC: Kees Cook keesc...@chromium.org
  CC: x...@kernel.org
  CC: linux-kernel@vger.kernel.org
  ---
   arch/x86/ia32/ia32entry.S | 6 ++
   1 file changed, 2 insertions(+), 4 deletions(-)
 
  diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
  index 8e72256..0c302d0 100644
  --- a/arch/x86/ia32/ia32entry.S
  +++ b/arch/x86/ia32/ia32entry.S
  @@ -567,11 +567,9 @@ GLOBAL(stub32_clone)
   * 32-bit clone API is clone(..., int tls_val, int *child_tidptr).
   * 64-bit clone API is clone(..., int *child_tidptr, int tls_val).
   * Native 64-bit kernel's sys_clone() implements the latter.
  -* We need to swap args here. But since tls_val is in fact ignored
  -* by sys_clone(), we can get away with an assignment
  -* (arg4 = arg5) instead of a full swap:
  +* We need to swap args here:
   */
  -   mov %r8, %rcx
  +   xchg%r8, %rcx
  jmp ia32_ptregs_common
 
  ALIGN
  --
  1.8.1.4
 
 
 
 
 -- 
 Andy Lutomirski
 AMA Capital Management, LLC
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone

2015-04-22 Thread Josh Triplett
On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote:
 On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett j...@joshtriplett.org wrote:
 
  I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing
  this
 
 Ugh, I absolutely detesrt that patch.
 
 Don't make random crazy function signatures that depend on some config
 option. That's just evil. The patch is a mess of #ifdef's and should
 be shot in the head and staked with a silver stake to  make sure it
 never re-appears.
 
 Either:
 
  (a) make the change for every architecture
 
  (b) have side-by-side interfaces. With different names!

...that's exactly what I did.  They're called copy_thread and
copy_thread_tls; I very intentionally did not conditionally change the
signature of copy_thread, for exactly that reason.  Those functions are
implemented in architecture-specific code, so the config option just
specifies which of the two functions the architecture provides.

*sys_clone* has different function signatures based on config options,
but I didn't touch that other than fixing the type of the tls argument.
That's historical baggage that we can't throw away without breaking
userspace.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

2015-04-21 Thread Josh Triplett
For 32-bit userspace on a 64-bit kernel, this requires modifying
stub32_clone to actually swap the appropriate arguments to match
CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls
broken.

Signed-off-by: Josh Triplett 
Signed-off-by: Thiago Macieira 
Acked-by: Andy Lutomirski 
---
 arch/x86/Kconfig | 1 +
 arch/x86/ia32/ia32entry.S| 2 +-
 arch/x86/kernel/process_32.c | 6 +++---
 arch/x86/kernel/process_64.c | 8 
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..4960b0d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,6 +124,7 @@ config X86
select MODULES_USE_ELF_REL if X86_32
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
+   select HAVE_COPY_THREAD_TLS
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebca..0286735 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -487,7 +487,7 @@ GLOBAL(\label)
ALIGN
 GLOBAL(stub32_clone)
leaq sys_clone(%rip),%rax
-   mov %r8, %rcx
+   xchg %r8, %rcx
jmp  ia32_ptregs_common 
 
ALIGN
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f9..ead28ff 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 */
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs->si, 0);
+   (struct user_desc __user *)tls, 0);
 
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43..c69cabc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int 
tls)
return get_desc_base(>thread.tls_array[tls]);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
int err;
struct pt_regs *childregs;
@@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp,
 #ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs->si, 0);
+   (struct user_desc __user *)tls, 0);
else
 #endif
-   err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+   err = do_arch_prctl(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-04-21 Thread Josh Triplett
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread.  sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass
along that argument.  Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes
tls in).

Apart from being awful and inscrutable, that also only works because
only one code path into copy_thread can pass the CLONE_SETTLS flag, and
that code path comes from sys_clone with its architecture-specific
argument-passing order.  This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.

However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.

Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
into, and a new copy_thread_tls that accepts the tls parameter as an
additional unsigned long (syscall-argument-sized) argument.
Change sys_clone's tls argument to an unsigned long (which does
not change the ABI), and pass that down to copy_thread_tls.

Architectures that don't opt into copy_thread_tls will continue to
ignore the C argument to sys_clone in favor of the pt_regs captured at
kernel entry, and thus will be unable to introduce new versions of the
clone syscall.

Signed-off-by: Josh Triplett 
Signed-off-by: Thiago Macieira 
Acked-by: Andy Lutomirski 
---
 arch/Kconfig |  7 ++
 include/linux/sched.h| 14 
 include/linux/syscalls.h |  6 +++---
 kernel/fork.c| 55 +++-
 4 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a..4834a58 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
  This spares a stack switch and improves cache usage on softirq
  processing.
 
+config HAVE_COPY_THREAD_TLS
+   bool
+   help
+ Architecture provides copy_thread_tls to accept tls argument via
+ normal C parameter passing, rather than extracting the syscall
+ argument from pt_regs.
+
 #
 # ABI hall of shame
 #
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a419b65..2cc88c6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct 
*task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
 
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+   struct task_struct *, unsigned long);
+#else
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+   unsigned long clone_flags, unsigned long sp, unsigned long arg,
+   struct task_struct *p, unsigned long tls)
+{
+   return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
 extern void flush_thread(void);
 extern void exit_thread(void);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38..bb51bec 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_fork(void);
 asmlinkage long sys_vfork(void);
 #ifdef CONFIG_CLONE_BACKWARDS
-asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
+asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned 
long,
   int __user *);
 #else
 #ifdef CONFIG_CLONE_BACKWARDS3
 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
- int __user *, int);
+ int __user *, unsigned long);
 #else
 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
-  int __user *, int);
+  int __user *, unsigned long);
 #endif
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..b3dadf4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
-   int trace

[PATCH 0/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-04-21 Thread Josh Triplett
clone has some of the quirkiest syscall handling in the kernel, with a pile of
special cases, historical curiosities, and architecture-specific calling
conventions.  In particular, clone with CLONE_SETTLS accepts a parameter "tls"
that the C entry point completely ignores and some assembly entry points
overwrite; instead, the low-level arch-specific code pulls the tls parameter
out of the arch-specific register captured as part of pt_regs on entry to the
kernel.  That's a massive hack, and it makes the arch-specific code only work
when called via the specific existing syscall entry points; because of this
hack, any new clone-like system call would have to accept an identical tls
argument in exactly the same arch-specific position, rather than providing a
unified system call entry point across architectures.

The first patch allows architectures to handle the tls argument via normal C
parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS.  The
second patch makes 32-bit and 64-bit x86 opt into this.

These two patches came out of the clone4 series, which isn't ready for this
merge window, but these first two cleanup patches were entirely uncontroversial
and have acks.  I'd like to go ahead and submit these two so that other
architectures can begin building on top of this and opting into
HAVE_COPY_THREAD_TLS.  However, I'm also happy to wait and send these through
the next merge window (along with v3 of clone4) if anyone would prefer that.

Josh Triplett (2):
  clone: Support passing tls argument via C rather than pt_regs magic
  x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

 arch/Kconfig |  7 ++
 arch/x86/Kconfig |  1 +
 arch/x86/ia32/ia32entry.S|  2 +-
 arch/x86/kernel/process_32.c |  6 ++---
 arch/x86/kernel/process_64.c |  8 +++
 include/linux/sched.h| 14 +++
 include/linux/syscalls.h |  6 ++---
 kernel/fork.c| 55 +---
 8 files changed, 69 insertions(+), 30 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-04-21 Thread Josh Triplett
clone has some of the quirkiest syscall handling in the kernel, with a pile of
special cases, historical curiosities, and architecture-specific calling
conventions.  In particular, clone with CLONE_SETTLS accepts a parameter tls
that the C entry point completely ignores and some assembly entry points
overwrite; instead, the low-level arch-specific code pulls the tls parameter
out of the arch-specific register captured as part of pt_regs on entry to the
kernel.  That's a massive hack, and it makes the arch-specific code only work
when called via the specific existing syscall entry points; because of this
hack, any new clone-like system call would have to accept an identical tls
argument in exactly the same arch-specific position, rather than providing a
unified system call entry point across architectures.

The first patch allows architectures to handle the tls argument via normal C
parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS.  The
second patch makes 32-bit and 64-bit x86 opt into this.

These two patches came out of the clone4 series, which isn't ready for this
merge window, but these first two cleanup patches were entirely uncontroversial
and have acks.  I'd like to go ahead and submit these two so that other
architectures can begin building on top of this and opting into
HAVE_COPY_THREAD_TLS.  However, I'm also happy to wait and send these through
the next merge window (along with v3 of clone4) if anyone would prefer that.

Josh Triplett (2):
  clone: Support passing tls argument via C rather than pt_regs magic
  x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

 arch/Kconfig |  7 ++
 arch/x86/Kconfig |  1 +
 arch/x86/ia32/ia32entry.S|  2 +-
 arch/x86/kernel/process_32.c |  6 ++---
 arch/x86/kernel/process_64.c |  8 +++
 include/linux/sched.h| 14 +++
 include/linux/syscalls.h |  6 ++---
 kernel/fork.c| 55 +---
 8 files changed, 69 insertions(+), 30 deletions(-)

-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

2015-04-21 Thread Josh Triplett
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread.  sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass
along that argument.  Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes
tls in).

Apart from being awful and inscrutable, that also only works because
only one code path into copy_thread can pass the CLONE_SETTLS flag, and
that code path comes from sys_clone with its architecture-specific
argument-passing order.  This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.

However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.

Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
into, and a new copy_thread_tls that accepts the tls parameter as an
additional unsigned long (syscall-argument-sized) argument.
Change sys_clone's tls argument to an unsigned long (which does
not change the ABI), and pass that down to copy_thread_tls.

Architectures that don't opt into copy_thread_tls will continue to
ignore the C argument to sys_clone in favor of the pt_regs captured at
kernel entry, and thus will be unable to introduce new versions of the
clone syscall.

Signed-off-by: Josh Triplett j...@joshtriplett.org
Signed-off-by: Thiago Macieira thiago.macie...@intel.com
Acked-by: Andy Lutomirski l...@kernel.org
---
 arch/Kconfig |  7 ++
 include/linux/sched.h| 14 
 include/linux/syscalls.h |  6 +++---
 kernel/fork.c| 55 +++-
 4 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 05d7a8a..4834a58 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
  This spares a stack switch and improves cache usage on softirq
  processing.
 
+config HAVE_COPY_THREAD_TLS
+   bool
+   help
+ Architecture provides copy_thread_tls to accept tls argument via
+ normal C parameter passing, rather than extracting the syscall
+ argument from pt_regs.
+
 #
 # ABI hall of shame
 #
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a419b65..2cc88c6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct 
*task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
 
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+   struct task_struct *, unsigned long);
+#else
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+   unsigned long clone_flags, unsigned long sp, unsigned long arg,
+   struct task_struct *p, unsigned long tls)
+{
+   return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
 extern void flush_thread(void);
 extern void exit_thread(void);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38..bb51bec 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_fork(void);
 asmlinkage long sys_vfork(void);
 #ifdef CONFIG_CLONE_BACKWARDS
-asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
+asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned 
long,
   int __user *);
 #else
 #ifdef CONFIG_CLONE_BACKWARDS3
 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
- int __user *, int);
+ int __user *, unsigned long);
 #else
 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
-  int __user *, int);
+  int __user *, unsigned long);
 #endif
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..b3dadf4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid

[PATCH 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit

2015-04-21 Thread Josh Triplett
For 32-bit userspace on a 64-bit kernel, this requires modifying
stub32_clone to actually swap the appropriate arguments to match
CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls
broken.

Signed-off-by: Josh Triplett j...@joshtriplett.org
Signed-off-by: Thiago Macieira thiago.macie...@intel.com
Acked-by: Andy Lutomirski l...@kernel.org
---
 arch/x86/Kconfig | 1 +
 arch/x86/ia32/ia32entry.S| 2 +-
 arch/x86/kernel/process_32.c | 6 +++---
 arch/x86/kernel/process_64.c | 8 
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..4960b0d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -124,6 +124,7 @@ config X86
select MODULES_USE_ELF_REL if X86_32
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
+   select HAVE_COPY_THREAD_TLS
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebca..0286735 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -487,7 +487,7 @@ GLOBAL(\label)
ALIGN
 GLOBAL(stub32_clone)
leaq sys_clone(%rip),%rax
-   mov %r8, %rcx
+   xchg %r8, %rcx
jmp  ia32_ptregs_common 
 
ALIGN
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f9..ead28ff 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
@@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 */
if (clone_flags  CLONE_SETTLS)
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs-si, 0);
+   (struct user_desc __user *)tls, 0);
 
if (err  p-thread.io_bitmap_ptr) {
kfree(p-thread.io_bitmap_ptr);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43..c69cabc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int 
tls)
return get_desc_base(t-thread.tls_array[tls]);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp,
-   unsigned long arg, struct task_struct *p)
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+   unsigned long arg, struct task_struct *p, unsigned long tls)
 {
int err;
struct pt_regs *childregs;
@@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp,
 #ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
-   (struct user_desc __user *)childregs-si, 0);
+   (struct user_desc __user *)tls, 0);
else
 #endif
-   err = do_arch_prctl(p, ARCH_SET_FS, childregs-r8);
+   err = do_arch_prctl(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
-- 
2.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: Align jump targets to 1 byte boundaries

2015-04-11 Thread Josh Triplett
On Fri, Apr 10, 2015 at 07:10:08AM -0700, Paul E. McKenney wrote:
> On Fri, Apr 10, 2015 at 02:08:46PM +0200, Ingo Molnar wrote:
> > * Ingo Molnar  wrote:
[...]
> > Btw., totally off topic, the following NOP caught my attention:
> > 
> > >   5a: 66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
> > 
> > That's a dead NOP that boats the function a bit, added for the 16 byte 
> > alignment of one of the jump targets.
> > 
> > I realize that x86 CPU manufacturers recommend 16-byte jump target 
> > alignments (it's in the Intel optimization manual), but the cost of 
> > that is very significant:
> > 
> > text   data   bss dec  filename
> > 125663911617840   108953615273767  vmlinux.align.16-byte
> > 122249511617840   108953614932327  vmlinux.align.1-byte
> > 
> > By using 1 byte jump target alignment (i.e. no alignment at all) we 
> > get an almost 3% reduction in kernel size (!) - and a probably similar 
> > reduction in I$ footprint.
> > 
> > So I'm wondering, is the 16 byte jump target optimization suggestion 
> > really worth this price? The patch below boots fine and I've not 
> > measured any noticeable slowdown, but I've not tried hard.
> 
> Good point, adding Josh Triplett on CC.  I suspect that he might be
> interested.  ;-)

Quite interested, yes.  Even if there *are* benchmarks to support
keeping the optimization (which wouldn't surprise me), it'd be nice to
have a Kconfig option to enable the jump-target optimization.  (With 'y'
meaning "pad jump targets to 16 bytes", so that allnoconfig and
tinyconfig automatically don't.)

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86: Align jump targets to 1 byte boundaries

2015-04-11 Thread Josh Triplett
On Fri, Apr 10, 2015 at 07:10:08AM -0700, Paul E. McKenney wrote:
 On Fri, Apr 10, 2015 at 02:08:46PM +0200, Ingo Molnar wrote:
  * Ingo Molnar mi...@kernel.org wrote:
[...]
  Btw., totally off topic, the following NOP caught my attention:
  
 5a: 66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
  
  That's a dead NOP that boats the function a bit, added for the 16 byte 
  alignment of one of the jump targets.
  
  I realize that x86 CPU manufacturers recommend 16-byte jump target 
  alignments (it's in the Intel optimization manual), but the cost of 
  that is very significant:
  
  text   data   bss dec  filename
  125663911617840   108953615273767  vmlinux.align.16-byte
  122249511617840   108953614932327  vmlinux.align.1-byte
  
  By using 1 byte jump target alignment (i.e. no alignment at all) we 
  get an almost 3% reduction in kernel size (!) - and a probably similar 
  reduction in I$ footprint.
  
  So I'm wondering, is the 16 byte jump target optimization suggestion 
  really worth this price? The patch below boots fine and I've not 
  measured any noticeable slowdown, but I've not tried hard.
 
 Good point, adding Josh Triplett on CC.  I suspect that he might be
 interested.  ;-)

Quite interested, yes.  Even if there *are* benchmarks to support
keeping the optimization (which wouldn't surprise me), it'd be nice to
have a Kconfig option to enable the jump-target optimization.  (With 'y'
meaning pad jump targets to 16 bytes, so that allnoconfig and
tinyconfig automatically don't.)

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor

2015-04-08 Thread Josh Triplett
On Wed, Apr 01, 2015 at 09:24:20AM +0200, Jonathan Corbet wrote:
> On Tue, 31 Mar 2015 15:02:24 -0700
> j...@joshtriplett.org wrote:
> 
> > > This would appear to assume that a clonefd_info structure is the only
> > > thing that will ever be read from this descriptor.  It seems to me that
> > > there is the potential for, someday, wanting to be able to read and write
> > > other things as well.  Should this structure be marked with type and
> > > length fields so that other structures could be added in the future?  
> > 
> > I don't think it makes sense for a caller to get an arbitrary structure
> > on read(), and have to figure out what they got and ignore something
> > they don't understand.  Instead, I think it makes more sense for the
> > caller to say "Hey, here's a flag saying I understand the new thing, go
> > ahead and give me the new thing".  So, for instance, if you want to
> > receive SIGSTOP/SIGCONT messages for child processes through this
> > descriptor, we could add a flag for that.
> 
> The flag is fine, but, once we have set that flag saying we want those
> messages, how do we know which type of structure we've gotten?  That's
> the piece of the puzzle I'm missing, sorry if I'm being overly slow.

If you pass a flag saying you can handle a new set of potential
structures, those structures can then include any necessary
disambiguating flags/IDs/etc.  No need for them to match the current
clonefd_info structure if userspace has opted into a new version.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor

2015-04-08 Thread Josh Triplett
On Wed, Apr 01, 2015 at 09:24:20AM +0200, Jonathan Corbet wrote:
 On Tue, 31 Mar 2015 15:02:24 -0700
 j...@joshtriplett.org wrote:
 
   This would appear to assume that a clonefd_info structure is the only
   thing that will ever be read from this descriptor.  It seems to me that
   there is the potential for, someday, wanting to be able to read and write
   other things as well.  Should this structure be marked with type and
   length fields so that other structures could be added in the future?  
  
  I don't think it makes sense for a caller to get an arbitrary structure
  on read(), and have to figure out what they got and ignore something
  they don't understand.  Instead, I think it makes more sense for the
  caller to say Hey, here's a flag saying I understand the new thing, go
  ahead and give me the new thing.  So, for instance, if you want to
  receive SIGSTOP/SIGCONT messages for child processes through this
  descriptor, we could add a flag for that.
 
 The flag is fine, but, once we have set that flag saying we want those
 messages, how do we know which type of structure we've gotten?  That's
 the piece of the puzzle I'm missing, sorry if I'm being overly slow.

If you pass a flag saying you can handle a new set of potential
structures, those structures can then include any necessary
disambiguating flags/IDs/etc.  No need for them to match the current
clonefd_info structure if userspace has opted into a new version.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd

2015-04-06 Thread Josh Triplett
On Mon, Apr 06, 2015 at 05:30:35PM +0900, Sergey Senozhatsky wrote:
> On (03/15/15 01:00), Josh Triplett wrote:
> [..]
> > +
> > +/* Handle the CLONE_FD case for copy_process. */
> > +int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
> > +struct clone4_args *args, struct clonefd_setup *setup)
> > +{
> > +   int flags;
> > +   struct file *file;
> > +   int fd;
> > +
> > +   p->clonefd = !!(clone_flags & CLONE_FD);
> > +   if (!p->clonefd)
> > +   return 0;
> > +
> > +   if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK))
> > +   return -EINVAL;
> > +
> > +   init_waitqueue_head(>clonefd_wqh);
> > +
> > +   get_task_struct(p);
> > +   flags = O_RDONLY | FMODE_ATOMIC_POS | args->clonefd_flags;
> > +   file = anon_inode_getfile("[process]", _fops, p, flags);
> > +   if (IS_ERR(file)) {
> > +   put_task_struct(p);
> > +   return PTR_ERR(file);
> > +   }
> > +
> > +   fd = get_unused_fd_flags(flags);
> > +   if (fd < 0) {
> 
> + put_task_struct(p); ?

No, once anon_inode_getfile has succeeded, the file owns the reference
to the task_struct, so fput(file) will call the release function which
calls put_task_struct.  Only the failure case for anon_inode_getfile
needs to call put_task_struct directly.

> > +   fput(file);
> > +   return fd;
> > +   }

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd

2015-04-06 Thread Josh Triplett
On Mon, Apr 06, 2015 at 05:30:35PM +0900, Sergey Senozhatsky wrote:
 On (03/15/15 01:00), Josh Triplett wrote:
 [..]
  +
  +/* Handle the CLONE_FD case for copy_process. */
  +int clonefd_do_clone(u64 clone_flags, struct task_struct *p,
  +struct clone4_args *args, struct clonefd_setup *setup)
  +{
  +   int flags;
  +   struct file *file;
  +   int fd;
  +
  +   p-clonefd = !!(clone_flags  CLONE_FD);
  +   if (!p-clonefd)
  +   return 0;
  +
  +   if (args-clonefd_flags  ~(O_CLOEXEC | O_NONBLOCK))
  +   return -EINVAL;
  +
  +   init_waitqueue_head(p-clonefd_wqh);
  +
  +   get_task_struct(p);
  +   flags = O_RDONLY | FMODE_ATOMIC_POS | args-clonefd_flags;
  +   file = anon_inode_getfile([process], clonefd_fops, p, flags);
  +   if (IS_ERR(file)) {
  +   put_task_struct(p);
  +   return PTR_ERR(file);
  +   }
  +
  +   fd = get_unused_fd_flags(flags);
  +   if (fd  0) {
 
 + put_task_struct(p); ?

No, once anon_inode_getfile has succeeded, the file owns the reference
to the task_struct, so fput(file) will call the release function which
calls put_task_struct.  Only the failure case for anon_inode_getfile
needs to call put_task_struct directly.

  +   fput(file);
  +   return fd;
  +   }

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd

2015-03-25 Thread Josh Triplett
On Mon, Mar 23, 2015 at 05:38:45PM +, David Drysdale wrote:
> On Sun, Mar 15, 2015 at 8:00 AM, Josh Triplett  wrote:
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 9daa017..1dc680b 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1374,6 +1374,11 @@ struct task_struct {
> >
> > unsigned autoreap:1; /* Do not become a zombie on exit */
> >
> > +#ifdef CONFIG_CLONEFD
> > +   unsigned clonefd:1; /* Notify clonefd_wqh on exit */
> > +   wait_queue_head_t clonefd_wqh;
> > +#endif
> > +
> > unsigned long atomic_flags; /* Flags needing atomic access. */
> >
> > struct restart_block restart_block;
> 
> Idle thought: are there any concerns about the occupancy
> impact of adding a wait_queue_head to every task_struct,
> whether it has a clonefd or not?
> 
> I guess we could reduce the size somewhat by just
> storing a struct file *clonefd_file in the task, and then have
> a separate structure (with the wqh and a task_struct*) referenced
> by file->private_data.  Not sure whether the added complication
> would be worthwhile, though.

My original patches did exactly that (minus the reference back to the
task_struct).  However, there are a couple of problems with that
approach.  First, it assumes that a task_struct has only a single file
referencing it, but in the future I'd like to support obtaining a
clonefd for an existing task.  Second, the task_struct really shouldn't
have a reference to the actual struct file, when it only needs the
wait_queue_head_t.

Also, AFAICT a wait_queue_head_t is normally (in the absence of kernel
lock debugging options) the size of two pointers.  Adding an indirection
and an extra allocation to change that to the size of one pointer seems
iffy, especially when looking at the rest of what's directly in
task_struct that's far larger.

> > --- /dev/null
> > +++ b/kernel/clonefd.c
> > @@ -0,0 +1,121 @@
> > +/*
> > + * Support functions for CLONE_FD
> > + *
> > + * Copyright (c) 2015 Intel Corporation
> > + * Original authors: Josh Triplett 
> > + *   Thiago Macieira 
> > + */
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include "clonefd.h"
> > +
> > +static int clonefd_release(struct inode *inode, struct file *file)
> > +{
> > +   put_task_struct(file->private_data);
> > +   return 0;
> > +}
> > +
> > +static unsigned int clonefd_poll(struct file *file, poll_table *wait)
> > +{
> > +   struct task_struct *p = file->private_data;
> > +   poll_wait(file, >clonefd_wqh, wait);
> > +   return p->exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0;
> > +}
> > +
> > +static ssize_t clonefd_read(struct file *file, char __user *buf, size_t 
> > count, loff_t *ppos)
> > +{
> > +   struct task_struct *p = file->private_data;
> > +   int ret = 0;
> > +
> > +   /* EOF after first read */
> > +   if (*ppos)
> > +   return 0;
> > +
> > +   if (file->f_flags & O_NONBLOCK)
> > +   ret = -EAGAIN;
> > +   else
> > +   ret = wait_event_interruptible(p->clonefd_wqh, 
> > p->exit_state);
> > +
> > +   if (p->exit_state) {
> > +   struct clonefd_info info = {};
> > +   cputime_t utime, stime;
> > +   task_exit_code_status(p->exit_code, , 
> > );
> > +   info.code &= ~__SI_MASK;
> > +   task_cputime(p, , );
> > +   info.utime = cputime_to_clock_t(utime + p->signal->utime);
> > +   info.stime = cputime_to_clock_t(stime + p->signal->stime);
> > +   ret = simple_read_from_buffer(buf, count, ppos, , 
> > sizeof(info));
> > +   }
> > +   return ret;
> > +}
> > +
> > +static struct file_operations clonefd_fops = {
> > +   .release = clonefd_release,
> > +   .poll = clonefd_poll,
> > +   .read = clonefd_read,
> > +   .llseek = no_llseek,
> > +};
> 
> It might be nice to include a show_fdinfo() implementation that shows
> (say) the pid that the clonefd refers to.  E.g. something like:
> 
> static void clonefd_show_fdinfo(struct seq_file *m, struct file *file)
> {
> struct task_struct *p = file->private_data;
> 
> seq_printf(m, "tid:\t%d\n", task_tgid_vnr(p));
> }

I thought about that, but that would add a couple of additional ifdefs
(C

Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd

2015-03-25 Thread Josh Triplett
On Mon, Mar 23, 2015 at 05:38:45PM +, David Drysdale wrote:
 On Sun, Mar 15, 2015 at 8:00 AM, Josh Triplett j...@joshtriplett.org wrote:
  diff --git a/include/linux/sched.h b/include/linux/sched.h
  index 9daa017..1dc680b 100644
  --- a/include/linux/sched.h
  +++ b/include/linux/sched.h
  @@ -1374,6 +1374,11 @@ struct task_struct {
 
  unsigned autoreap:1; /* Do not become a zombie on exit */
 
  +#ifdef CONFIG_CLONEFD
  +   unsigned clonefd:1; /* Notify clonefd_wqh on exit */
  +   wait_queue_head_t clonefd_wqh;
  +#endif
  +
  unsigned long atomic_flags; /* Flags needing atomic access. */
 
  struct restart_block restart_block;
 
 Idle thought: are there any concerns about the occupancy
 impact of adding a wait_queue_head to every task_struct,
 whether it has a clonefd or not?
 
 I guess we could reduce the size somewhat by just
 storing a struct file *clonefd_file in the task, and then have
 a separate structure (with the wqh and a task_struct*) referenced
 by file-private_data.  Not sure whether the added complication
 would be worthwhile, though.

My original patches did exactly that (minus the reference back to the
task_struct).  However, there are a couple of problems with that
approach.  First, it assumes that a task_struct has only a single file
referencing it, but in the future I'd like to support obtaining a
clonefd for an existing task.  Second, the task_struct really shouldn't
have a reference to the actual struct file, when it only needs the
wait_queue_head_t.

Also, AFAICT a wait_queue_head_t is normally (in the absence of kernel
lock debugging options) the size of two pointers.  Adding an indirection
and an extra allocation to change that to the size of one pointer seems
iffy, especially when looking at the rest of what's directly in
task_struct that's far larger.

  --- /dev/null
  +++ b/kernel/clonefd.c
  @@ -0,0 +1,121 @@
  +/*
  + * Support functions for CLONE_FD
  + *
  + * Copyright (c) 2015 Intel Corporation
  + * Original authors: Josh Triplett j...@joshtriplett.org
  + *   Thiago Macieira thi...@macieira.org
  + */
  +#include linux/anon_inodes.h
  +#include linux/file.h
  +#include linux/fs.h
  +#include linux/poll.h
  +#include linux/slab.h
  +#include clonefd.h
  +
  +static int clonefd_release(struct inode *inode, struct file *file)
  +{
  +   put_task_struct(file-private_data);
  +   return 0;
  +}
  +
  +static unsigned int clonefd_poll(struct file *file, poll_table *wait)
  +{
  +   struct task_struct *p = file-private_data;
  +   poll_wait(file, p-clonefd_wqh, wait);
  +   return p-exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0;
  +}
  +
  +static ssize_t clonefd_read(struct file *file, char __user *buf, size_t 
  count, loff_t *ppos)
  +{
  +   struct task_struct *p = file-private_data;
  +   int ret = 0;
  +
  +   /* EOF after first read */
  +   if (*ppos)
  +   return 0;
  +
  +   if (file-f_flags  O_NONBLOCK)
  +   ret = -EAGAIN;
  +   else
  +   ret = wait_event_interruptible(p-clonefd_wqh, 
  p-exit_state);
  +
  +   if (p-exit_state) {
  +   struct clonefd_info info = {};
  +   cputime_t utime, stime;
  +   task_exit_code_status(p-exit_code, info.code, 
  info.status);
  +   info.code = ~__SI_MASK;
  +   task_cputime(p, utime, stime);
  +   info.utime = cputime_to_clock_t(utime + p-signal-utime);
  +   info.stime = cputime_to_clock_t(stime + p-signal-stime);
  +   ret = simple_read_from_buffer(buf, count, ppos, info, 
  sizeof(info));
  +   }
  +   return ret;
  +}
  +
  +static struct file_operations clonefd_fops = {
  +   .release = clonefd_release,
  +   .poll = clonefd_poll,
  +   .read = clonefd_read,
  +   .llseek = no_llseek,
  +};
 
 It might be nice to include a show_fdinfo() implementation that shows
 (say) the pid that the clonefd refers to.  E.g. something like:
 
 static void clonefd_show_fdinfo(struct seq_file *m, struct file *file)
 {
 struct task_struct *p = file-private_data;
 
 seq_printf(m, tid:\t%d\n, task_tgid_vnr(p));
 }

I thought about that, but that would add a couple of additional ifdefs
(CONFIG_PROC_FS), for an informational file of minimal value.  More
importantly, I don't want to add that until after adding an ioctl or
similar to programmatically obtain the pid from a clonefd; otherwise,
someone might try to use fdinfo as the API to do so, which would be
all kinds of awful.

So I'd prefer to add fdinfo in a future extension of clonefd, rather
than in the initial patch series.

  +
  +/* Do process exit notification for clonefd. */
  +void clonefd_do_notify(struct task_struct *p)
  +{
  +   if (p-clonefd)
  +   wake_up_all(p-clonefd_wqh);
  +}
  +
  +/* Handle the CLONE_FD case for copy_process. */
  +int clonefd_do_clone(u64 clone_flags, struct task_struct

Re: [PATCH] reiserfs: kstrdup() memory handling

2015-03-22 Thread Josh Triplett
On Sun, Mar 22, 2015 at 08:12:46PM -0400, Sanidhya Kashyap wrote:
> On Sat, Mar 21, 2015 at 1:15 PM, Josh Triplett  wrote:
> > On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote:
> >> Checking for ENOMEM even for new_opts in reiserfs_remount function as
> >> there is a possibility of nothing being allocated.
> >
> > You don't need to add a new label; kfree(NULL) is a no-op.
> >
> 
> Shall I resubmit the patch?

I would suggest doing so, yes, after you see if anyone else has feedback
on it.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] reiserfs: kstrdup() memory handling

2015-03-22 Thread Josh Triplett
On Sun, Mar 22, 2015 at 08:12:46PM -0400, Sanidhya Kashyap wrote:
 On Sat, Mar 21, 2015 at 1:15 PM, Josh Triplett j...@joshtriplett.org wrote:
  On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote:
  Checking for ENOMEM even for new_opts in reiserfs_remount function as
  there is a possibility of nothing being allocated.
 
  You don't need to add a new label; kfree(NULL) is a no-op.
 
 
 Shall I resubmit the patch?

I would suggest doing so, yes, after you see if anyone else has feedback
on it.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] reiserfs: kstrdup() memory handling

2015-03-21 Thread Josh Triplett
On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote:
> Checking for ENOMEM even for new_opts in reiserfs_remount function as
> there is a possibility of nothing being allocated.

You don't need to add a new label; kfree(NULL) is a no-op.

> Signed-off-by: Sanidhya Kashyap 
> ---
>  fs/reiserfs/super.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
> index 71fbbe3..bf9bc66 100644
> --- a/fs/reiserfs/super.c
> +++ b/fs/reiserfs/super.c
> @@ -1397,6 +1397,11 @@ static int reiserfs_remount(struct super_block *s, int 
> *mount_flags, char *arg)
>   int i;
>  #endif
>  
> + if (!new_opts) {
> + err = -ENOMEM;
> + goto out_err_no_kfree;
> + }
> +
>   sync_filesystem(s);
>   reiserfs_write_lock(s);
>  
> @@ -1549,6 +1554,7 @@ out_err_unlock:
>   reiserfs_write_unlock(s);
>  out_err:
>   kfree(new_opts);
> +out_err_no_kfree:
>   return err;
>  }
>  
> -- 
> 2.1.0
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] reiserfs: kstrdup() memory handling

2015-03-21 Thread Josh Triplett
On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote:
 Checking for ENOMEM even for new_opts in reiserfs_remount function as
 there is a possibility of nothing being allocated.

You don't need to add a new label; kfree(NULL) is a no-op.

 Signed-off-by: Sanidhya Kashyap sanidhya.gat...@gmail.com
 ---
  fs/reiserfs/super.c | 6 ++
  1 file changed, 6 insertions(+)
 
 diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
 index 71fbbe3..bf9bc66 100644
 --- a/fs/reiserfs/super.c
 +++ b/fs/reiserfs/super.c
 @@ -1397,6 +1397,11 @@ static int reiserfs_remount(struct super_block *s, int 
 *mount_flags, char *arg)
   int i;
  #endif
  
 + if (!new_opts) {
 + err = -ENOMEM;
 + goto out_err_no_kfree;
 + }
 +
   sync_filesystem(s);
   reiserfs_write_lock(s);
  
 @@ -1549,6 +1554,7 @@ out_err_unlock:
   reiserfs_write_unlock(s);
  out_err:
   kfree(new_opts);
 +out_err_no_kfree:
   return err;
  }
  
 -- 
 2.1.0
 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH v13] sys_membarrier(): system/process-wide memory barrier (x86)

2015-03-17 Thread Josh Triplett
On Tue, Mar 17, 2015 at 07:01:23PM +, Mathieu Desnoyers wrote:
> - Original Message -
> > On Tue, Mar 17, 2015 at 10:57:50AM -0700, j...@joshtriplett.org wrote:
> > > On Tue, Mar 17, 2015 at 06:30:35PM +0100, Peter Zijlstra wrote:
> > > > On Tue, Mar 17, 2015 at 01:22:02PM -0400, Mathieu Desnoyers wrote:
> > > > > Here is an implementation of a new system call, sys_membarrier(), 
> > > > > which
> > > > > executes a memory barrier on either all running threads of the current
> > > > > process (MEMBARRIER_PRIVATE) issues a memory barrier on all threads
> > > > > running on the system (~MEMBARRIER_PRIVATE). Both are currently
> > > > > implemented by calling synchronize_sched().
> > > > 
> > > > Then why bother with the flag?
> > > 
> > > Semantically, MEMBARRIER_PRIVATE is allowed to avoid issuing a barrier
> > > on CPUs not running the current process if it can, while
> > > ~MEMBARRIER_PRIVATE may not.  (The latter would be useful for
> > > applications such as system-wide tracing.)  That they're currently both
> > > implemented the same way doesn't mean they're semantically equivalent.
> > 
> > Sure; but why bother with pointless fluff like that? We can always
> > introduce the private flag if and when it starts to make sense having
> > it.
> 
> Without the expedited implementation, the only usefulness of the
> private flag is to skip synchronize_sched() if called from a
> single-threaded process.
> 
> We could easily argue that if a process is using sys_membarrier in
> the first place, it's very likely that it is multithreaded. So I
> agree that we can drop the flag for now, and add it later on,
> e.g. when adding the expedited mode.
> 
> I am tempted to leave the "flags" argument in place though, along
> with the "MEMBARRIER_QUERY" flag. Thoughts ?

You should definitely *always* supply a flags argument with any new
syscall, even if you have no flags yet.

As for QUERY, I assume that exists because the more expensive forms of
barriers (e.g. expedited) are sufficiently invasive that you don't want
to trigger one when not needed just to test if they work?  If so, then
yeah, having that flag seems fine.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH v13] sys_membarrier(): system/process-wide memory barrier (x86)

2015-03-17 Thread Josh Triplett
On Tue, Mar 17, 2015 at 07:01:23PM +, Mathieu Desnoyers wrote:
 - Original Message -
  On Tue, Mar 17, 2015 at 10:57:50AM -0700, j...@joshtriplett.org wrote:
   On Tue, Mar 17, 2015 at 06:30:35PM +0100, Peter Zijlstra wrote:
On Tue, Mar 17, 2015 at 01:22:02PM -0400, Mathieu Desnoyers wrote:
 Here is an implementation of a new system call, sys_membarrier(), 
 which
 executes a memory barrier on either all running threads of the current
 process (MEMBARRIER_PRIVATE) issues a memory barrier on all threads
 running on the system (~MEMBARRIER_PRIVATE). Both are currently
 implemented by calling synchronize_sched().

Then why bother with the flag?
   
   Semantically, MEMBARRIER_PRIVATE is allowed to avoid issuing a barrier
   on CPUs not running the current process if it can, while
   ~MEMBARRIER_PRIVATE may not.  (The latter would be useful for
   applications such as system-wide tracing.)  That they're currently both
   implemented the same way doesn't mean they're semantically equivalent.
  
  Sure; but why bother with pointless fluff like that? We can always
  introduce the private flag if and when it starts to make sense having
  it.
 
 Without the expedited implementation, the only usefulness of the
 private flag is to skip synchronize_sched() if called from a
 single-threaded process.
 
 We could easily argue that if a process is using sys_membarrier in
 the first place, it's very likely that it is multithreaded. So I
 agree that we can drop the flag for now, and add it later on,
 e.g. when adding the expedited mode.
 
 I am tempted to leave the flags argument in place though, along
 with the MEMBARRIER_QUERY flag. Thoughts ?

You should definitely *always* supply a flags argument with any new
syscall, even if you have no flags yet.

As for QUERY, I assume that exists because the more expensive forms of
barriers (e.g. expedited) are sufficiently invasive that you don't want
to trigger one when not needed just to test if they work?  If so, then
yeah, having that flag seems fine.

- Josh Triplett
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] sys_membarrier(): system/process-wide memory barrier (x86) (v12)

2015-03-15 Thread Josh Triplett
On Sun, Mar 15, 2015 at 03:24:19PM -0400, Mathieu Desnoyers wrote:
> Here is an implementation of a new system call, sys_membarrier(), which
> executes a memory barrier on either all running threads of the current
> process (MEMBARRIER_PRIVATE_FLAG) or calls synchronize_sched() to issue
> a memory barrier on all threads running on the system. It can be used to
> distribute the cost of user-space memory barriers asymmetrically by
> transforming pairs of memory barriers into pairs consisting of
> sys_membarrier() and a compiler barrier. For synchronization primitives
> that distinguish between read-side and write-side (e.g. userspace RCU,
> rwlocks), the read-side can be accelerated significantly by moving the
> bulk of the memory barrier overhead to the write-side.

>From a quick review, this seems quite reasonable (as it did 5 years
ago).

One request: Could you please add a config option (default y) in the
EXPERT menu to disable this?  You actually seem to already have it
marked as a cond_syscall.

Also, a very minor nit: flags in kernel APIs aren't typically named with
a _FLAG suffix.

With the syscall made optional, and with or without that naming nit
fixed:
Reviewed-by: Josh Triplett 

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 5/7] clone4: Add a CLONE_AUTOREAP flag to automatically reap the child process

2015-03-15 Thread Josh Triplett
On Sun, Mar 15, 2015 at 08:55:06PM +0100, Oleg Nesterov wrote:
> On 03/15, Josh Triplett wrote:
> > On Sun, Mar 15, 2015 at 03:52:23PM +0100, Oleg Nesterov wrote:
> > > On 03/15, Josh Triplett wrote:
> > > > Add a CLONE_AUTOREAP flag to request this behavior unconditionally,
> > >
> > > Yes, CLONE_AUTOREAP is much better. And I agree (mostly) with that
> > > we should rely on do_notify_parent().
> > >
> > > Howver the patch still doesn't look right. First of all, ->autoreap
> > > should be per-process, not per-thread.
> >
> > Ah, you're thinking of the case where the parent process launches a
> > ...
> 
> Not really, although we probably need more sanity checks.
> 
> It should be per-process simply because this "autoreap" affects the whole
> process. And the sub-threads are already "autoreap". And these 2 autoreap's
> semantics differ, we should not confuse them.

Will the approach I suggested, of having clones with CLONE_THREAD
inherit the autoreap value rather than setting it from CLONE_AUTOREAP,
implement the semantics you're looking for?

Also, are you suggesting that CLONE_AUTOREAP with CLONE_THREAD should
produce -EINVAL, or just that it should be ignored?

> > (As an aside, what *is* the use case for CLONE_PARENT without
> > CLONE_THREAD?)
> 
> To me CLONE_PARENT is another historical mistake and the source of misc
> problems ;)

I kinda figured. :)

> > > And there are ptrace/mt issues,
> > > it seems. Just for example, we should avoid EXIT_TRACE if autoreap in
> > > wait_task_zombie() even if we are going to re-notify parent.
> >
> > I don't see how EXIT_TRACE can happen in wait_task_zombie if autoreap is
> > set.  wait_task_zombie does a cmpxchg with exit_state and doesn't
> > proceed unless exit_state was EXIT_ZOMBIE, and I don't see how we can
> > ever reach the EXIT_ZOMBIE state if autoreap.
> 
> Because you again forgot about ptrace ;)
> 
> Josh. Let me try to summarise this later when I have time. Again, I am
> not sure, perhaps this is even simpler than I currently think. And let
> me apologize in advance, most probably I will be busy tomorrow.

I look forward to your later review and feedback.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 5/7] clone4: Add a CLONE_AUTOREAP flag to automatically reap the child process

2015-03-15 Thread Josh Triplett
On Sun, Mar 15, 2015 at 03:52:23PM +0100, Oleg Nesterov wrote:
> On 03/15, Josh Triplett wrote:
> > Add a CLONE_AUTOREAP flag to request this behavior unconditionally,
> 
> Yes, CLONE_AUTOREAP is much better. And I agree (mostly) with that
> we should rely on do_notify_parent().
> 
> Howver the patch still doesn't look right. First of all, ->autoreap
> should be per-process, not per-thread.

Ah, you're thinking of the case where the parent process launches a
child with CLONE_AUTOREAP, that child process launches siblings with
CLONE_THREAD and without CLONE_AUTOREAP, and one of those siblings is
the last to exit?  That seems easy enough to handle: instead of setting
->autoreap unconditionally in copy_process, I can set it only in the
non-CLONE_THREAD case, and otherwise let it inherit.  Then every task in
the group will have the same value for autoreap.

(As an aside, what *is* the use case for CLONE_PARENT without
CLONE_THREAD?)

> And there are ptrace/mt issues,
> it seems. Just for example, we should avoid EXIT_TRACE if autoreap in
> wait_task_zombie() even if we are going to re-notify parent.

I don't see how EXIT_TRACE can happen in wait_task_zombie if autoreap is
set.  wait_task_zombie does a cmpxchg with exit_state and doesn't
proceed unless exit_state was EXIT_ZOMBIE, and I don't see how we can
ever reach the EXIT_ZOMBIE state if autoreap.

> EXCEPT: do we really want SIGCHLD from the exiting child? I think we
> do not. I won't really argue though, but this should be discussed and
> documented. IIUC, with your patch it is still sent.

I think we do, yes.  The caller of clone can already specify what signal
they want, including no signal at all.  If they specify a signal
(SIGCHLD or otherwise) along with CLONE_AUTOREAP, we can send that
signal.  I don't think that causes any particular problem.

That's the same semantic you'd get if you have a SIGCHLD handler with
SA_NOCLDWAIT: you'd still get the signal, even though you don't need to
(and can't) wait on the child process.

> Josh, please give me some time to think and re-check, I'll write another
> email next week. I am not sure this is really needed, but it seems to
> me that we need the preparation patch to make this change clear/simple.

I'd appreciate any feedback you can offer on this series, including any
potential subtle interactions with ptrace.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/6] CLONE_FD: Task exit notification via file descriptor

2015-03-15 Thread Josh Triplett
On Sun, Mar 15, 2015 at 10:18:05AM +, David Drysdale wrote:
> On Sat, Mar 14, 2015 at 7:29 PM, Josh Triplett  wrote:
> > On Sat, Mar 14, 2015 at 12:03:12PM -0700, Thiago Macieira wrote:
> >> On Friday 13 March 2015 18:11:32 Thiago Macieira wrote:
> >> > On Friday 13 March 2015 14:51:47 Andy Lutomirski wrote:
> >> > > In any event, we should find out what FreeBSD does in response to
> >> > > read(2) on the fd.
> >> >
> >> > I've just successfully installed FreeBSD and compiled qtbase (main 
> >> > package
> >> > of Qt 5) on it.
> >> >
> >> > I'll test pdfork during the weekend and report its behaviour.
> >>
> >> Here are my findings about pdfork.
> >>
> >> Source: http://fxr.watson.org/fxr/source/kern/sys_procdesc.c?v=FREEBSD10
> >> Qt adaptations: https://codereview.qt-project.org/108561
> >>
> >> Processes created with pdfork() are normal processes that still send 
> >> SIGCHLD
> >> to their parents. The only difference is that you get the extra file 
> >> descriptor
> >> that can be passed to the pdgetpid() system call and works on 
> >> select()/poll().
> >> Trying to read from that file descriptor will result in EOPNOTSUPP.
> >
> > OK, since read() doesn't work on a pdfork() file descriptor, we don't
> > have to worry about compatibility with pdfork()'s read result.
> >
> > However, if the expectation is that pdfork()ed child processes still
> > send SIGCHLD, then I don't see how we can be compatible there, nor do I
> > think we want to; as you mention below, that breaks the ability to
> > encapsulate management of the created process entirely within a library.
> 
> I didn't think that was the case -- my understanding was that pdfork()ed
> children would not generate SIGCHLD (and that does seem to be the
> case with a quick test program).

Well, either way, v2 of this series is capable of producing either
behavior.  You can have a clonefd and still receive SIGCHLD or any other
signal, or none at all, and you can decide independently from that if
you want autoreaping or waiting.

> As an aside, I do think there are some aspects of FreeBSD's process
> descriptors that aren't quite right yet, particularly their interaction with
> waitpid(-1, ...) -- IIRC pdfork()ed children are visible to it, but I'd expect
> them not to be (to allow libraries to use sub-processes invisibly to the
> programs using them). There's a thread at:
> https://lists.cam.ac.uk/pipermail/cl-capsicum-discuss/2014-March/thread.html
> but I'm not sure that anything came of that discussion.

As long as you don't use the Linux-specific flags __WALL or __WCLONE, a
process created with clone will be invisible to wait if it has an exit
signal other than SIGCHLD.  That's true independent of this patch
series.  So you can decide if you want processes visible to wait or not.

> As it happens, I'm meeting Robert Watson (one of the progenitors
> of Capsicum/process descriptors) tomorrow, so I'll chase further.

Sounds good.

> >> Since they've never implemented pdwait4() (it's not even declared in the
> >> headers), the only way to reap a child if you only have the file 
> >> descriptor is
> >> to first pdgetpid() and then call wait4() or wait6().
> >
> > Which suggests that we shouldn't try to implement pdwait4() in glibc
> > until FreeBSD implements it in their kernel, since we won't know the
> > exact semantics they expect.
> 
> By the way, I should point out one part of the FreeBSD design
> which might help explain some of the semantics.
> 
> Process descriptors are particularly designed to be used with
> Capsicum, which is a security framework where file descriptors
> get extra rights associated with them, and the kernel polices
> the use of those rights (e.g. you need CAP_READ for read(2)
> operations; normal file descriptors implicitly have all of the
> rights for back-compatibility).
>   https://www.freebsd.org/cgi/man.cgi?query=capsicum=4
> 
> Capsicum also includes 'capability mode', where system calls
> that access global namespaces are disabled -- including the
> pid namespace.
> 
> So process descriptors are the only way to manipulate child
> processes when a program is in capability mode -- and this
> means that pdkill() is then genuinely needed over and above
> kill(pdgetpid(),...).

Thanks for the explanation.  I've seen some details about Capsicum, and
I found it quite interesting.  I'm particularly interested in the notion
of getting rid of global namespaces in favor of descriptors or similar
mechanisms that you need specific rights to.

Does Capsicum do anything t

Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor

2015-03-15 Thread Josh Triplett
On Sun, Mar 15, 2015 at 12:59:17AM -0700, Josh Triplett wrote:
> This patch series also introduces a clone flag CLONE_AUTOREAP, which causes 
> the
> kernel to automatically reap the child process when it exits, just as it does
> for processes using SIGCHLD when the parent has SIGCHLD ignored or marked as
> SA_NOCLDSTOP.

Typo: s/SA_NOCLDSTOP/SA_NOCLDWAIT/.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 4/7] kernel/fork.c: Pass arguments to _do_fork and copy_process using clone4_args

2015-03-15 Thread Josh Triplett
Rather than continuing to add arguments to _do_fork and copy_process for
future clone4 extensions, with corresponding churn in every caller, pass
the arguments using the clone4_args structure instead.  This allows
clone4 to avoid unpacking the arguments, and allows other callers to use
C99 structure initializers to only initialize the arguments they care
about.  Future extensions to clone4_args will thus not need to touch
clone4, fork, vfork, or other callers of _do_fork.

Signed-off-by: Josh Triplett 
Signed-off-by: Thiago Macieira 
---
 kernel/fork.c | 77 +++
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 8a21f9e..db9012a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1188,12 +1188,9 @@ init_task_pid(struct task_struct *task, enum pid_type 
type, struct pid *pid)
  * flags). The actual kick-off is left to the caller.
  */
 static struct task_struct *copy_process(u64 clone_flags,
-   unsigned long stack_start,
-   unsigned long stack_size,
-   int __user *child_tidptr,
+   struct clone4_args *args,
struct pid *pid,
-   int trace,
-   unsigned long tls)
+   int trace)
 {
int retval;
struct task_struct *p;
@@ -1405,7 +1402,7 @@ static struct task_struct *copy_process(u64 clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
-   retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+   retval = copy_thread_tls(clone_flags, args->stack_start, 
args->stack_size, p, args->tls);
if (retval)
goto bad_fork_cleanup_io;
 
@@ -1416,11 +1413,11 @@ static struct task_struct *copy_process(u64 clone_flags,
goto bad_fork_cleanup_io;
}
 
-   p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : 
NULL;
+   p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->ctid : 
NULL;
/*
 * Clear TID on mm_release()?
 */
-   p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
child_tidptr : NULL;
+   p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->ctid 
: NULL;
 #ifdef CONFIG_BLOCK
p->plug = NULL;
 #endif
@@ -1617,7 +1614,8 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
struct task_struct *task;
-   task = copy_process(CLONE_VM, 0, 0, NULL, _struct_pid, 0, 0);
+   struct clone4_args args = {};
+   task = copy_process(CLONE_VM, , _struct_pid, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1632,13 +1630,7 @@ struct task_struct *fork_idle(int cpu)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-static long _do_fork(
-   u64 clone_flags,
-   unsigned long stack_start,
-   unsigned long stack_size,
-   int __user *parent_tidptr,
-   int __user *child_tidptr,
-   unsigned long tls)
+static long _do_fork(u64 clone_flags, struct clone4_args *args)
 {
struct task_struct *p;
int trace = 0;
@@ -1662,8 +1654,7 @@ static long _do_fork(
trace = 0;
}
 
-   p = copy_process(clone_flags, stack_start, stack_size,
-child_tidptr, NULL, trace, tls);
+   p = copy_process(clone_flags, args, NULL, trace);
/*
 * Do this prior waking up the new thread - the thread pointer
 * might get invalid after that point, if the thread exits quickly.
@@ -1678,7 +1669,7 @@ static long _do_fork(
nr = pid_vnr(pid);
 
if (clone_flags & CLONE_PARENT_SETTID)
-   put_user(nr, parent_tidptr);
+   put_user(nr, args->ptid);
 
if (clone_flags & CLONE_VFORK) {
p->vfork_done = 
@@ -1722,9 +1713,13 @@ long do_fork(unsigned long clone_flags,
  int __user *parent_tidptr,
  int __user *child_tidptr)
 {
-   return _do_fork(squelch_clone_flags(clone_flags),
-   stack_start, stack_size,
-   parent_tidptr, child_tidptr, 0);
+   struct clone4_args kargs = {
+   .ptid = parent_tidptr,
+   .ctid = child_tidptr,
+   .stack_start = stack_start,
+   .stack_start = stack_size,
+   };
+   return _do_fork(squelch_clone_flags(clone_flags), );

[PATCH v2 man-pages] clone4.2: New manpage documenting clone4(2)

2015-03-15 Thread Josh Triplett
Also includes new cross-reference from clone.2.

Signed-off-by: Josh Triplett 
---
 man2/clone.2  |   1 +
 man2/clone4.2 | 345 ++
 2 files changed, 346 insertions(+)
 create mode 100644 man2/clone4.2

diff --git a/man2/clone.2 b/man2/clone.2
index 752c01e..7013885 100644
--- a/man2/clone.2
+++ b/man2/clone.2
@@ -1209,6 +1209,7 @@ main(int argc, char *argv[])
 }
 .fi
 .SH SEE ALSO
+.BR clone4 (2),
 .BR fork (2),
 .BR futex (2),
 .BR getpid (2),
diff --git a/man2/clone4.2 b/man2/clone4.2
new file mode 100644
index 000..f237ebc
--- /dev/null
+++ b/man2/clone4.2
@@ -0,0 +1,345 @@
+.\" Based on clone.2:
+.\" Copyright (c) 1992 Drew Eckhardt , March 28, 1992
+.\" and Copyright (c) Michael Kerrisk, 2001, 2002, 2005, 2013
+.\"
+.\" %%%LICENSE_START(GPL_NOVERSION_ONELINE)
+.\" May be distributed under the GNU General Public License.
+.\" %%%LICENSE_END
+.TH CLONE4 2 2015-03-14 "Linux" "Linux Programmer's Manual"
+.SH NAME
+clone4 \- create a child process
+.SH SYNOPSIS
+.nf
+/* Prototype for the glibc wrapper function */
+
+.B #define _GNU_SOURCE
+.B #include 
+
+.BI "int clone4(uint64_t " flags ,
+.BI "   size_t " args_size ,
+.BI "   struct clone4_args *" args ,
+.BI "   int (*" "fn" ")(void *), void *" arg );
+
+/* Prototype for the raw system call */
+
+.BI "int clone4(unsigned " flags_high ", unsigned " flags_low ,
+.BI "   unsigned long " args_size ,
+.BI "   struct clone4_args *" args );
+
+struct clone4_args {
+pid_t *ptid;
+pid_t *ctid;
+unsigned long stack_start;
+unsigned long stack_size;
+unsigned long tls;
+int *clonefd;
+unsigned clonefd_flags;
+};
+
+.SH DESCRIPTION
+.BR clone4 ()
+creates a new process, similar to
+.BR clone (2)
+and
+.BR fork (2).
+.BR clone4 ()
+supports additional flags that
+.BR clone (2)
+does not, and accepts arguments via an extensible structure.
+
+.I args
+points to a
+.I clone4_args
+structure, and
+.I args_size
+must contain the size of that structure, as understood by the caller.  If the
+caller passes a shorter structure than the kernel expects, the remaining fields
+will default to 0.  If the caller passes a larger structure than the kernel
+expects (such as one from a newer kernel),
+.BR clone4 ()
+will return
+.BR EINVAL .
+The
+.I clone4_args
+structure may gain additional fields at the end in the future, and callers must
+only pass a size that encompasses the number of fields they understand.  If the
+caller passes 0 for
+.IR args_size ,
+.I args
+is ignored and may be NULL.
+
+In the
+.I clone4_args
+structure,
+.IR ptid ,
+.IR ctid ,
+.IR stack_start ,
+.IR stack_size ,
+and
+.I tls
+have the same semantics as they do with
+.BR clone (2)
+and
+.BR clone2 (2).
+
+In the glibc wrapper,
+.I fn
+and
+.I arg
+have the same semantics as they do with
+.BR clone (2).
+As with
+.BR clone (2),
+the underlying system call works more like
+.BR fork (2),
+returning 0 in the child process; the glibc wrapper simplifies thread execution
+by calling
+.IR fn ( arg )
+and exiting the child when that function exits.
+
+The 64-bit
+.I flags
+argument (split into the 32-bit
+.I flags_high
+and
+.I flags_low
+arguments in the kernel interface for portability across architectures)
+accepts all the same flags as
+.BR clone (2),
+with the exception of the obsolete
+.BR CLONE_PID ,
+.BR CLONE_DETACHED ,
+and
+.BR CLONE_STOPPED .
+In addition,
+.I flags
+accepts the following flags:
+
+.TP
+.B CLONE_AUTOREAP
+When the new process exits, immediately reap it, rather than keeping it around
+as a "zombie" until a call to
+.BR waitpid (2)
+or similar.  Without this flag, the kernel will automatically reap a process if
+its exit signal is set to
+.BR SIGCHLD ,
+and if the parent process has
+.B SIGCHLD
+set to
+.B SIG_IGN
+or has a
+.B SIGCHLD
+handler installed with
+.B SA_NOCLDWAIT
+(see
+.BR sigaction (2)).
+.B CLONE_AUTOREAP
+allows the calling process to enable automatic reaping with an exit signal
+other than
+.B SIGCHLD
+(including 0 to disable the exit signal), and does not depend on the
+configuration of process-wide signal handling.
+
+.TP
+.B CLONE_FD
+Return a file descriptor associated with the new process, storing it in
+location
+.I clonefd
+in the parent's address space.  When the new process exits, the file descriptor
+will become available for reading.
+
+Unlike using
+.BR signalfd (2)
+for the
+.B SIGCHLD
+signal,
+the file descriptor returned by
+.BR clone4 ()
+with the
+.B CLONE_FD
+flag works even with
+.B SIGCHLD
+unblocked in one or more threads of the parent process, allowing the process to
+have different handlers for different child processes, such as those created by
+a library, without introducing race conditions around process-wide signal
+handling.
+
+.I clonefd_

<    2   3   4   5   6   7   8   9   10   11   >