[PATCH 3/4] kconfig: Regenerate parser with current Bison prior to making changes
Before making changes to the parser, regenerate the parser with current Bison to avoid mixing those changes with those produced by updating Bison. Signed-off-by: Josh Triplett --- scripts/kconfig/zconf.tab.c_shipped | 1244 --- 1 file changed, 561 insertions(+), 683 deletions(-) diff --git a/scripts/kconfig/zconf.tab.c_shipped b/scripts/kconfig/zconf.tab.c_shipped index de5e84e..0864c7d 100644 --- a/scripts/kconfig/zconf.tab.c_shipped +++ b/scripts/kconfig/zconf.tab.c_shipped @@ -1,19 +1,19 @@ -/* A Bison parser, made by GNU Bison 2.5. */ +/* A Bison parser, made by GNU Bison 3.0.2. */ /* Bison implementation for Yacc-like parsers in C - - Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. - + + Copyright (C) 1984, 1989-1990, 2000-2013 Free Software Foundation, Inc. + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ @@ -26,7 +26,7 @@ special exception, which will cause the skeleton and the resulting Bison output files to be licensed under the GNU General Public License without this special exception. - + This special exception was added by the Free Software Foundation in version 2.2 of Bison. */ @@ -44,7 +44,7 @@ #define YYBISON 1 /* Bison version. */ -#define YYBISON_VERSION "2.5" +#define YYBISON_VERSION "3.0.2" /* Skeleton name. */ #define YYSKELETON_NAME "yacc.c" @@ -58,18 +58,16 @@ /* Pull parsers. */ #define YYPULL 1 -/* Using locations. */ -#define YYLSP_NEEDED 0 /* Substitute the variable and function names. */ #define yyparse zconfparse #define yylex zconflex #define yyerror zconferror -#define yylval zconflval -#define yychar zconfchar #define yydebug zconfdebug #define yynerrs zconfnerrs +#define yylval zconflval +#define yychar zconfchar /* Copy the first part of user declarations. */ @@ -108,10 +106,13 @@ static struct menu *current_menu, *current_entry; -/* Enabling traces. */ -#ifndef YYDEBUG -# define YYDEBUG 1 -#endif +# ifndef YY_NULLPTR +# if defined __cplusplus && 201103L <= __cplusplus +# define YY_NULLPTR nullptr +# else +# define YY_NULLPTR 0 +# endif +# endif /* Enabling verbose error messages. */ #ifdef YYERROR_VERBOSE @@ -121,58 +122,60 @@ static struct menu *current_menu, *current_entry; # define YYERROR_VERBOSE 0 #endif -/* Enabling the token table. */ -#ifndef YYTOKEN_TABLE -# define YYTOKEN_TABLE 0 -#endif +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 1 +#endif +#if YYDEBUG +extern int zconfdebug; +#endif -/* Tokens. */ +/* Token type. */ #ifndef YYTOKENTYPE # define YYTOKENTYPE - /* Put the tokens into the symbol table, so that GDB and other debuggers - know about them. */ - enum yytokentype { - T_MAINMENU = 258, - T_MENU = 259, - T_ENDMENU = 260, - T_SOURCE = 261, - T_CHOICE = 262, - T_ENDCHOICE = 263, - T_COMMENT = 264, - T_CONFIG = 265, - T_MENUCONFIG = 266, - T_HELP = 267, - T_HELPTEXT = 268, - T_IF = 269, - T_ENDIF = 270, - T_DEPENDS = 271, - T_OPTIONAL = 272, - T_PROMPT = 273, - T_TYPE = 274, - T_DEFAULT = 275, - T_SELECT = 276, - T_RANGE = 277, - T_VISIBLE = 278, - T_OPTION = 279, - T_ON = 280, - T_WORD = 281, - T_WORD_QUOTE = 282, - T_UNEQUAL = 283, - T_CLOSE_PAREN = 284, - T_OPEN_PAREN = 285, - T_EOL = 286, - T_OR = 287, - T_AND = 288, - T_EQUAL = 289, - T_NOT = 290 - }; + enum yytokentype + { +T_MAINMENU = 258, +T_MENU = 259, +T_ENDMENU = 260, +T_SOURCE = 261, +T_CHOICE = 262, +T_ENDCHOICE = 263, +T_COMMENT = 264, +T_CONFIG = 265, +T_MENUCONFIG = 266, +T_HELP = 267, +T_HELPTEXT = 268, +T_IF = 269, +T_ENDIF = 270, +T_DEPENDS = 271, +T_OPTIONAL = 272, +T_PROMPT = 273, +T_TYPE = 274, +T_DEFAULT = 275, +T_SELECT = 276, +T_RANGE = 277, +T_VISIBLE = 278, +T_OPTION = 279, +T_ON = 280, +T_WORD = 281, +T_WORD_QUOTE = 282, +T_UNEQUAL = 283, +T_CLOSE_PAREN = 284, +T_OPEN_PAREN = 285, +T_EOL = 286, +T_OR = 287, +T_AND = 288, +T_EQUAL = 289, +T_NOT = 290 + }; #endif - - +/* Valu
[PATCH 2/4] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
The expert menu frequently gets broken by a config item in the middle that leaves off the "if EXPERT" from its prompt. This results in the remainder of the menu spilling out into the parent "General setup" menu. Move the entire expert menu into a separate Kconfig file, init/Kconfig.expert, to make this harder to do accidentally, and to break up the exceedingly long init/Kconfig a bit. Signed-off-by: Josh Triplett --- init/Kconfig| 232 +--- init/Kconfig.expert | 231 +++ 2 files changed, 232 insertions(+), 231 deletions(-) create mode 100644 init/Kconfig.expert diff --git a/init/Kconfig b/init/Kconfig index e2f16f1..a2de3f5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1351,237 +1351,7 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. -menuconfig EXPERT - bool "Configure standard kernel features (expert users)" - # Unhide debug options, to make the on-by-default options visible - select DEBUG_KERNEL - help - This option allows certain base kernel options and settings - to be disabled or tweaked. This is for specialized - environments which can tolerate a "non-standard" kernel. - Only use this if you really know what you are doing. - -config UID16 - bool "Enable 16-bit UID system calls" if EXPERT - depends on HAVE_UID16 && MULTIUSER - default y - help - This enables the legacy 16-bit UID syscall wrappers. - -config MULTIUSER - bool "Multiple users, groups and capabilities support" if EXPERT - default y - help - This option enables support for non-root users, groups and - capabilities. - - If you say N here, all processes will run with UID 0, GID 0, and all - possible capabilities. Saying N here also compiles out support for - system calls related to UIDs, GIDs, and capabilities, such as setuid, - setgid, and capset. - - If unsure, say Y here. - -config SGETMASK_SYSCALL - bool "sgetmask/ssetmask syscalls support" if EXPERT - def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH - ---help--- - sys_sgetmask and sys_ssetmask are obsolete system calls - no longer supported in libc but still enabled by default in some - architectures. - - If unsure, leave the default option here. - -config SYSFS_SYSCALL - bool "Sysfs syscall support" if EXPERT - default y - ---help--- - sys_sysfs is an obsolete system call no longer supported in libc. - Note that disabling this option is more secure but might break - compatibility with some systems. - - If unsure say Y here. - -config SYSCTL_SYSCALL - bool "Sysctl syscall support" if EXPERT - depends on PROC_SYSCTL - default n - select SYSCTL - ---help--- - sys_sysctl uses binary paths that have been found challenging - to properly maintain and use. The interface in /proc/sys - using paths with ascii names is now the primary path to this - information. - - Almost nothing using the binary sysctl interface so if you are - trying to save some space it is probably safe to disable this, - making your kernel marginally smaller. - - If unsure say N here. - -config KALLSYMS -bool "Load all symbols for debugging/ksymoops" if EXPERT -default y -help - Say Y here to let the kernel print out symbolic crash information and - symbolic stack backtraces. This increases the size of the kernel - somewhat, as all symbols have to be loaded into the kernel image. - -config KALLSYMS_ALL - bool "Include all symbols in kallsyms" - depends on DEBUG_KERNEL && KALLSYMS - help - Normally kallsyms only contains the symbols of functions for nicer - OOPS messages and backtraces (i.e., symbols from the text and inittext - sections). This is sufficient for most cases. And only in very rare - cases (e.g., when a debugger is used) all symbols are required (e.g., - names of variables from the data sections, etc). - - This option makes sure that all symbols are loaded into the kernel - image (i.e., symbols from all sections) in cost of increased kernel - size (depending on the kernel configuration, it may be 300KiB or - something like this). - - Say N unless you really need all symbols. - -config PRINTK - default y - bool "Enable support for printk" if EXPERT - select IRQ_WORK - help
[PATCH 1/4] init/Kconfig: Fix break in middle of EXPERT menu
Commit e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable") made BPF_SYSCALL no longer hidden with !EXPERT, but left it in the middle of the EXPERT menu. menuconfig stops putting config items under a submenu once it encounters an item that doesn't depend on the menu's config item, so this caused the remainder of the EXPERT menu to spill out into the containing menu around it. Fix by moving BPF_SYSCALL before the EXPERT menu, next to BPF. Fixes: e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable") Signed-off-by: Josh Triplett --- init/Kconfig | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index dc24dec..e2f16f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM config BPF bool +# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + menuconfig EXPERT bool "Configure standard kernel features (expert users)" # Unhide debug options, to make the on-by-default options visible @@ -1535,16 +1545,6 @@ config EVENTFD If unsure, say Y. -# syscall, maps, verifier -config BPF_SYSCALL - bool "Enable bpf() system call" - select ANON_INODES - select BPF - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - config SHMEM bool "Use full shmem filesystem" if EXPERT default y -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt
On Wed, May 13, 2015 at 09:27:57AM -0700, Josh Triplett wrote: > How likely is this to get out of date? Are people going to remember to > patch this when they add a feature to their architecture? If > they found out they had work to do by reading this file, which is the > goal, then they'll likely remember to edit the file; however, if they > find the feature and fix it without knowing about the file, will someone > notice? > > Is there any way we can *generate* this file from Kconfig? Can we > extract the necessary "this is possible to enable" or "this arch selects > this symbol" information from Kconfig, and together with the list of > symbols for features needing architecture support, generate the table? Just tried this. Looks like it's pretty trivial for most of these features: just make ARCH=thearch allyesconfig, then look for the config symbol in the result. For instance, after doing "make ARCH=alpha allyesconfig": ~/src/linux$ grep -x 'CONFIG_ARCH_HAS_ELF_RANDOMIZE=y' .config || echo TODO TODO ~/src/linux$ grep -x 'CONFIG_GENERIC_CLOCKEVENTS=y' .config || echo TODO CONFIG_GENERIC_CLOCKEVENTS=y This seems easy to turn into a shell script or Python script, given a list of feature descriptions and corresponding config symbols, as well as a list of architectures. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH tip/core/rcu 2/3] rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU
On Wed, May 13, 2015 at 06:07:28AM -0700, Paul E. McKenney wrote: > On Tue, May 12, 2015 at 05:59:29PM -0700, j...@joshtriplett.org wrote: > > On Tue, May 12, 2015 at 03:49:12PM -0700, Paul E. McKenney wrote: > > > From: "Paul E. McKenney" > > > > > > Reported-by: "Ahmed, Iftekhar" > > > Signed-off-by: Paul E. McKenney > > > > Could you elaborate a bit more on this patch (ideally in its commit > > message)? I see an addition of a command-line parameter to test rcu_bh; > > is rcu-sched already tested elsewhere by some other config, or does this > > parameter somehow enable testing both? > > The commit log now reads as follows, does that help? > > Thanx, Paul > > > > rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU > > Tiny RCU supports both RCU-sched and RCU-bh, but only RCU-sched is > currently tested by the rcutorture scripts. This commit therefore > changes the TINY02 configuration to test RCU-bh, with TINY01 continuing > to test RCU-sched. > > This shortcoming of the current rcutorture tests was located by mutation > testing by Iftekhar. The idea behind mutation testing is to automatically > mutate the code under test. If a given mutant is not caught by testing, > this is a hint that the testing might need to be improved, as was the > case here. Note that this is only a hint because it is possible to mutate > the code into something else that still works. For example, a mutation > that removes (say) a WARN_ON() will not normally result in a test failure. > > This change resulted in the test failure caused by list mishandling, > which is fixed by the next commit. > > Reported-by: "Ahmed, Iftekhar" > Signed-off-by: Paul E. McKenney > Much better, thanks. In particular, the information about TINY01 and TINY02 was not obvious from the patch. Reviewed-by: Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt
On Wed, May 13, 2015 at 03:48:42PM +0200, Ingo Molnar wrote: > From 93f6bd67b4348bf4bf27cbac8ffa9f1def4fa6aa Mon Sep 17 00:00:00 2001 > From: Ingo Molnar > Date: Wed, 13 May 2015 10:30:11 +0200 > Subject: [PATCH] Documentation/arch: Add Documentation/arch-features.txt > > Add a support matrix for various generic kernel features that need > architecture support. > > Signed-off-by: Ingo Molnar Could you add a column for the bpf JIT? Should this file track syscalls not wired up on architectures? How likely is this to get out of date? Are people going to remember to patch this when they add a feature to their architecture? If they found out they had work to do by reading this file, which is the goal, then they'll likely remember to edit the file; however, if they find the feature and fix it without knowing about the file, will someone notice? Is there any way we can *generate* this file from Kconfig? Can we extract the necessary "this is possible to enable" or "this arch selects this symbol" information from Kconfig, and together with the list of symbols for features needing architecture support, generate the table? If we can't generate this, then the ASCII-art style and right-aligned feature names seems *really* likely to produce spurious conflicts, especially when adding a feature to the list. Even though it would produce a much longer file, would you consider dropping the tables and just having a section per feature? That also avoids the need to abbreviate as much. For instance: Transparent huge pages HAVE_ARCH_TRANSPARENT_HUGEPAGE && 64BIT alpha TODO arc .. arm ok arm64 ok avr32 .. blackfin.. c6x .. cris.. frv .. hexagon .. ia64TODO m32r.. m68k.. metag .. microblaze .. mipsok mn10300 .. nios2 .. openrisc.. parisc TODO powerpc ok s390ok score .. sh .. sparc ok tileTODO um .. unicore32 .. x86 ok xtensa .. Or alternatively: Transparent huge pages HAVE_ARCH_TRANSPARENT_HUGEPAGE && 64BIT TODO: alpha ia64 parisc tile Done: arm arm64 mips powerpc s390 sparc x86 Not possible: arc avr32 blackfin c6x cris frv hexagon m32r m68k metag microblaze mn10300 nios2 openrisc score sh um unicore32 xtensa After all, the point of this document isn't to look at, it's to use as a TODO list and actively edit. So anything that makes it prettier but more painful to edit seems like a bug. > Documentation/arch-features.txt | 222 > > 1 file changed, 222 insertions(+) > > diff --git a/Documentation/arch-features.txt b/Documentation/arch-features.txt > new file mode 100644 > index ..4f6430bc552b > --- /dev/null > +++ b/Documentation/arch-features.txt > @@ -0,0 +1,222 @@ > + > +For generic kernel features that need architecture support, this is > +the feature support matrix, for all upstream Linux architectures. > + > +Meaning of entries in the tables: > + > +' ok ': feature supported by the architecture > +'TODO': feature not yet supported by the architecture > +' .. ': feature cannot be supported by the hardware > + > + > + > +lockdep: LOCKDEP_SUPPORT > + stackprotector: HAVE_CC_STACKPROTECTOR > +jump labels: HAVE_ARCH_JUMP_LABEL > + seccomp filter: HAVE_ARCH_SECCOMP_FILTER > + context tracking: HAVE_CONTEXT_TRACKING > + kgdb: HAVE_ARCH_KGDB > + modern timekeeping: !ARCH_USES_GETTIMEOFFSET > +clockevents: GENERIC_CLOCKEVENTS > + ELF ASLR: ARCH_HAS_ELF_RANDOMIZE > + > + > lockdep:-. > + stackprotector:--. > | > +jump labels:---. | > | > + seccomp filter:. | | > | > + context tracking:-. | | | > | > + kgdb:--. | | | | > | > + modern timekeeping:---. | | | | | > | > +clockevents:. | | | | | | > | > + ELF ASLR:-. | | | | | | | > | > + | | | | | | | | > | > +-- > + alpha | TODO | ok | ok | TODO | TODO | TODO | TODO | TODO | > TODO | > + arc | TODO | ok | ok | ok | TODO | TODO | TODO | TODO | > ok | > + arm | ok | ok | TODO | ok | ok | ok | ok | ok | > ok | > + arm64 | ok | ok
[PATCH 3/4] kconfig: Regenerate parser with current Bison prior to making changes
Before making changes to the parser, regenerate the parser with current Bison to avoid mixing those changes with those produced by updating Bison. Signed-off-by: Josh Triplett j...@joshtriplett.org --- scripts/kconfig/zconf.tab.c_shipped | 1244 --- 1 file changed, 561 insertions(+), 683 deletions(-) diff --git a/scripts/kconfig/zconf.tab.c_shipped b/scripts/kconfig/zconf.tab.c_shipped index de5e84e..0864c7d 100644 --- a/scripts/kconfig/zconf.tab.c_shipped +++ b/scripts/kconfig/zconf.tab.c_shipped @@ -1,19 +1,19 @@ -/* A Bison parser, made by GNU Bison 2.5. */ +/* A Bison parser, made by GNU Bison 3.0.2. */ /* Bison implementation for Yacc-like parsers in C - - Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc. - + + Copyright (C) 1984, 1989-1990, 2000-2013 Free Software Foundation, Inc. + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. */ @@ -26,7 +26,7 @@ special exception, which will cause the skeleton and the resulting Bison output files to be licensed under the GNU General Public License without this special exception. - + This special exception was added by the Free Software Foundation in version 2.2 of Bison. */ @@ -44,7 +44,7 @@ #define YYBISON 1 /* Bison version. */ -#define YYBISON_VERSION 2.5 +#define YYBISON_VERSION 3.0.2 /* Skeleton name. */ #define YYSKELETON_NAME yacc.c @@ -58,18 +58,16 @@ /* Pull parsers. */ #define YYPULL 1 -/* Using locations. */ -#define YYLSP_NEEDED 0 /* Substitute the variable and function names. */ #define yyparse zconfparse #define yylex zconflex #define yyerror zconferror -#define yylval zconflval -#define yychar zconfchar #define yydebug zconfdebug #define yynerrs zconfnerrs +#define yylval zconflval +#define yychar zconfchar /* Copy the first part of user declarations. */ @@ -108,10 +106,13 @@ static struct menu *current_menu, *current_entry; -/* Enabling traces. */ -#ifndef YYDEBUG -# define YYDEBUG 1 -#endif +# ifndef YY_NULLPTR +# if defined __cplusplus 201103L = __cplusplus +# define YY_NULLPTR nullptr +# else +# define YY_NULLPTR 0 +# endif +# endif /* Enabling verbose error messages. */ #ifdef YYERROR_VERBOSE @@ -121,58 +122,60 @@ static struct menu *current_menu, *current_entry; # define YYERROR_VERBOSE 0 #endif -/* Enabling the token table. */ -#ifndef YYTOKEN_TABLE -# define YYTOKEN_TABLE 0 -#endif +/* Debug traces. */ +#ifndef YYDEBUG +# define YYDEBUG 1 +#endif +#if YYDEBUG +extern int zconfdebug; +#endif -/* Tokens. */ +/* Token type. */ #ifndef YYTOKENTYPE # define YYTOKENTYPE - /* Put the tokens into the symbol table, so that GDB and other debuggers - know about them. */ - enum yytokentype { - T_MAINMENU = 258, - T_MENU = 259, - T_ENDMENU = 260, - T_SOURCE = 261, - T_CHOICE = 262, - T_ENDCHOICE = 263, - T_COMMENT = 264, - T_CONFIG = 265, - T_MENUCONFIG = 266, - T_HELP = 267, - T_HELPTEXT = 268, - T_IF = 269, - T_ENDIF = 270, - T_DEPENDS = 271, - T_OPTIONAL = 272, - T_PROMPT = 273, - T_TYPE = 274, - T_DEFAULT = 275, - T_SELECT = 276, - T_RANGE = 277, - T_VISIBLE = 278, - T_OPTION = 279, - T_ON = 280, - T_WORD = 281, - T_WORD_QUOTE = 282, - T_UNEQUAL = 283, - T_CLOSE_PAREN = 284, - T_OPEN_PAREN = 285, - T_EOL = 286, - T_OR = 287, - T_AND = 288, - T_EQUAL = 289, - T_NOT = 290 - }; + enum yytokentype + { +T_MAINMENU = 258, +T_MENU = 259, +T_ENDMENU = 260, +T_SOURCE = 261, +T_CHOICE = 262, +T_ENDCHOICE = 263, +T_COMMENT = 264, +T_CONFIG = 265, +T_MENUCONFIG = 266, +T_HELP = 267, +T_HELPTEXT = 268, +T_IF = 269, +T_ENDIF = 270, +T_DEPENDS = 271, +T_OPTIONAL = 272, +T_PROMPT = 273, +T_TYPE = 274, +T_DEFAULT = 275, +T_SELECT = 276, +T_RANGE = 277, +T_VISIBLE = 278, +T_OPTION = 279, +T_ON = 280, +T_WORD = 281, +T_WORD_QUOTE = 282, +T_UNEQUAL = 283, +T_CLOSE_PAREN = 284, +T_OPEN_PAREN = 285, +T_EOL = 286, +T_OR = 287, +T_AND = 288, +T_EQUAL = 289, +T_NOT = 290 + }; #endif - - +/* Value type. */ #if ! defined YYSTYPE ! defined
[PATCH 2/4] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
The expert menu frequently gets broken by a config item in the middle that leaves off the if EXPERT from its prompt. This results in the remainder of the menu spilling out into the parent General setup menu. Move the entire expert menu into a separate Kconfig file, init/Kconfig.expert, to make this harder to do accidentally, and to break up the exceedingly long init/Kconfig a bit. Signed-off-by: Josh Triplett j...@joshtriplett.org --- init/Kconfig| 232 +--- init/Kconfig.expert | 231 +++ 2 files changed, 232 insertions(+), 231 deletions(-) create mode 100644 init/Kconfig.expert diff --git a/init/Kconfig b/init/Kconfig index e2f16f1..a2de3f5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1351,237 +1351,7 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. -menuconfig EXPERT - bool Configure standard kernel features (expert users) - # Unhide debug options, to make the on-by-default options visible - select DEBUG_KERNEL - help - This option allows certain base kernel options and settings - to be disabled or tweaked. This is for specialized - environments which can tolerate a non-standard kernel. - Only use this if you really know what you are doing. - -config UID16 - bool Enable 16-bit UID system calls if EXPERT - depends on HAVE_UID16 MULTIUSER - default y - help - This enables the legacy 16-bit UID syscall wrappers. - -config MULTIUSER - bool Multiple users, groups and capabilities support if EXPERT - default y - help - This option enables support for non-root users, groups and - capabilities. - - If you say N here, all processes will run with UID 0, GID 0, and all - possible capabilities. Saying N here also compiles out support for - system calls related to UIDs, GIDs, and capabilities, such as setuid, - setgid, and capset. - - If unsure, say Y here. - -config SGETMASK_SYSCALL - bool sgetmask/ssetmask syscalls support if EXPERT - def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH - ---help--- - sys_sgetmask and sys_ssetmask are obsolete system calls - no longer supported in libc but still enabled by default in some - architectures. - - If unsure, leave the default option here. - -config SYSFS_SYSCALL - bool Sysfs syscall support if EXPERT - default y - ---help--- - sys_sysfs is an obsolete system call no longer supported in libc. - Note that disabling this option is more secure but might break - compatibility with some systems. - - If unsure say Y here. - -config SYSCTL_SYSCALL - bool Sysctl syscall support if EXPERT - depends on PROC_SYSCTL - default n - select SYSCTL - ---help--- - sys_sysctl uses binary paths that have been found challenging - to properly maintain and use. The interface in /proc/sys - using paths with ascii names is now the primary path to this - information. - - Almost nothing using the binary sysctl interface so if you are - trying to save some space it is probably safe to disable this, - making your kernel marginally smaller. - - If unsure say N here. - -config KALLSYMS -bool Load all symbols for debugging/ksymoops if EXPERT -default y -help - Say Y here to let the kernel print out symbolic crash information and - symbolic stack backtraces. This increases the size of the kernel - somewhat, as all symbols have to be loaded into the kernel image. - -config KALLSYMS_ALL - bool Include all symbols in kallsyms - depends on DEBUG_KERNEL KALLSYMS - help - Normally kallsyms only contains the symbols of functions for nicer - OOPS messages and backtraces (i.e., symbols from the text and inittext - sections). This is sufficient for most cases. And only in very rare - cases (e.g., when a debugger is used) all symbols are required (e.g., - names of variables from the data sections, etc). - - This option makes sure that all symbols are loaded into the kernel - image (i.e., symbols from all sections) in cost of increased kernel - size (depending on the kernel configuration, it may be 300KiB or - something like this). - - Say N unless you really need all symbols. - -config PRINTK - default y - bool Enable support for printk if EXPERT - select IRQ_WORK - help - This option enables normal printk support. Removing it - eliminates most of the message strings from the kernel image
[PATCH 1/4] init/Kconfig: Fix break in middle of EXPERT menu
Commit e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y !CONFIG_TRACING kernels, make it more configurable) made BPF_SYSCALL no longer hidden with !EXPERT, but left it in the middle of the EXPERT menu. menuconfig stops putting config items under a submenu once it encounters an item that doesn't depend on the menu's config item, so this caused the remainder of the EXPERT menu to spill out into the containing menu around it. Fix by moving BPF_SYSCALL before the EXPERT menu, next to BPF. Fixes: e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y !CONFIG_TRACING kernels, make it more configurable) Signed-off-by: Josh Triplett j...@joshtriplett.org --- init/Kconfig | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index dc24dec..e2f16f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM config BPF bool +# syscall, maps, verifier +config BPF_SYSCALL + bool Enable bpf() system call + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + menuconfig EXPERT bool Configure standard kernel features (expert users) # Unhide debug options, to make the on-by-default options visible @@ -1535,16 +1545,6 @@ config EVENTFD If unsure, say Y. -# syscall, maps, verifier -config BPF_SYSCALL - bool Enable bpf() system call - select ANON_INODES - select BPF - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - config SHMEM bool Use full shmem filesystem if EXPERT default y -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/4] kconfig: Introduce showif to factor out conditions on visibility
kconfig implicitly creates a submenu whenever a series of symbols all have dependencies or prompt-visibility expressions that all depend on a preceeding symbol. For instance, the series of symbols following menuconfig EXPERT that all have if EXPERT on their prompt will all appear as a submenu of EXPERT. However, this implicit submenuing will break if any intervening symbol loses its if EXPERT; doing so causes the subsequent symbols to appear in the parent menu (General setup). This has happened many times, and it's easy to miss that the entire block should have that same expression. For submenus created from depends dependencies, these can be converted to a single wrapping if expr ... endif block around all the submenu items. However, there's no equivalent for invisible items, for which the prompt is hidden but the symbol may potentially be enabled. For instance, many items in the EXPERT menu are hidden if EXPERT is disabled, but they have default y or are determined by some other expression. To handle this case, introduce a new kconfig construct, showif, which does the same thing as if but for visibility expressions rather than dependencies. Every symbol in a showif expr ... endif block effectively has if expr added to its prompt. Use that new construct to simplify the expert menu. Now, making a symbol no longer invisible when !EXPERT requires moving it out of the EXPERT menu, which makes it more difficult to break the EXPERT menu. Signed-off-by: Josh Triplett j...@joshtriplett.org --- init/Kconfig.expert | 44 +-- scripts/kconfig/menu.c | 4 +- scripts/kconfig/zconf.gperf | 1 + scripts/kconfig/zconf.hash.c_shipped | 258 scripts/kconfig/zconf.tab.c_shipped | 561 +++ scripts/kconfig/zconf.y | 28 +- 6 files changed, 483 insertions(+), 413 deletions(-) diff --git a/init/Kconfig.expert b/init/Kconfig.expert index c84a372..fb3c3aa 100644 --- a/init/Kconfig.expert +++ b/init/Kconfig.expert @@ -8,15 +8,17 @@ menuconfig EXPERT environments which can tolerate a non-standard kernel. Only use this if you really know what you are doing. +showif EXPERT + config UID16 - bool Enable 16-bit UID system calls if EXPERT + bool Enable 16-bit UID system calls depends on HAVE_UID16 MULTIUSER default y help This enables the legacy 16-bit UID syscall wrappers. config MULTIUSER - bool Multiple users, groups and capabilities support if EXPERT + bool Multiple users, groups and capabilities support default y help This option enables support for non-root users, groups and @@ -30,7 +32,7 @@ config MULTIUSER If unsure, say Y here. config SGETMASK_SYSCALL - bool sgetmask/ssetmask syscalls support if EXPERT + bool sgetmask/ssetmask syscalls support def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH ---help--- sys_sgetmask and sys_ssetmask are obsolete system calls @@ -40,7 +42,7 @@ config SGETMASK_SYSCALL If unsure, leave the default option here. config SYSFS_SYSCALL - bool Sysfs syscall support if EXPERT + bool Sysfs syscall support default y ---help--- sys_sysfs is an obsolete system call no longer supported in libc. @@ -50,7 +52,7 @@ config SYSFS_SYSCALL If unsure say Y here. config SYSCTL_SYSCALL - bool Sysctl syscall support if EXPERT + bool Sysctl syscall support depends on PROC_SYSCTL default n select SYSCTL @@ -67,7 +69,7 @@ config SYSCTL_SYSCALL If unsure say N here. config KALLSYMS -bool Load all symbols for debugging/ksymoops if EXPERT +bool Load all symbols for debugging/ksymoops default y help Say Y here to let the kernel print out symbolic crash information and @@ -93,7 +95,7 @@ config KALLSYMS_ALL config PRINTK default y - bool Enable support for printk if EXPERT + bool Enable support for printk select IRQ_WORK help This option enables normal printk support. Removing it @@ -103,7 +105,7 @@ config PRINTK strongly discouraged. config BUG - bool BUG() support if EXPERT + bool BUG() support default y help Disabling this option eliminates support for BUG and WARN, reducing @@ -115,13 +117,13 @@ config BUG config ELF_CORE depends on COREDUMP default y - bool Enable ELF core dumps if EXPERT + bool Enable ELF core dumps help Enable support for generating core dumps. Disabling saves about 4k. config PCSPKR_PLATFORM - bool Enable PC-Speaker support if EXPERT + bool Enable PC-Speaker support depends on HAVE_PCSPKR_PLATFORM select I8253_LOCK default
Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt
On Wed, May 13, 2015 at 03:48:42PM +0200, Ingo Molnar wrote: From 93f6bd67b4348bf4bf27cbac8ffa9f1def4fa6aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar mi...@kernel.org Date: Wed, 13 May 2015 10:30:11 +0200 Subject: [PATCH] Documentation/arch: Add Documentation/arch-features.txt Add a support matrix for various generic kernel features that need architecture support. Signed-off-by: Ingo Molnar mi...@kernel.org Could you add a column for the bpf JIT? Should this file track syscalls not wired up on architectures? How likely is this to get out of date? Are people going to remember to patch this when they add a feature to their architecture? If they found out they had work to do by reading this file, which is the goal, then they'll likely remember to edit the file; however, if they find the feature and fix it without knowing about the file, will someone notice? Is there any way we can *generate* this file from Kconfig? Can we extract the necessary this is possible to enable or this arch selects this symbol information from Kconfig, and together with the list of symbols for features needing architecture support, generate the table? If we can't generate this, then the ASCII-art style and right-aligned feature names seems *really* likely to produce spurious conflicts, especially when adding a feature to the list. Even though it would produce a much longer file, would you consider dropping the tables and just having a section per feature? That also avoids the need to abbreviate as much. For instance: Transparent huge pages HAVE_ARCH_TRANSPARENT_HUGEPAGE 64BIT alpha TODO arc .. arm ok arm64 ok avr32 .. blackfin.. c6x .. cris.. frv .. hexagon .. ia64TODO m32r.. m68k.. metag .. microblaze .. mipsok mn10300 .. nios2 .. openrisc.. parisc TODO powerpc ok s390ok score .. sh .. sparc ok tileTODO um .. unicore32 .. x86 ok xtensa .. Or alternatively: Transparent huge pages HAVE_ARCH_TRANSPARENT_HUGEPAGE 64BIT TODO: alpha ia64 parisc tile Done: arm arm64 mips powerpc s390 sparc x86 Not possible: arc avr32 blackfin c6x cris frv hexagon m32r m68k metag microblaze mn10300 nios2 openrisc score sh um unicore32 xtensa After all, the point of this document isn't to look at, it's to use as a TODO list and actively edit. So anything that makes it prettier but more painful to edit seems like a bug. Documentation/arch-features.txt | 222 1 file changed, 222 insertions(+) diff --git a/Documentation/arch-features.txt b/Documentation/arch-features.txt new file mode 100644 index ..4f6430bc552b --- /dev/null +++ b/Documentation/arch-features.txt @@ -0,0 +1,222 @@ + +For generic kernel features that need architecture support, this is +the feature support matrix, for all upstream Linux architectures. + +Meaning of entries in the tables: + +' ok ': feature supported by the architecture +'TODO': feature not yet supported by the architecture +' .. ': feature cannot be supported by the hardware + + + +lockdep: LOCKDEP_SUPPORT + stackprotector: HAVE_CC_STACKPROTECTOR +jump labels: HAVE_ARCH_JUMP_LABEL + seccomp filter: HAVE_ARCH_SECCOMP_FILTER + context tracking: HAVE_CONTEXT_TRACKING + kgdb: HAVE_ARCH_KGDB + modern timekeeping: !ARCH_USES_GETTIMEOFFSET +clockevents: GENERIC_CLOCKEVENTS + ELF ASLR: ARCH_HAS_ELF_RANDOMIZE + + lockdep:-. + stackprotector:--. | +jump labels:---. | | + seccomp filter:. | | | + context tracking:-. | | | | + kgdb:--. | | | | | + modern timekeeping:---. | | | | | | +clockevents:. | | | | | | | + ELF ASLR:-. | | | | | | | | + | | | | | | | | | +-- + alpha | TODO | ok | ok | TODO | TODO | TODO | TODO | TODO | TODO | + arc | TODO | ok | ok | ok | TODO | TODO | TODO | TODO | ok | + arm | ok | ok | TODO | ok | ok | ok | ok | ok | ok | + arm64 | ok | ok | ok | ok | ok | ok | ok | ok |
Re: [PATCH tip/core/rcu 2/3] rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU
On Wed, May 13, 2015 at 06:07:28AM -0700, Paul E. McKenney wrote: On Tue, May 12, 2015 at 05:59:29PM -0700, j...@joshtriplett.org wrote: On Tue, May 12, 2015 at 03:49:12PM -0700, Paul E. McKenney wrote: From: Paul E. McKenney paul...@linux.vnet.ibm.com Reported-by: Ahmed, Iftekhar ahm...@onid.oregonstate.edu Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com Could you elaborate a bit more on this patch (ideally in its commit message)? I see an addition of a command-line parameter to test rcu_bh; is rcu-sched already tested elsewhere by some other config, or does this parameter somehow enable testing both? The commit log now reads as follows, does that help? Thanx, Paul rcutorture: Test both RCU-sched and RCU-bh for Tiny RCU Tiny RCU supports both RCU-sched and RCU-bh, but only RCU-sched is currently tested by the rcutorture scripts. This commit therefore changes the TINY02 configuration to test RCU-bh, with TINY01 continuing to test RCU-sched. This shortcoming of the current rcutorture tests was located by mutation testing by Iftekhar. The idea behind mutation testing is to automatically mutate the code under test. If a given mutant is not caught by testing, this is a hint that the testing might need to be improved, as was the case here. Note that this is only a hint because it is possible to mutate the code into something else that still works. For example, a mutation that removes (say) a WARN_ON() will not normally result in a test failure. This change resulted in the test failure caused by list mishandling, which is fixed by the next commit. Reported-by: Ahmed, Iftekhar ahm...@onid.oregonstate.edu Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com Much better, thanks. In particular, the information about TINY01 and TINY02 was not obvious from the patch. Reviewed-by: Josh Triplett j...@joshtriplett.org -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH v6] Documentation/arch: Add Documentation/arch-features.txt
On Wed, May 13, 2015 at 09:27:57AM -0700, Josh Triplett wrote: How likely is this to get out of date? Are people going to remember to patch this when they add a feature to their architecture? If they found out they had work to do by reading this file, which is the goal, then they'll likely remember to edit the file; however, if they find the feature and fix it without knowing about the file, will someone notice? Is there any way we can *generate* this file from Kconfig? Can we extract the necessary this is possible to enable or this arch selects this symbol information from Kconfig, and together with the list of symbols for features needing architecture support, generate the table? Just tried this. Looks like it's pretty trivial for most of these features: just make ARCH=thearch allyesconfig, then look for the config symbol in the result. For instance, after doing make ARCH=alpha allyesconfig: ~/src/linux$ grep -x 'CONFIG_ARCH_HAS_ELF_RANDOMIZE=y' .config || echo TODO TODO ~/src/linux$ grep -x 'CONFIG_GENERIC_CLOCKEVENTS=y' .config || echo TODO CONFIG_GENERIC_CLOCKEVENTS=y This seems easy to turn into a shell script or Python script, given a list of feature descriptions and corresponding config symbols, as well as a list of architectures. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Tue, May 12, 2015 at 10:17:28AM +0200, Ingo Molnar wrote: > * Josh Triplett wrote: > > > Looks good to me, but I have not looked very deeply ... > > > > I sent out a v2 with the co-author information moved from the > > signoffs to the commit message. If it looks reasonable to you, can > > you take it through the tip tree please? > > So since this is multi-arch, and changes kernel/fork.c, I'd say -mm is > a more appropriate home for it? (Assuming Linus does not object.) Works for me. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] init/Kconfig: Fix break in middle of EXPERT menu
On Tue, May 12, 2015 at 09:04:55AM +0200, Ingo Molnar wrote: > > * Josh Triplett wrote: > > > Commit e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && > > !CONFIG_TRACING kernels, make it more configurable") made BPF_SYSCALL no > > longer hidden with !EXPERT, but left it in the middle of the EXPERT > > menu. menuconfig stops putting config items under a submenu once it > > encounters an item that doesn't depend on the menu's config item, so > > this caused the remainder of the EXPERT menu to spill out into the > > containing menu around it. Fix by moving BPF_SYSCALL before the EXPERT > > menu, next to BPF. > > > > Fixes: e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING > > kernels, make it more configurable") > > Signed-off-by: Josh Triplett > > --- > > > > Ingo, do you want to take this through -tip? Or should this go > > through some other tree? > > I can pick it up, but -mm might be better suited for this, if you do: Fair enough. > > I'm also thinking about splitting the entire EXPERT menu into a > > separate Kconfig.expert and including it from init/Kconfig, to make > > it clear that everything in that menu should only be visible if > > EXPERT. Right now, the long EXPERT menu blends into the longer > > init/Kconfig, and issues like this happen every few kernel releases. > > That's a good idea as well. I did so, in a patch later in this thread. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Tue, May 12, 2015 at 09:56:58AM +0530, Vineet Gupta wrote: > On Monday 11 May 2015 08:17 PM, Josh Triplett wrote: > > On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote: > >> On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote: > >>> clone with CLONE_SETTLS accepts an argument to set the thread-local > >>> storage area for the new thread. sys_clone declares an int argument > >>> tls_val in the appropriate point in the argument list (based on the > >>> various CLONE_BACKWARDS variants), but doesn't actually use or pass > >>> along that argument. Instead, sys_clone calls do_fork, which calls > >>> copy_process, which calls the arch-specific copy_thread, and copy_thread > >>> pulls the corresponding syscall argument out of the pt_regs captured at > >>> kernel entry (knowing what argument of clone that architecture passes > >>> tls in). > >>> > >>> Apart from being awful and inscrutable, that also only works because > >>> only one code path into copy_thread can pass the CLONE_SETTLS flag, and > >>> that code path comes from sys_clone with its architecture-specific > >>> argument-passing order. This prevents introducing a new version of the > >>> clone system call without propagating the same architecture-specific > >>> position of the tls argument. > >>> > >>> However, there's no reason to pull the argument out of pt_regs when > >>> sys_clone could just pass it down via C function call arguments. > >>> > >>> Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt > >>> into, and a new copy_thread_tls that accepts the tls parameter as an > >>> additional unsigned long (syscall-argument-sized) argument. > >>> Change sys_clone's tls argument to an unsigned long (which does > >>> not change the ABI), and pass that down to copy_thread_tls. > >>> > >>> Architectures that don't opt into copy_thread_tls will continue to > >>> ignore the C argument to sys_clone in favor of the pt_regs captured at > >>> kernel entry, and thus will be unable to introduce new versions of the > >>> clone syscall. > >>> > >>> Signed-off-by: Josh Triplett > >>> Signed-off-by: Thiago Macieira > >>> Acked-by: Andy Lutomirski > >>> --- > >>> arch/Kconfig | 7 ++ > >>> include/linux/sched.h| 14 > >>> include/linux/syscalls.h | 6 +++--- > >>> kernel/fork.c| 55 > >>> +++- > >>> 4 files changed, 60 insertions(+), 22 deletions(-) > >>> > >>> diff --git a/arch/Kconfig b/arch/Kconfig > >>> index 05d7a8a..4834a58 100644 > >>> --- a/arch/Kconfig > >>> +++ b/arch/Kconfig > >>> @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK > >>> This spares a stack switch and improves cache usage on softirq > >>> processing. > >>> > >>> +config HAVE_COPY_THREAD_TLS > >>> + bool > >>> + help > >>> + Architecture provides copy_thread_tls to accept tls argument via > >>> + normal C parameter passing, rather than extracting the syscall > >>> + argument from pt_regs. > >>> + > >>> # > >>> # ABI hall of shame > >>> # > >>> diff --git a/include/linux/sched.h b/include/linux/sched.h > >>> index a419b65..2cc88c6 100644 > >>> --- a/include/linux/sched.h > >>> +++ b/include/linux/sched.h > >>> @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct > >>> task_struct *task, unsigned int mode); > >>> /* Remove the current tasks stale references to the old mm_struct */ > >>> extern void mm_release(struct task_struct *, struct mm_struct *); > >>> > >>> +#ifdef CONFIG_HAVE_COPY_THREAD_TLS > >>> +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, > >>> + struct task_struct *, unsigned long); > >>> +#else > >>> extern int copy_thread(unsigned long, unsigned long, unsigned long, > >>> struct task_struct *); > >>> + > >>> +/* Architectures that haven't opted into copy_thread_tls get the tls > >>> argument > >>> + * via pt_regs, so ignore the tls argument passed via C. */ > >>> +static inline int
Re: [PATCH] init/Kconfig: Fix break in middle of EXPERT menu
On Tue, May 12, 2015 at 09:04:55AM +0200, Ingo Molnar wrote: * Josh Triplett j...@joshtriplett.org wrote: Commit e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y !CONFIG_TRACING kernels, make it more configurable) made BPF_SYSCALL no longer hidden with !EXPERT, but left it in the middle of the EXPERT menu. menuconfig stops putting config items under a submenu once it encounters an item that doesn't depend on the menu's config item, so this caused the remainder of the EXPERT menu to spill out into the containing menu around it. Fix by moving BPF_SYSCALL before the EXPERT menu, next to BPF. Fixes: e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y !CONFIG_TRACING kernels, make it more configurable) Signed-off-by: Josh Triplett j...@joshtriplett.org --- Ingo, do you want to take this through -tip? Or should this go through some other tree? I can pick it up, but -mm might be better suited for this, if you do: Fair enough. I'm also thinking about splitting the entire EXPERT menu into a separate Kconfig.expert and including it from init/Kconfig, to make it clear that everything in that menu should only be visible if EXPERT. Right now, the long EXPERT menu blends into the longer init/Kconfig, and issues like this happen every few kernel releases. That's a good idea as well. I did so, in a patch later in this thread. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Tue, May 12, 2015 at 09:56:58AM +0530, Vineet Gupta wrote: On Monday 11 May 2015 08:17 PM, Josh Triplett wrote: On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote: On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote: clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Signed-off-by: Josh Triplett j...@joshtriplett.org Signed-off-by: Thiago Macieira thiago.macie...@intel.com Acked-by: Andy Lutomirski l...@kernel.org --- arch/Kconfig | 7 ++ include/linux/sched.h| 14 include/linux/syscalls.h | 6 +++--- kernel/fork.c| 55 +++- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a..4834a58 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/include/linux/sched.h b/include/linux/sched.h index a419b65..2cc88c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif Is this detour really needed. Can we not update copy_thread() of all arches in one go and add the tls arg, w/o using it. And then arch maintainers can micro-optimize their code to use that arg vs. pt_regs-rxx version at their own leisure. The only downside I see with that is bigger churn (touches all arches), and a interim unused arg warning ? In addition to the cleanup and simplification, the purpose of this patch is specifically to make sure that any architecture opting into HAVE_COPY_THREAD_TLS does *not* care how tls is passed in, and in particular doesn't depend on it arriving in a specific syscall argument. Sorry for sounding dense, but as I see here, in the end even for non opting arches, copy_thread_tls() calling convention expects tls arg passed to it from sys_clone call stack, but simply drops it. So that arg is always available, has to be, otherwise even the pt_regs approach won't get to it. That just avoids the need for preprocessor conditionals in the middle
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Tue, May 12, 2015 at 10:17:28AM +0200, Ingo Molnar wrote: * Josh Triplett j...@joshtriplett.org wrote: Looks good to me, but I have not looked very deeply ... I sent out a v2 with the co-author information moved from the signoffs to the commit message. If it looks reasonable to you, can you take it through the tip tree please? So since this is multi-arch, and changes kernel/fork.c, I'd say -mm is a more appropriate home for it? (Assuming Linus does not object.) Works for me. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Tue, May 12, 2015 at 12:36:49AM +0200, Luis R. Rodriguez wrote: > On Mon, May 11, 2015 at 01:23:02PM -0700, Josh Triplett wrote: > > init/Kconfig| 232 > > +--- > > init/Kconfig.expert | 231 > > +++ > > 2 files changed, 232 insertions(+), 231 deletions(-) > > create mode 100644 init/Kconfig.expert > > I'm blinded by this patch, can you use git format-patch -M and also add: > > [diff] > > renamelimit=0 > > To your .gitconfig >From a quick test, doing what you suggest doesn't have any effect; if I also use -C1 instead of -M, then the patch to init/Kconfig looks exactly the same (deleting 231 lines and adding a "source" line), while the patch to init/Kconfig.expert goes from adding 231 lines to copying init/Kconfig and deleting 1800 lines. That does not seem like an improvement. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Tue, May 12, 2015 at 12:01:27AM +0200, Paul Bolle wrote: > On Mon, 2015-05-11 at 14:47 -0700, Josh Triplett wrote: > > On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote: > > > Is squashing those two lines worth a new kconfig mechanism? > > > > In my opinion, yes. If you use the implicit (and error-prone) > > menuconfig submenuing, you get a single entry with the '[ ]' and the > > submenu. There are currently 272 instances of "menuconfig" in Kconfig > > files. > > How many of those use the subtle trick EXPERT uses? A rough search suggests about 35, with the rest using an "if" right after the menu (presumably because they want dependencies rather than visibility). (A couple of which currently have the same broken-menu problem today's patch 1 fixed for EXPERT, and a couple of which are using that trick *across input files*.) > > I'd like to have a less error-prone mechanism for people to use, > > with an explicit "endmenu" at the end, and I don't want to leave any > > incentive for people to need the more error-prone version. > > > > I would be tempted to just make "menuconfig" require an endmenu, and > > convert all users, but that would almost certainly break many > > third-party users of kconfig. So instead, I'm currently extending > > "menu" (which already expects "endmenu") to allow the syntax > > "menu config SYMBOL", which acts like a combination of "config SYMBOL" > > and a menu with "visible if SYMBOL". > > Bikeshedding (before I'm even convinced of the need of this extension): > "menu config" is far too similar to "menuconfig". Yeah, agreed; given that people will still end up using "menuconfig" in light of the above, I'll use another syntax. > > Diffstat for the patch I'm testing > > right now: > > > > scripts/kconfig/zconf.y | 18 ++ > > 1 file changed, 18 insertions(+) > > > > That seems worthwhile to have a less error-prone menu mechanism. > > > > (The actual patch would also need to updated zconf.tab.c_shipped.) > > And some lines in Documentation/kbuild/kconfig-language.txt (speaking > from memory). Good point; this does need documentation. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Mon, May 11, 2015 at 11:50:21PM +0200, Paul Bolle wrote: > On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote: > > --- /dev/null > > +++ b/init/Kconfig.expert > > @@ -0,0 +1,231 @@ > > +menuconfig EXPERT > > + bool "Configure standard kernel features (expert users)" > > + # Unhide debug options, to make the on-by-default options visible > > + select DEBUG_KERNEL > > + help > > + This option allows certain base kernel options and settings > > + to be disabled or tweaked. This is for specialized > > + environments which can tolerate a "non-standard" kernel. > > + Only use this if you really know what you are doing. > > Comment here saying > # All entries in this file must have "if EXPERT" after their prompt > > or something to that effect (pending you patch, that is)? Yeah, I should add such a comment. I was hoping to make it obsolete via kconfig changes, but in the interim, sure. > > +config KALLSYMS > > +bool "Load all symbols for debugging/ksymoops" if EXPERT > > +default y > > +help > > + Say Y here to let the kernel print out symbolic crash information and > > + symbolic stack backtraces. This increases the size of the kernel > > + somewhat, as all symbols have to be loaded into the kernel image. > > + > > +config KALLSYMS_ALL > > + bool "Include all symbols in kallsyms" > > (For some reason this entry doesn't have if EXPERT but it seems to > behave as expected. Odd.) Because it depends on KALLSYMS. Magic! > > + depends on DEBUG_KERNEL && KALLSYMS > > + help > > + Normally kallsyms only contains the symbols of functions for nicer > > + OOPS messages and backtraces (i.e., symbols from the text and > > inittext > > + sections). This is sufficient for most cases. And only in very rare > > + cases (e.g., when a debugger is used) all symbols are required (e.g., > > + names of variables from the data sections, etc). > > + > > + This option makes sure that all symbols are loaded into the kernel > > + image (i.e., symbols from all sections) in cost of increased kernel > > + size (depending on the kernel configuration, it may be 300KiB or > > + something like this). > > + > > + Say N unless you really need all symbols. > > + > > +config PRINTK > > + default y > > + bool "Enable support for printk" if EXPERT > > Now you're touching this: bool [...] as the first line, please. I'd like the commit moving these to their own file to very obviously look like it's moving these lines unmodified, rather than making changes in the process. If you want this (and all the subsequent instances you flagged) cleaned up, let's do that as a subsequent patch separate from the move. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote: > On Mon, 2015-05-11 at 14:18 -0700, Josh Triplett wrote: > > However, that would produce *two* entries under the "General setup" > > menu: a yes/no entry "Configure standard kernel features (expert users)" > > with no submenu, and a "Some separate menu prompt here" entry with a > > submenu but no '[ ]' for a yes/no option. Integrating the two (without > > using menuconfig's implicit "add stuff to submenu until an option's > > prompt doesn't depend on this symbol" magic) requires new a kconfig > > mechanism. > > The diff pasted at the end of this message, which I quickly cobbled > together an applies on top of this 2/1, generates these two lines in > menuconfig (for EXPERT = 'y') > [*] Configure standard kernel features (expert users) > Standard kernel features ---> > > Is squashing those two lines worth a new kconfig mechanism? In my opinion, yes. If you use the implicit (and error-prone) menuconfig submenuing, you get a single entry with the '[ ]' and the submenu. There are currently 272 instances of "menuconfig" in Kconfig files. I'd like to have a less error-prone mechanism for people to use, with an explicit "endmenu" at the end, and I don't want to leave any incentive for people to need the more error-prone version. I would be tempted to just make "menuconfig" require an endmenu, and convert all users, but that would almost certainly break many third-party users of kconfig. So instead, I'm currently extending "menu" (which already expects "endmenu") to allow the syntax "menu config SYMBOL", which acts like a combination of "config SYMBOL" and a menu with "visible if SYMBOL". Diffstat for the patch I'm testing right now: scripts/kconfig/zconf.y | 18 ++ 1 file changed, 18 insertions(+) That seems worthwhile to have a less error-prone menu mechanism. (The actual patch would also need to updated zconf.tab.c_shipped.) (Also, the diff you posted would be smaller if you left "config EXPERT" at the top of init/Kconfig.expert; why the move?) - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Mon, May 11, 2015 at 11:01:22PM +0200, Paul Bolle wrote: > On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote: > > I'd also like to factor the "if EXPERT" off of all the prompts and into a > > single scoped item wrapped around all of them, but kconfig doesn't have any > > way > > to do that. "menuconfig" is just a hint, with no matching "endmenu" and no > > implicit visibility; "menu" is scoped and has "visible if", but that would > > create a separate option containing a menu, rather than a menu under > > EXPERT's > > "Configure standard kernel features (expert users)". And "if EXPERT ... > > endif" > > produces a dependency, not a prompt-visibility condition. So I think this > > would require changes to the Kconfig language, to introduce either a scoped > > "visible if EXPERT ... endvisible" or similar, or a scoped version of > > menuconfig with a matching "endmenu" and implicit visibility (effectively a > > "menu" statement with attached "config" rather than a "config" with a hint > > "this might be a menu"). I'm leaning towards the latter. > > The behavior of menuconfig in this case is rather subtle. I must admit I > never noticed it. > > The "visible" option to menus is little used, and I'm not really > familiar with it. So, for what it's worth: would adding a new menu with > visible if EXPERT > > attached to it, and putting all current > prompt "Foo" if EXPERT > > entries in that menu roughly do what you want? Not quite. menu has a prompt but no associated symbol, so with current kconfig that would look like: config EXPERT bool "Configure standard kernel features (expert users)" ... menu "Some separate menu prompt here" visible if EXPERT ... various options that currently have if EXPERT on their prompts ... endmenu However, that would produce *two* entries under the "General setup" menu: a yes/no entry "Configure standard kernel features (expert users)" with no submenu, and a "Some separate menu prompt here" entry with a submenu but no '[ ]' for a yes/no option. Integrating the two (without using menuconfig's implicit "add stuff to submenu until an option's prompt doesn't depend on this symbol" magic) requires new a kconfig mechanism. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
The expert menu frequently gets broken by a config item in the middle that leaves off the "if EXPERT" from its prompt. This results in the remainder of the menu spilling out into the parent "General setup" menu. Move the entire expert menu into a separate Kconfig file, init/Kconfig.expert, to make this harder to do accidentally, and to break up the exceedingly long init/Kconfig a bit. Signed-off-by: Josh Triplett --- This applies on top of "init/Kconfig: Fix break in middle of EXPERT menu". Please apply both. I'd also like to factor the "if EXPERT" off of all the prompts and into a single scoped item wrapped around all of them, but kconfig doesn't have any way to do that. "menuconfig" is just a hint, with no matching "endmenu" and no implicit visibility; "menu" is scoped and has "visible if", but that would create a separate option containing a menu, rather than a menu under EXPERT's "Configure standard kernel features (expert users)". And "if EXPERT ... endif" produces a dependency, not a prompt-visibility condition. So I think this would require changes to the Kconfig language, to introduce either a scoped "visible if EXPERT ... endvisible" or similar, or a scoped version of menuconfig with a matching "endmenu" and implicit visibility (effectively a "menu" statement with attached "config" rather than a "config" with a hint "this might be a menu"). I'm leaning towards the latter. So I'll send a followup patch enhancing kconfig to improve this case, but I think splitting this into a separate file is still worth it even without that. init/Kconfig| 232 +--- init/Kconfig.expert | 231 +++ 2 files changed, 232 insertions(+), 231 deletions(-) create mode 100644 init/Kconfig.expert diff --git a/init/Kconfig b/init/Kconfig index e2f16f1..a2de3f5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1351,237 +1351,7 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. -menuconfig EXPERT - bool "Configure standard kernel features (expert users)" - # Unhide debug options, to make the on-by-default options visible - select DEBUG_KERNEL - help - This option allows certain base kernel options and settings - to be disabled or tweaked. This is for specialized - environments which can tolerate a "non-standard" kernel. - Only use this if you really know what you are doing. - -config UID16 - bool "Enable 16-bit UID system calls" if EXPERT - depends on HAVE_UID16 && MULTIUSER - default y - help - This enables the legacy 16-bit UID syscall wrappers. - -config MULTIUSER - bool "Multiple users, groups and capabilities support" if EXPERT - default y - help - This option enables support for non-root users, groups and - capabilities. - - If you say N here, all processes will run with UID 0, GID 0, and all - possible capabilities. Saying N here also compiles out support for - system calls related to UIDs, GIDs, and capabilities, such as setuid, - setgid, and capset. - - If unsure, say Y here. - -config SGETMASK_SYSCALL - bool "sgetmask/ssetmask syscalls support" if EXPERT - def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH - ---help--- - sys_sgetmask and sys_ssetmask are obsolete system calls - no longer supported in libc but still enabled by default in some - architectures. - - If unsure, leave the default option here. - -config SYSFS_SYSCALL - bool "Sysfs syscall support" if EXPERT - default y - ---help--- - sys_sysfs is an obsolete system call no longer supported in libc. - Note that disabling this option is more secure but might break - compatibility with some systems. - - If unsure say Y here. - -config SYSCTL_SYSCALL - bool "Sysctl syscall support" if EXPERT - depends on PROC_SYSCTL - default n - select SYSCTL - ---help--- - sys_sysctl uses binary paths that have been found challenging - to properly maintain and use. The interface in /proc/sys - using paths with ascii names is now the primary path to this - information. - - Almost nothing using the binary sysctl interface so if you are - trying to save some space it is probably safe to disable this, - making your kernel marginally smaller. - - If unsure say N here. - -config KALLSYMS -bool "Load all symbols
[PATCHv2 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit
For 32-bit userspace on a 64-bit kernel, this requires modifying stub32_clone to actually swap the appropriate arguments to match CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls broken. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 +++--- arch/x86/kernel/process_64.c | 8 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b7d31ca..4960b0d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 + select HAVE_COPY_THREAD_TLS select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebca..0286735 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -487,7 +487,7 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip),%rax - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f9..ead28ff 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43..c69cabc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(>thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Mon, May 11, 2015 at 04:00:43PM +0200, Ingo Molnar wrote: > > * Josh Triplett wrote: > > > On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote: > > > > > > * j...@joshtriplett.org wrote: > > > > > > > On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote: > > > > > On Tue, 21 Apr 2015, Josh Triplett wrote: > > > > > > > > > > > > Signed-off-by: Josh Triplett > > > > > > Signed-off-by: Thiago Macieira > > > > > > > > > > Can you please clarify that SOB chain? It does not make any sense. > > > > > > > > Co-authored patch. We both worked on it together, and sadly git > > > > doesn't support a commit with multiple authors, so this is the next > > > > best thing. > > > > > > No, this is not a valid SOB chain. > > > > > > For 'co authored' patches you can add credits either to the file, > > > as two copyright lines, or via the changelog, no need to mess up > > > the SOB chain for that. > > > > Fine, I'll write it another way. > > > > Do you see any other issues with the patch other than the signoffs? > > Looks good to me, but I have not looked very deeply ... I sent out a v2 with the co-author information moved from the signoffs to the commit message. If it looks reasonable to you, can you take it through the tip tree please? - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2 1/2] clone: Support passing tls argument via C rather than pt_regs magic
clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett Acked-by: Andy Lutomirski --- arch/Kconfig | 7 ++ include/linux/sched.h| 14 include/linux/syscalls.h | 6 +++--- kernel/fork.c| 55 +++- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a..4834a58 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/include/linux/sched.h b/include/linux/sched.h index a419b65..2cc88c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38..bb51bec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index cf65139..b3dadf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, - int trace
[PATCHv2 0/2] clone: Support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter "tls" that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. v2: Move co-author from signoffs to a note in the commit message, as required by Ingo Molnar. Josh Triplett (2): clone: Support passing tls argument via C rather than pt_regs magic x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit arch/Kconfig | 7 ++ arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 ++--- arch/x86/kernel/process_64.c | 8 +++ include/linux/sched.h| 14 +++ include/linux/syscalls.h | 6 ++--- kernel/fork.c| 55 +--- 8 files changed, 69 insertions(+), 30 deletions(-) -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] force inlining of spinlock ops
On Mon, May 11, 2015 at 07:57:22PM +0200, Denys Vlasenko wrote: > With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline > very small functions we expect to be inlined. In particular, > with this config: http://busybox.net/~vda/kernel_config > there are more than a thousand copies of tiny spinlock-related functions: > > $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort > -rn | grep ' spin' > 473 000b t spin_unlock_irqrestore > 292 000b t spin_unlock > 215 000b t spin_lock > 134 000b t spin_unlock_irq > 130 000b t spin_unlock_bh > 120 000b t spin_lock_irq > 106 000b t spin_lock_bh > > Disassembly: > > 81004720 : > 81004720: 55 push %rbp > 81004721: 48 89 e5mov%rsp,%rbp > 81004724: e8 f8 4e e2 02 callq <_raw_spin_lock> > 81004729: 5d pop%rbp > 8100472a: c3 retq Frame pointers make this even more awful, since without them this could just become a single jmp. (Assuming _raw_spin_lock shouldn't be inlined too.) > This patch fixes this via s/inline/__always_inline/ in spinlock.h. > This decreases vmlinux by about 30k: > > text data bss dec hex filename > 82375570 22255544 20627456 125258570 7774b4a vmlinux.before > 82335059 22255416 20627456 125217931 776ac8b vmlinux Nice improvement. Given that this actually makes the kernel *smaller*, presumably in addition to faster, this forced inlining seems completely reasonable. > Signed-off-by: Denys Vlasenko > Cc: Thomas Graf > Cc: David S. Miller > Cc: Bart Van Assche > Cc: Peter Zijlstra > Cc: David Rientjes > Cc: David S. Miller > Cc: Andrew Morton > Cc: Linus Torvalds > Cc: Oleg Nesterov > Cc: Paul E. McKenney > Cc: Ingo Molnar > Cc: Paul E. McKenney > CC: linux-kernel@vger.kernel.org Reviewed-by: Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] init/Kconfig: Fix break in middle of EXPERT menu
Commit e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable") made BPF_SYSCALL no longer hidden with !EXPERT, but left it in the middle of the EXPERT menu. menuconfig stops putting config items under a submenu once it encounters an item that doesn't depend on the menu's config item, so this caused the remainder of the EXPERT menu to spill out into the containing menu around it. Fix by moving BPF_SYSCALL before the EXPERT menu, next to BPF. Fixes: e1abf2cc8d5 ("bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable") Signed-off-by: Josh Triplett --- Ingo, do you want to take this through -tip? Or should this go through some other tree? I'm also thinking about splitting the entire EXPERT menu into a separate Kconfig.expert and including it from init/Kconfig, to make it clear that everything in that menu should only be visible if EXPERT. Right now, the long EXPERT menu blends into the longer init/Kconfig, and issues like this happen every few kernel releases. init/Kconfig | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index dc24dec..e2f16f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM config BPF bool +# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + menuconfig EXPERT bool "Configure standard kernel features (expert users)" # Unhide debug options, to make the on-by-default options visible @@ -1535,16 +1545,6 @@ config EVENTFD If unsure, say Y. -# syscall, maps, verifier -config BPF_SYSCALL - bool "Enable bpf() system call" - select ANON_INODES - select BPF - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - config SHMEM bool "Use full shmem filesystem" if EXPERT default y -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V4] rcu: change function declaration to bool
On Mon, May 11, 2015 at 06:12:27PM +0200, Nicholas Mc Guire wrote: > rcu_cpu_has_callbacks() is declared int. The current declaration was > introduced > in commit c0f4dfd4f90f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered > callbacks). But it is actually returning bool and as the function description > states " * Return true if the specified CPU has any callback", this > probably > should be a bool as all (3) call-sites currently treat it as bool. > > Type-checking coccinelle spatches are being used to locate type mismatches > between function signatures and return values in this case this produced: > ./kernel/rcu/tree.c:3538 WARNING: return of wrong type > int != bool, > > Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) > > Patch is against 4.1-rc3 (localversion-next is -next-20150511) and fixes > > Signed-off-by: Nicholas Mc Guire Seems like a reasonable addition. Reviewed-by: Josh Triplett > --- > > V4: fix-up to include the origin of the issue being fixed as requeseted by > Steven Rostedt . > > kernel/rcu/tree.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index bcc5943..599550c 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -3516,7 +3516,7 @@ static int rcu_pending(void) > * non-NULL, store an indication of whether all callbacks are lazy. > * (If there are no callbacks, all of them are deemed to be lazy.) > */ > -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) > +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) > { > bool al = true; > bool hc = false; > -- > 1.7.10.4 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] rcu: change function declaration to bool
On Mon, May 11, 2015 at 05:46:12PM +0200, Nicholas Mc Guire wrote: > rcu_cpu_has_callbacks() is declared int but is actually returning bool > and as the function description states " * Return true if the specified > CPU has any callback", this probably should be a bool. All (3) > call-sites currently treat it as bool. > > Type-checking coccinelle spatches are being used to locate type mismatches > between function signatures and return values in this case this produced: > ./kernel/rcu/tree.c:3538 WARNING: return of wrong type > int != bool, > > Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) > > Patch is against 4.1-rc3 (localversion-next is -next-20150511) > > Signed-off-by: Nicholas Mc Guire Reviewed-by: Josh Triplett Thanks! > --- > > V3: fix-up of commit message again (hope I got it right this time) as > requested by Josh Triplett /Steven Rostedt > > > kernel/rcu/tree.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index bcc5943..599550c 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -3516,7 +3516,7 @@ static int rcu_pending(void) > * non-NULL, store an indication of whether all callbacks are lazy. > * (If there are no callbacks, all of them are deemed to be lazy.) > */ > -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) > +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) > { > bool al = true; > bool hc = false; > -- > 1.7.10.4 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] rcu: change function declaration to bool
On Mon, May 11, 2015 at 11:28:30AM -0400, Steven Rostedt wrote: > On Mon, 11 May 2015 17:10:59 +0200 > Nicholas Mc Guire wrote: > > > rcu_cpu_has_callbacks() is declared int but is actually returning bool and > > and as the function description states " * Return true if the specified > > CPU has any callback", this probably should be a bool. All (3) > > call-sites currently treat it as bool so the declaration. > > > > > > Signed-off-by: Nicholas Mc Guire > > --- > > > > V2: fixed up commit message and tool infos as requested by > > Josh Triplett > > > > Type-checking coccinelle spatches are being used to locate type > > mismatches between function signatures and return values. > > ./kernel/rcu/tree.c:3538 WARNING: return of wrong type > > int != bool, > > > > Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) > > > > Patch is against 4.1-rc3 (localversion-next is -next-20150511) > > I think what Josh was saying is that all the above except for the "V2" > should be above the signature. Everything between the "---" and the > patch gets tossed out when committed into git. > > Giving credit to coccinelle and even what branch and config was used > for testing is something we want in the git change log history. Yes, exactly. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote: > On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote: > > clone with CLONE_SETTLS accepts an argument to set the thread-local > > storage area for the new thread. sys_clone declares an int argument > > tls_val in the appropriate point in the argument list (based on the > > various CLONE_BACKWARDS variants), but doesn't actually use or pass > > along that argument. Instead, sys_clone calls do_fork, which calls > > copy_process, which calls the arch-specific copy_thread, and copy_thread > > pulls the corresponding syscall argument out of the pt_regs captured at > > kernel entry (knowing what argument of clone that architecture passes > > tls in). > > > > Apart from being awful and inscrutable, that also only works because > > only one code path into copy_thread can pass the CLONE_SETTLS flag, and > > that code path comes from sys_clone with its architecture-specific > > argument-passing order. This prevents introducing a new version of the > > clone system call without propagating the same architecture-specific > > position of the tls argument. > > > > However, there's no reason to pull the argument out of pt_regs when > > sys_clone could just pass it down via C function call arguments. > > > > Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt > > into, and a new copy_thread_tls that accepts the tls parameter as an > > additional unsigned long (syscall-argument-sized) argument. > > Change sys_clone's tls argument to an unsigned long (which does > > not change the ABI), and pass that down to copy_thread_tls. > > > > Architectures that don't opt into copy_thread_tls will continue to > > ignore the C argument to sys_clone in favor of the pt_regs captured at > > kernel entry, and thus will be unable to introduce new versions of the > > clone syscall. > > > > Signed-off-by: Josh Triplett > > Signed-off-by: Thiago Macieira > > Acked-by: Andy Lutomirski > > --- > > arch/Kconfig | 7 ++ > > include/linux/sched.h| 14 > > include/linux/syscalls.h | 6 +++--- > > kernel/fork.c| 55 > > +++- > > 4 files changed, 60 insertions(+), 22 deletions(-) > > > > diff --git a/arch/Kconfig b/arch/Kconfig > > index 05d7a8a..4834a58 100644 > > --- a/arch/Kconfig > > +++ b/arch/Kconfig > > @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK > > This spares a stack switch and improves cache usage on softirq > > processing. > > > > +config HAVE_COPY_THREAD_TLS > > + bool > > + help > > + Architecture provides copy_thread_tls to accept tls argument via > > + normal C parameter passing, rather than extracting the syscall > > + argument from pt_regs. > > + > > # > > # ABI hall of shame > > # > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > index a419b65..2cc88c6 100644 > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct > > task_struct *task, unsigned int mode); > > /* Remove the current tasks stale references to the old mm_struct */ > > extern void mm_release(struct task_struct *, struct mm_struct *); > > > > +#ifdef CONFIG_HAVE_COPY_THREAD_TLS > > +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, > > + struct task_struct *, unsigned long); > > +#else > > extern int copy_thread(unsigned long, unsigned long, unsigned long, > > struct task_struct *); > > + > > +/* Architectures that haven't opted into copy_thread_tls get the tls > > argument > > + * via pt_regs, so ignore the tls argument passed via C. */ > > +static inline int copy_thread_tls( > > + unsigned long clone_flags, unsigned long sp, unsigned long arg, > > + struct task_struct *p, unsigned long tls) > > +{ > > + return copy_thread(clone_flags, sp, arg, p); > > +} > > +#endif > > Is this detour really needed. Can we not update copy_thread() of all arches > in one > go and add the tls arg, w/o using it. > > And then arch maintainers can micro-optimize their code to use that arg vs. > pt_regs->rxx version at their own leisure. The only downside I see with that > is > bigger churn (touches all arches), and a interim unused arg warning ? In addition to the cleanup and simplification, the purpose of this patch is specifically to ma
Re: [PATCH] rcu: change function declaration to bool
On Mon, May 11, 2015 at 10:51:55AM +0200, Nicholas Mc Guire wrote: > rcu_cpu_has_callbacks() is declared int but is actually returning bool and > all call-sites currently use it as bool so the declaration should be bool > as well. > > Signed-off-by: Nicholas Mc Guire The patch seems reasonable to me. However... > --- > > ./kernel/rcu/tree.c:3538 WARNING: return of wrong type > int != bool, > > as the description of rcu_cpu_has_callbacks() states: > " * Return true if the specified CPU has any callback" > this probably should be a bool > All (3) call sites are conditions and are treating it as boolean. > > Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) ...some of this information should be in the commit message, as well as a description of what tool produced this warning. With that changed, Reviewed-by: Josh Triplett > Patch is against 4.1-rc3 (localversion-next is -next-20150511) > > kernel/rcu/tree.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index bcc5943..599550c 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -3516,7 +3516,7 @@ static int rcu_pending(void) > * non-NULL, store an indication of whether all callbacks are lazy. > * (If there are no callbacks, all of them are deemed to be lazy.) > */ > -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) > +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) > { > bool al = true; > bool hc = false; > -- > 1.7.10.4 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote: > > * j...@joshtriplett.org wrote: > > > On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote: > > > On Tue, 21 Apr 2015, Josh Triplett wrote: > > > > > > > > Signed-off-by: Josh Triplett > > > > Signed-off-by: Thiago Macieira > > > > > > Can you please clarify that SOB chain? It does not make any sense. > > > > Co-authored patch. We both worked on it together, and sadly git > > doesn't support a commit with multiple authors, so this is the next > > best thing. > > No, this is not a valid SOB chain. > > For 'co authored' patches you can add credits either to the file, as > two copyright lines, or via the changelog, no need to mess up the SOB > chain for that. Fine, I'll write it another way. Do you see any other issues with the patch other than the signoffs? - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote: * j...@joshtriplett.org j...@joshtriplett.org wrote: On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote: On Tue, 21 Apr 2015, Josh Triplett wrote: Signed-off-by: Josh Triplett j...@joshtriplett.org Signed-off-by: Thiago Macieira thiago.macie...@intel.com Can you please clarify that SOB chain? It does not make any sense. Co-authored patch. We both worked on it together, and sadly git doesn't support a commit with multiple authors, so this is the next best thing. No, this is not a valid SOB chain. For 'co authored' patches you can add credits either to the file, as two copyright lines, or via the changelog, no need to mess up the SOB chain for that. Fine, I'll write it another way. Do you see any other issues with the patch other than the signoffs? - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rcu: change function declaration to bool
On Mon, May 11, 2015 at 10:51:55AM +0200, Nicholas Mc Guire wrote: rcu_cpu_has_callbacks() is declared int but is actually returning bool and all call-sites currently use it as bool so the declaration should be bool as well. Signed-off-by: Nicholas Mc Guire hof...@osadl.org The patch seems reasonable to me. However... --- ./kernel/rcu/tree.c:3538 WARNING: return of wrong type int != bool, as the description of rcu_cpu_has_callbacks() states: * Return true if the specified CPU has any callback this probably should be a bool All (3) call sites are conditions and are treating it as boolean. Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) ...some of this information should be in the commit message, as well as a description of what tool produced this warning. With that changed, Reviewed-by: Josh Triplett j...@joshtriplett.org Patch is against 4.1-rc3 (localversion-next is -next-20150511) kernel/rcu/tree.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcc5943..599550c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3516,7 +3516,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Mon, May 11, 2015 at 02:31:39PM +, Vineet Gupta wrote: On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote: clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Signed-off-by: Josh Triplett j...@joshtriplett.org Signed-off-by: Thiago Macieira thiago.macie...@intel.com Acked-by: Andy Lutomirski l...@kernel.org --- arch/Kconfig | 7 ++ include/linux/sched.h| 14 include/linux/syscalls.h | 6 +++--- kernel/fork.c| 55 +++- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a..4834a58 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/include/linux/sched.h b/include/linux/sched.h index a419b65..2cc88c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif Is this detour really needed. Can we not update copy_thread() of all arches in one go and add the tls arg, w/o using it. And then arch maintainers can micro-optimize their code to use that arg vs. pt_regs-rxx version at their own leisure. The only downside I see with that is bigger churn (touches all arches), and a interim unused arg warning ? In addition to the cleanup and simplification, the purpose of this patch is specifically to make sure that any architecture opting into HAVE_COPY_THREAD_TLS does *not* care how tls is passed in, and in particular doesn't depend on it arriving in a specific syscall argument. I have patches in flight (for CLONE_FD and clone4) that depend on that assumption, by introducing additional syscalls (with tls passed differently) that call down through these same code paths. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] rcu: change function declaration to bool
On Mon, May 11, 2015 at 11:28:30AM -0400, Steven Rostedt wrote: On Mon, 11 May 2015 17:10:59 +0200 Nicholas Mc Guire hof...@osadl.org wrote: rcu_cpu_has_callbacks() is declared int but is actually returning bool and and as the function description states * Return true if the specified CPU has any callback, this probably should be a bool. All (3) call-sites currently treat it as bool so the declaration. Signed-off-by: Nicholas Mc Guire hof...@osadl.org --- V2: fixed up commit message and tool infos as requested by Josh Triplett j...@joshtriplett.org Type-checking coccinelle spatches are being used to locate type mismatches between function signatures and return values. ./kernel/rcu/tree.c:3538 WARNING: return of wrong type int != bool, Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) Patch is against 4.1-rc3 (localversion-next is -next-20150511) I think what Josh was saying is that all the above except for the V2 should be above the signature. Everything between the --- and the patch gets tossed out when committed into git. Giving credit to coccinelle and even what branch and config was used for testing is something we want in the git change log history. Yes, exactly. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] init/Kconfig: Fix break in middle of EXPERT menu
Commit e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y !CONFIG_TRACING kernels, make it more configurable) made BPF_SYSCALL no longer hidden with !EXPERT, but left it in the middle of the EXPERT menu. menuconfig stops putting config items under a submenu once it encounters an item that doesn't depend on the menu's config item, so this caused the remainder of the EXPERT menu to spill out into the containing menu around it. Fix by moving BPF_SYSCALL before the EXPERT menu, next to BPF. Fixes: e1abf2cc8d5 (bpf: Fix the build on BPF_SYSCALL=y !CONFIG_TRACING kernels, make it more configurable) Signed-off-by: Josh Triplett j...@joshtriplett.org --- Ingo, do you want to take this through -tip? Or should this go through some other tree? I'm also thinking about splitting the entire EXPERT menu into a separate Kconfig.expert and including it from init/Kconfig, to make it clear that everything in that menu should only be visible if EXPERT. Right now, the long EXPERT menu blends into the longer init/Kconfig, and issues like this happen every few kernel releases. init/Kconfig | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index dc24dec..e2f16f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,16 @@ config HAVE_PCSPKR_PLATFORM config BPF bool +# syscall, maps, verifier +config BPF_SYSCALL + bool Enable bpf() system call + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + menuconfig EXPERT bool Configure standard kernel features (expert users) # Unhide debug options, to make the on-by-default options visible @@ -1535,16 +1545,6 @@ config EVENTFD If unsure, say Y. -# syscall, maps, verifier -config BPF_SYSCALL - bool Enable bpf() system call - select ANON_INODES - select BPF - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - config SHMEM bool Use full shmem filesystem if EXPERT default y -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] force inlining of spinlock ops
On Mon, May 11, 2015 at 07:57:22PM +0200, Denys Vlasenko wrote: With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. In particular, with this config: http://busybox.net/~vda/kernel_config there are more than a thousand copies of tiny spinlock-related functions: $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin' 473 000b t spin_unlock_irqrestore 292 000b t spin_unlock 215 000b t spin_lock 134 000b t spin_unlock_irq 130 000b t spin_unlock_bh 120 000b t spin_lock_irq 106 000b t spin_lock_bh Disassembly: 81004720 spin_lock: 81004720: 55 push %rbp 81004721: 48 89 e5mov%rsp,%rbp 81004724: e8 f8 4e e2 02 callq _raw_spin_lock 81004729: 5d pop%rbp 8100472a: c3 retq Frame pointers make this even more awful, since without them this could just become a single jmp. (Assuming _raw_spin_lock shouldn't be inlined too.) This patch fixes this via s/inline/__always_inline/ in spinlock.h. This decreases vmlinux by about 30k: text data bss dec hex filename 82375570 22255544 20627456 125258570 7774b4a vmlinux.before 82335059 22255416 20627456 125217931 776ac8b vmlinux Nice improvement. Given that this actually makes the kernel *smaller*, presumably in addition to faster, this forced inlining seems completely reasonable. Signed-off-by: Denys Vlasenko dvlas...@redhat.com Cc: Thomas Graf tg...@suug.ch Cc: David S. Miller da...@davemloft.net Cc: Bart Van Assche bvanass...@acm.org Cc: Peter Zijlstra pet...@infradead.org Cc: David Rientjes rient...@google.com Cc: David S. Miller da...@davemloft.net Cc: Andrew Morton a...@linux-foundation.org Cc: Linus Torvalds torva...@linux-foundation.org Cc: Oleg Nesterov o...@redhat.com Cc: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: Ingo Molnar mi...@kernel.org Cc: Paul E. McKenney paul...@linux.vnet.ibm.com CC: linux-kernel@vger.kernel.org Reviewed-by: Josh Triplett j...@joshtriplett.org -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2 1/2] clone: Support passing tls argument via C rather than pt_regs magic
clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett j...@joshtriplett.org Acked-by: Andy Lutomirski l...@kernel.org --- arch/Kconfig | 7 ++ include/linux/sched.h| 14 include/linux/syscalls.h | 6 +++--- kernel/fork.c| 55 +++- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a..4834a58 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/include/linux/sched.h b/include/linux/sched.h index a419b65..2cc88c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38..bb51bec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index cf65139..b3dadf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid
Re: [PATCH V4] rcu: change function declaration to bool
On Mon, May 11, 2015 at 06:12:27PM +0200, Nicholas Mc Guire wrote: rcu_cpu_has_callbacks() is declared int. The current declaration was introduced in commit c0f4dfd4f90f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks). But it is actually returning bool and as the function description states * Return true if the specified CPU has any callback, this probably should be a bool as all (3) call-sites currently treat it as bool. Type-checking coccinelle spatches are being used to locate type mismatches between function signatures and return values in this case this produced: ./kernel/rcu/tree.c:3538 WARNING: return of wrong type int != bool, Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) Patch is against 4.1-rc3 (localversion-next is -next-20150511) and fixes Signed-off-by: Nicholas Mc Guire hof...@osadl.org Seems like a reasonable addition. Reviewed-by: Josh Triplett j...@joshtriplett.org --- V4: fix-up to include the origin of the issue being fixed as requeseted by Steven Rostedt rost...@goodmis.org. kernel/rcu/tree.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcc5943..599550c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3516,7 +3516,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit
For 32-bit userspace on a 64-bit kernel, this requires modifying stub32_clone to actually swap the appropriate arguments to match CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls broken. Patch co-authored by Josh Triplett and Thiago Macieira. Signed-off-by: Josh Triplett j...@joshtriplett.org Acked-by: Andy Lutomirski l...@kernel.org --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 +++--- arch/x86/kernel/process_64.c | 8 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b7d31ca..4960b0d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 + select HAVE_COPY_THREAD_TLS select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebca..0286735 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -487,7 +487,7 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip),%rax - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f9..ead28ff 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs-si, 0); + (struct user_desc __user *)tls, 0); if (err p-thread.io_bitmap_ptr) { kfree(p-thread.io_bitmap_ptr); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43..c69cabc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(t-thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs-si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs-r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
On Mon, May 11, 2015 at 04:00:43PM +0200, Ingo Molnar wrote: * Josh Triplett j...@joshtriplett.org wrote: On Mon, May 11, 2015 at 12:13:13PM +0200, Ingo Molnar wrote: * j...@joshtriplett.org j...@joshtriplett.org wrote: On Tue, May 05, 2015 at 08:53:03PM +0200, Thomas Gleixner wrote: On Tue, 21 Apr 2015, Josh Triplett wrote: Signed-off-by: Josh Triplett j...@joshtriplett.org Signed-off-by: Thiago Macieira thiago.macie...@intel.com Can you please clarify that SOB chain? It does not make any sense. Co-authored patch. We both worked on it together, and sadly git doesn't support a commit with multiple authors, so this is the next best thing. No, this is not a valid SOB chain. For 'co authored' patches you can add credits either to the file, as two copyright lines, or via the changelog, no need to mess up the SOB chain for that. Fine, I'll write it another way. Do you see any other issues with the patch other than the signoffs? Looks good to me, but I have not looked very deeply ... I sent out a v2 with the co-author information moved from the signoffs to the commit message. If it looks reasonable to you, can you take it through the tip tree please? - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2 0/2] clone: Support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter tls that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. v2: Move co-author from signoffs to a note in the commit message, as required by Ingo Molnar. Josh Triplett (2): clone: Support passing tls argument via C rather than pt_regs magic x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit arch/Kconfig | 7 ++ arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 ++--- arch/x86/kernel/process_64.c | 8 +++ include/linux/sched.h| 14 +++ include/linux/syscalls.h | 6 ++--- kernel/fork.c| 55 +--- 8 files changed, 69 insertions(+), 30 deletions(-) -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote: On Mon, 2015-05-11 at 14:18 -0700, Josh Triplett wrote: However, that would produce *two* entries under the General setup menu: a yes/no entry Configure standard kernel features (expert users) with no submenu, and a Some separate menu prompt here entry with a submenu but no '[ ]' for a yes/no option. Integrating the two (without using menuconfig's implicit add stuff to submenu until an option's prompt doesn't depend on this symbol magic) requires new a kconfig mechanism. The diff pasted at the end of this message, which I quickly cobbled together an applies on top of this 2/1, generates these two lines in menuconfig (for EXPERT = 'y') [*] Configure standard kernel features (expert users) Standard kernel features --- Is squashing those two lines worth a new kconfig mechanism? In my opinion, yes. If you use the implicit (and error-prone) menuconfig submenuing, you get a single entry with the '[ ]' and the submenu. There are currently 272 instances of menuconfig in Kconfig files. I'd like to have a less error-prone mechanism for people to use, with an explicit endmenu at the end, and I don't want to leave any incentive for people to need the more error-prone version. I would be tempted to just make menuconfig require an endmenu, and convert all users, but that would almost certainly break many third-party users of kconfig. So instead, I'm currently extending menu (which already expects endmenu) to allow the syntax menu config SYMBOL, which acts like a combination of config SYMBOL and a menu with visible if SYMBOL. Diffstat for the patch I'm testing right now: scripts/kconfig/zconf.y | 18 ++ 1 file changed, 18 insertions(+) That seems worthwhile to have a less error-prone menu mechanism. (The actual patch would also need to updated zconf.tab.c_shipped.) (Also, the diff you posted would be smaller if you left config EXPERT at the top of init/Kconfig.expert; why the move?) - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
The expert menu frequently gets broken by a config item in the middle that leaves off the if EXPERT from its prompt. This results in the remainder of the menu spilling out into the parent General setup menu. Move the entire expert menu into a separate Kconfig file, init/Kconfig.expert, to make this harder to do accidentally, and to break up the exceedingly long init/Kconfig a bit. Signed-off-by: Josh Triplett j...@joshtriplett.org --- This applies on top of init/Kconfig: Fix break in middle of EXPERT menu. Please apply both. I'd also like to factor the if EXPERT off of all the prompts and into a single scoped item wrapped around all of them, but kconfig doesn't have any way to do that. menuconfig is just a hint, with no matching endmenu and no implicit visibility; menu is scoped and has visible if, but that would create a separate option containing a menu, rather than a menu under EXPERT's Configure standard kernel features (expert users). And if EXPERT ... endif produces a dependency, not a prompt-visibility condition. So I think this would require changes to the Kconfig language, to introduce either a scoped visible if EXPERT ... endvisible or similar, or a scoped version of menuconfig with a matching endmenu and implicit visibility (effectively a menu statement with attached config rather than a config with a hint this might be a menu). I'm leaning towards the latter. So I'll send a followup patch enhancing kconfig to improve this case, but I think splitting this into a separate file is still worth it even without that. init/Kconfig| 232 +--- init/Kconfig.expert | 231 +++ 2 files changed, 232 insertions(+), 231 deletions(-) create mode 100644 init/Kconfig.expert diff --git a/init/Kconfig b/init/Kconfig index e2f16f1..a2de3f5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1351,237 +1351,7 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. -menuconfig EXPERT - bool Configure standard kernel features (expert users) - # Unhide debug options, to make the on-by-default options visible - select DEBUG_KERNEL - help - This option allows certain base kernel options and settings - to be disabled or tweaked. This is for specialized - environments which can tolerate a non-standard kernel. - Only use this if you really know what you are doing. - -config UID16 - bool Enable 16-bit UID system calls if EXPERT - depends on HAVE_UID16 MULTIUSER - default y - help - This enables the legacy 16-bit UID syscall wrappers. - -config MULTIUSER - bool Multiple users, groups and capabilities support if EXPERT - default y - help - This option enables support for non-root users, groups and - capabilities. - - If you say N here, all processes will run with UID 0, GID 0, and all - possible capabilities. Saying N here also compiles out support for - system calls related to UIDs, GIDs, and capabilities, such as setuid, - setgid, and capset. - - If unsure, say Y here. - -config SGETMASK_SYSCALL - bool sgetmask/ssetmask syscalls support if EXPERT - def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH - ---help--- - sys_sgetmask and sys_ssetmask are obsolete system calls - no longer supported in libc but still enabled by default in some - architectures. - - If unsure, leave the default option here. - -config SYSFS_SYSCALL - bool Sysfs syscall support if EXPERT - default y - ---help--- - sys_sysfs is an obsolete system call no longer supported in libc. - Note that disabling this option is more secure but might break - compatibility with some systems. - - If unsure say Y here. - -config SYSCTL_SYSCALL - bool Sysctl syscall support if EXPERT - depends on PROC_SYSCTL - default n - select SYSCTL - ---help--- - sys_sysctl uses binary paths that have been found challenging - to properly maintain and use. The interface in /proc/sys - using paths with ascii names is now the primary path to this - information. - - Almost nothing using the binary sysctl interface so if you are - trying to save some space it is probably safe to disable this, - making your kernel marginally smaller. - - If unsure say N here. - -config KALLSYMS -bool Load all symbols for debugging/ksymoops if EXPERT -default y -help - Say Y here to let the kernel print out symbolic crash information and - symbolic stack backtraces. This increases the size of the kernel - somewhat, as all symbols have to be loaded
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Mon, May 11, 2015 at 11:01:22PM +0200, Paul Bolle wrote: On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote: I'd also like to factor the if EXPERT off of all the prompts and into a single scoped item wrapped around all of them, but kconfig doesn't have any way to do that. menuconfig is just a hint, with no matching endmenu and no implicit visibility; menu is scoped and has visible if, but that would create a separate option containing a menu, rather than a menu under EXPERT's Configure standard kernel features (expert users). And if EXPERT ... endif produces a dependency, not a prompt-visibility condition. So I think this would require changes to the Kconfig language, to introduce either a scoped visible if EXPERT ... endvisible or similar, or a scoped version of menuconfig with a matching endmenu and implicit visibility (effectively a menu statement with attached config rather than a config with a hint this might be a menu). I'm leaning towards the latter. The behavior of menuconfig in this case is rather subtle. I must admit I never noticed it. The visible option to menus is little used, and I'm not really familiar with it. So, for what it's worth: would adding a new menu with visible if EXPERT attached to it, and putting all current prompt Foo if EXPERT entries in that menu roughly do what you want? Not quite. menu has a prompt but no associated symbol, so with current kconfig that would look like: config EXPERT bool Configure standard kernel features (expert users) ... menu Some separate menu prompt here visible if EXPERT ... various options that currently have if EXPERT on their prompts ... endmenu However, that would produce *two* entries under the General setup menu: a yes/no entry Configure standard kernel features (expert users) with no submenu, and a Some separate menu prompt here entry with a submenu but no '[ ]' for a yes/no option. Integrating the two (without using menuconfig's implicit add stuff to submenu until an option's prompt doesn't depend on this symbol magic) requires new a kconfig mechanism. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Mon, May 11, 2015 at 11:50:21PM +0200, Paul Bolle wrote: On Mon, 2015-05-11 at 13:23 -0700, Josh Triplett wrote: --- /dev/null +++ b/init/Kconfig.expert @@ -0,0 +1,231 @@ +menuconfig EXPERT + bool Configure standard kernel features (expert users) + # Unhide debug options, to make the on-by-default options visible + select DEBUG_KERNEL + help + This option allows certain base kernel options and settings + to be disabled or tweaked. This is for specialized + environments which can tolerate a non-standard kernel. + Only use this if you really know what you are doing. Comment here saying # All entries in this file must have if EXPERT after their prompt or something to that effect (pending you patch, that is)? Yeah, I should add such a comment. I was hoping to make it obsolete via kconfig changes, but in the interim, sure. +config KALLSYMS +bool Load all symbols for debugging/ksymoops if EXPERT +default y +help + Say Y here to let the kernel print out symbolic crash information and + symbolic stack backtraces. This increases the size of the kernel + somewhat, as all symbols have to be loaded into the kernel image. + +config KALLSYMS_ALL + bool Include all symbols in kallsyms (For some reason this entry doesn't have if EXPERT but it seems to behave as expected. Odd.) Because it depends on KALLSYMS. Magic! + depends on DEBUG_KERNEL KALLSYMS + help + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). + + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + + Say N unless you really need all symbols. + +config PRINTK + default y + bool Enable support for printk if EXPERT Now you're touching this: bool [...] as the first line, please. I'd like the commit moving these to their own file to very obviously look like it's moving these lines unmodified, rather than making changes in the process. If you want this (and all the subsequent instances you flagged) cleaned up, let's do that as a subsequent patch separate from the move. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Tue, May 12, 2015 at 12:36:49AM +0200, Luis R. Rodriguez wrote: On Mon, May 11, 2015 at 01:23:02PM -0700, Josh Triplett wrote: init/Kconfig| 232 +--- init/Kconfig.expert | 231 +++ 2 files changed, 232 insertions(+), 231 deletions(-) create mode 100644 init/Kconfig.expert I'm blinded by this patch, can you use git format-patch -M and also add: [diff] renamelimit=0 To your .gitconfig From a quick test, doing what you suggest doesn't have any effect; if I also use -C1 instead of -M, then the patch to init/Kconfig looks exactly the same (deleting 231 lines and adding a source line), while the patch to init/Kconfig.expert goes from adding 231 lines to copying init/Kconfig and deleting 1800 lines. That does not seem like an improvement. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/1] init/Kconfig: Split expert menu into a separate file, init/Kconfig.expert
On Tue, May 12, 2015 at 12:01:27AM +0200, Paul Bolle wrote: On Mon, 2015-05-11 at 14:47 -0700, Josh Triplett wrote: On Mon, May 11, 2015 at 11:32:28PM +0200, Paul Bolle wrote: Is squashing those two lines worth a new kconfig mechanism? In my opinion, yes. If you use the implicit (and error-prone) menuconfig submenuing, you get a single entry with the '[ ]' and the submenu. There are currently 272 instances of menuconfig in Kconfig files. How many of those use the subtle trick EXPERT uses? A rough search suggests about 35, with the rest using an if right after the menu (presumably because they want dependencies rather than visibility). (A couple of which currently have the same broken-menu problem today's patch 1 fixed for EXPERT, and a couple of which are using that trick *across input files*.) I'd like to have a less error-prone mechanism for people to use, with an explicit endmenu at the end, and I don't want to leave any incentive for people to need the more error-prone version. I would be tempted to just make menuconfig require an endmenu, and convert all users, but that would almost certainly break many third-party users of kconfig. So instead, I'm currently extending menu (which already expects endmenu) to allow the syntax menu config SYMBOL, which acts like a combination of config SYMBOL and a menu with visible if SYMBOL. Bikeshedding (before I'm even convinced of the need of this extension): menu config is far too similar to menuconfig. Yeah, agreed; given that people will still end up using menuconfig in light of the above, I'll use another syntax. Diffstat for the patch I'm testing right now: scripts/kconfig/zconf.y | 18 ++ 1 file changed, 18 insertions(+) That seems worthwhile to have a less error-prone menu mechanism. (The actual patch would also need to updated zconf.tab.c_shipped.) And some lines in Documentation/kbuild/kconfig-language.txt (speaking from memory). Good point; this does need documentation. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] rcu: change function declaration to bool
On Mon, May 11, 2015 at 05:46:12PM +0200, Nicholas Mc Guire wrote: rcu_cpu_has_callbacks() is declared int but is actually returning bool and as the function description states * Return true if the specified CPU has any callback, this probably should be a bool. All (3) call-sites currently treat it as bool. Type-checking coccinelle spatches are being used to locate type mismatches between function signatures and return values in this case this produced: ./kernel/rcu/tree.c:3538 WARNING: return of wrong type int != bool, Patch was compile tested with x86_64_defconfig (implies CONFIG_TREE_RCU=y) Patch is against 4.1-rc3 (localversion-next is -next-20150511) Signed-off-by: Nicholas Mc Guire hof...@osadl.org Reviewed-by: Josh Triplett j...@joshtriplett.org Thanks! --- V3: fix-up of commit message again (hope I got it right this time) as requested by Josh Triplett j...@joshtriplett.org/Steven Rostedt rost...@goodmis.org kernel/rcu/tree.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bcc5943..599550c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3516,7 +3516,7 @@ static int rcu_pending(void) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) +static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) { bool al = true; bool hc = false; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] init.h: mark init functions hot instead of cold
On Sat, May 09, 2015 at 02:17:39AM -0700, Andi Kleen wrote: > On Sat, May 09, 2015 at 12:45:01AM +0200, Rasmus Villemoes wrote: > > attribute((cold)) causes gcc to optimize the function for size rather > > than speed. But since __init functions will be discarded anyway, I > > don't see why memory should be a concern. On the contrary, everybody > > It makes the bzImage smaller. > A lot of people on smaller systems are interested in flash size. True, but people interested in flash size can set CONFIG_CC_OPTIMIZE_FOR_SIZE. I would propose dropping "cold" and *not* adding "hot"; just let __init functions get default optimizations. People who set CONFIG_CC_OPTIMIZE_FOR_SIZE should get similar size optimizations, and people who don't will get GCC's normal optimizations, which should provide much of the improved boot performance Rasmus observed. Rasmus, can you confirm that just dropping cold 1) doesn't make the kernel larger when building with CONFIG_CC_OPTIMIZE_FOR_SIZE, and 2) gives approximately the same 2% performance benefit you observed? > > wants their box to boot faster. Using the opposite attribute, hot, > > causes gcc to optimize the functions more aggressively, possibly at > > the expense of larger (.init).text. A completely unscientific test > > showed about 2% faster boot time: I booted a kernel in qemu with and > > without this patch five times each; the boot times were very stable in > > each case, so I think the 2% is ok, but of course only applies to that > > specific .config running in a virtual machine on my hardware. > > 2% on boot is basically noise. I disagree; there are people working on shaving milliseconds from boot. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] init.h: mark init functions hot instead of cold
On Sat, May 09, 2015 at 02:17:39AM -0700, Andi Kleen wrote: On Sat, May 09, 2015 at 12:45:01AM +0200, Rasmus Villemoes wrote: attribute((cold)) causes gcc to optimize the function for size rather than speed. But since __init functions will be discarded anyway, I don't see why memory should be a concern. On the contrary, everybody It makes the bzImage smaller. A lot of people on smaller systems are interested in flash size. True, but people interested in flash size can set CONFIG_CC_OPTIMIZE_FOR_SIZE. I would propose dropping cold and *not* adding hot; just let __init functions get default optimizations. People who set CONFIG_CC_OPTIMIZE_FOR_SIZE should get similar size optimizations, and people who don't will get GCC's normal optimizations, which should provide much of the improved boot performance Rasmus observed. Rasmus, can you confirm that just dropping cold 1) doesn't make the kernel larger when building with CONFIG_CC_OPTIMIZE_FOR_SIZE, and 2) gives approximately the same 2% performance benefit you observed? wants their box to boot faster. Using the opposite attribute, hot, causes gcc to optimize the functions more aggressively, possibly at the expense of larger (.init).text. A completely unscientific test showed about 2% faster boot time: I booted a kernel in qemu with and without this patch five times each; the boot times were very stable in each case, so I think the 2% is ok, but of course only applies to that specific .config running in a virtual machine on my hardware. 2% on boot is basically noise. I disagree; there are people working on shaving milliseconds from boot. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] uidgid: Make uid_valid and gid_valid work with !CONFIG_MULTIUSER
{u,g}id_valid call {u,g}id_eq, which calls __k{u,g}id_val on both arguments and compares. With !CONFIG_MULTIUSER, __k{u,g}id_val return a constant 0, which makes {u,g}id_valid always return false. Change {u,g}id_valid to compare their argument against -1 instead. That produces identical results in the normal CONFIG_MULTIUSER=y case, but with !CONFIG_MULTIUSER will make {u,g}id_valid constant-fold into "return true;" rather than "return false;". This fixes uses of devpts without CONFIG_MULTIUSER. Signed-off-by: Josh Triplett --- include/linux/uidgid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 0ee05da..0383552 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -109,12 +109,12 @@ static inline bool gid_lte(kgid_t left, kgid_t right) static inline bool uid_valid(kuid_t uid) { - return !uid_eq(uid, INVALID_UID); + return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { - return !gid_eq(gid, INVALID_GID); + return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2] devpts: If initialization failed, don't crash when opening /dev/ptmx
If devpts failed to initialize, it would store an ERR_PTR in the global devpts_mnt. A subsequent open of /dev/ptmx would call devpts_new_index, which would dereference devpts_mnt and crash. Avoid storing invalid values in devpts_mnt; leave it NULL instead. Make both devpts_new_index and devpts_pty_new fail gracefully with ENODEV in that case, which then becomes the return value to the userspace open call on /dev/ptmx. Signed-off-by: Josh Triplett Reviewed-by: Peter Hurley --- v2: Fix copy-paste error caught by Peter Hurley. As mentioned in v1, this is separate from fixing the bug that makes devpts fail to initialize in the first place, but this makes devpts fail gracefully rather than crashing. fs/devpts/inode.c | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466..016687b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(>allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = >mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = >mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,15 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] uidgid: Make uid_valid and gid_valid work with !CONFIG_MULTIUSER
{u,g}id_valid call {u,g}id_eq, which calls __k{u,g}id_val on both arguments and compares. With !CONFIG_MULTIUSER, __k{u,g}id_val return a constant 0, which makes {u,g}id_valid always return false. Change {u,g}id_valid to compare their argument against -1 instead. That produces identical results in the normal CONFIG_MULTIUSER=y case, but with !CONFIG_MULTIUSER will make {u,g}id_valid constant-fold into return true; rather than return false;. This fixes uses of devpts without CONFIG_MULTIUSER. Signed-off-by: Josh Triplett j...@joshtriplett.org --- include/linux/uidgid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 0ee05da..0383552 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -109,12 +109,12 @@ static inline bool gid_lte(kgid_t left, kgid_t right) static inline bool uid_valid(kuid_t uid) { - return !uid_eq(uid, INVALID_UID); + return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { - return !gid_eq(gid, INVALID_GID); + return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCHv2] devpts: If initialization failed, don't crash when opening /dev/ptmx
If devpts failed to initialize, it would store an ERR_PTR in the global devpts_mnt. A subsequent open of /dev/ptmx would call devpts_new_index, which would dereference devpts_mnt and crash. Avoid storing invalid values in devpts_mnt; leave it NULL instead. Make both devpts_new_index and devpts_pty_new fail gracefully with ENODEV in that case, which then becomes the return value to the userspace open call on /dev/ptmx. Signed-off-by: Josh Triplett j...@joshtriplett.org Reviewed-by: Peter Hurley pe...@hurleysoftware.com --- v2: Fix copy-paste error caught by Peter Hurley. As mentioned in v1, this is separate from fixing the bug that makes devpts fail to initialize in the first place, but this makes devpts fail gracefully rather than crashing. fs/devpts/inode.c | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466..016687b 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode-i_sb-s_magic == DEVPTS_SUPER_MAGIC) return inode-i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt-mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(fsi-allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb-s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = fsi-mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb-s_root; + fsi = DEVPTS_SB(sb); + opts = fsi-mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,15 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] devpts: If initialization failed, don't crash when opening /dev/ptmx
If devpts failed to initialize, it would store an ERR_PTR in the global devpts_mnt. A subsequent open of /dev/ptmx would call devpts_new_index, which would dereference devpts_mnt and crash. Avoid storing invalid values in devpts_mnt; leave it NULL instead. Make both devpts_new_index and devpts_pty_new fail gracefully with ENODEV in that case, which then becomes the return value to the userspace open call on /dev/ptmx. Signed-off-by: Josh Triplett --- This fixes a crash found by Fengguang Wu's 0-day service ("BUG: unable to handle kernel paging request at ffee"). It doesn't yet fix the underlying initialization failure in init_devpts_fs, but it stops that failure from becoming a kernel crash. I'm working on the initialization failure now. fs/devpts/inode.c | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466..03e9076 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(>allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = >mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = >mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,15 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + static struct vfsmount *mnt; table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] devpts: If initialization failed, don't crash when opening /dev/ptmx
If devpts failed to initialize, it would store an ERR_PTR in the global devpts_mnt. A subsequent open of /dev/ptmx would call devpts_new_index, which would dereference devpts_mnt and crash. Avoid storing invalid values in devpts_mnt; leave it NULL instead. Make both devpts_new_index and devpts_pty_new fail gracefully with ENODEV in that case, which then becomes the return value to the userspace open call on /dev/ptmx. Signed-off-by: Josh Triplett j...@joshtriplett.org --- This fixes a crash found by Fengguang Wu's 0-day service (BUG: unable to handle kernel paging request at ffee). It doesn't yet fix the underlying initialization failure in init_devpts_fs, but it stops that failure from becoming a kernel crash. I'm working on the initialization failure now. fs/devpts/inode.c | 30 +++--- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466..03e9076 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode-i_sb-s_magic == DEVPTS_SUPER_MAGIC) return inode-i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt-mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(fsi-allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb-s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = fsi-mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb-s_root; + fsi = DEVPTS_SB(sb); + opts = fsi-mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,15 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + static struct vfsmount *mnt; table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rcu: declare rcu_data variables in the section they are defined in
On Sun, May 03, 2015 at 05:57:53PM +0800, Nicolas Iooss wrote: > Commit 11bbb235c26f ("rcu: Use DEFINE_PER_CPU_SHARED_ALIGNED for > rcu_data") replaced DEFINE_PER_CPU by DEFINE_PER_CPU_SHARED_ALIGNED in > the definition of rcu_sched and rcu_bh without updating > kernel/rcu/tree.h. > > This makes clang report a section mismatch (-Wsection warning) when > building LLVMLinux because the variables are declared in .data..percpu > but defined in .data..percpu..shared_aligned. > > Signed-off-by: Nicolas Iooss Good catch. Reviewed-by: Josh Triplett > --- > kernel/rcu/tree.h | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h > index a69d3dab2ec4..c5e85b27a79f 100644 > --- a/kernel/rcu/tree.h > +++ b/kernel/rcu/tree.h > @@ -519,10 +519,10 @@ extern struct list_head rcu_struct_flavors; > * RCU implementation internal declarations: > */ > extern struct rcu_state rcu_sched_state; > -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); > +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_sched_data); > > extern struct rcu_state rcu_bh_state; > -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); > +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_bh_data); > > #ifdef CONFIG_PREEMPT_RCU > extern struct rcu_state rcu_preempt_state; > -- > 2.3.6 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] rcu: declare rcu_data variables in the section they are defined in
On Sun, May 03, 2015 at 05:57:53PM +0800, Nicolas Iooss wrote: Commit 11bbb235c26f (rcu: Use DEFINE_PER_CPU_SHARED_ALIGNED for rcu_data) replaced DEFINE_PER_CPU by DEFINE_PER_CPU_SHARED_ALIGNED in the definition of rcu_sched and rcu_bh without updating kernel/rcu/tree.h. This makes clang report a section mismatch (-Wsection warning) when building LLVMLinux because the variables are declared in .data..percpu but defined in .data..percpu..shared_aligned. Signed-off-by: Nicolas Iooss nicolas.iooss_li...@m4x.org Good catch. Reviewed-by: Josh Triplett j...@joshtriplett.org --- kernel/rcu/tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a69d3dab2ec4..c5e85b27a79f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -519,10 +519,10 @@ extern struct list_head rcu_struct_flavors; * RCU implementation internal declarations: */ extern struct rcu_state rcu_sched_state; -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_sched_data); extern struct rcu_state rcu_bh_state; -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_bh_data); #ifdef CONFIG_PREEMPT_RCU extern struct rcu_state rcu_preempt_state; -- 2.3.6 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [CONFIG_MULTIUSER] init: error.c:320: Assertion failed in nih_error_get: CURRENT_CONTEXT->error != NULL
On Sun, May 03, 2015 at 07:18:28AM +0800, Fengguang Wu wrote: > Hi Iulia, > > FYI, there are Ubuntu init error messages when CONFIG_MULTIUSER=n. > Since it's not embedded system and hence the target user of > CONFIG_MULTIUSER=n, it might be fine.. I would expect a non-trivial amount of work required to make a standard distribution boot with CONFIG_MULTIUSER=n. Anything attempting to set the user ID or group ID will fail, such as su, start-stop-daemon --chuid, or systemd's daemon launching code. So I'd suggest that this is an expected failure; allnoconfig or tinyconfig would already not be expected to boot unmodified Ubuntu. - Josh triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [CONFIG_MULTIUSER] init: error.c:320: Assertion failed in nih_error_get: CURRENT_CONTEXT-error != NULL
On Sun, May 03, 2015 at 07:18:28AM +0800, Fengguang Wu wrote: Hi Iulia, FYI, there are Ubuntu init error messages when CONFIG_MULTIUSER=n. Since it's not embedded system and hence the target user of CONFIG_MULTIUSER=n, it might be fine.. I would expect a non-trivial amount of work required to make a standard distribution boot with CONFIG_MULTIUSER=n. Anything attempting to set the user ID or group ID will fail, such as su, start-stop-daemon --chuid, or systemd's daemon launching code. So I'd suggest that this is an expected failure; allnoconfig or tinyconfig would already not be expected to boot unmodified Ubuntu. - Josh triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone
On Thu, Apr 23, 2015 at 08:24:38AM +0200, Ingo Molnar wrote: > * Josh Triplett wrote: > > On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote: > > > On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett > > > wrote: > > > > > > > > I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing > > > > this > > > > > > Ugh, I absolutely detesrt that patch. > > > > > > Don't make random crazy function signatures that depend on some config > > > option. That's just evil. The patch is a mess of #ifdef's and should > > > be shot in the head and staked with a silver stake to make sure it > > > never re-appears. > > > > > > Either: > > > > > > (a) make the change for every architecture > > > > > > (b) have side-by-side interfaces. With different names! > > > > ...that's exactly what I did. They're called copy_thread and > > copy_thread_tls; I very intentionally did not conditionally change > > the signature of copy_thread, for exactly that reason. Those > > functions are implemented in architecture-specific code, so the > > config option just specifies which of the two functions the > > architecture provides. > > > > *sys_clone* has different function signatures based on config > > options, but I didn't touch that other than fixing the type of the > > tls argument. That's historical baggage that we can't throw away > > without breaking userspace. > > So you want to add a new clone() parameter. I strongly suspect that > you won't be the last person who wants to do this. > > So why not leave the compatibility baggage alone and introduce a new > clean clone syscall with a flexible, backwards and forwards compatible > parameter ABI? ...that's also exactly what I did, in the clone4 patch series, which uses exactly the arg-structure approach you're describing. :) Take a look at the clone4 patch series (v2). We'll be updating it to v3 to address some feedback, and we're hoping to aim for the 4.2 merge window. > Something like: > > SYSCALL_DEFINE1(clone_params, struct clone_params __user *, params) clone4 passes the size outside the params structure, since otherwise you'd need to copy the first few bytes to know how much more to copy. clone4 also passes flags outside the params structure, to allow for a potential future flag that might indicate a completely different interpretation of the params structure. But otherwise, yes, clone4 moves all the parameters into a structure. (Differences between clone4_args and your clone_params structure: size and flags passed outside, type of the tls parameter is "unsigned long" since it's pointer-sized, and the structure includes the stack_size argument needed on some architectures.) > The only real cost of this approach is that this parameter structure > has to be copied (once): should be fast as it fits into a single cache > line and is a constant size copy. Also note how this parameter block > can be passed down by const reference from that point on, instead of > copying/shuffling 5-6 parameters into increasingly long function > parameter lists. So it might in fact turn out to be slightly faster as > well. Agreed. > Note how easily extensible it is: a new field can be added by > appending to the structure, and compatibility is achieved in the > kernel without explicit versioning, by checking params->size: > > params->size == sizeof(*params): > > Good, kernel and userspace ABI version matches. This is a simple > check and the most typical situation - it will be the fastest as > well. > > > params->size < sizeof(*params): > > Old binary calls into new kernel. Missing fields are set to 0. > Binaries will be forward compatible without any versioning hacks. I combined these two cases by just copying the userspace struct over a pre-zeroed params structure; then any fields not copied over will remain zero. > params->size > sizeof(*params): > > New user-space calls into old kernel. System call can still be > serviced if the new field(s) are all zero. We return -ENOSYS if > any field is non-zero. (i.e. if new user-space tries to make use > of a new syscall feature that this kernel has not implemented > yet.) This way user-space can figure out whether a particular > new parameter is supported by a kernel, without having to add new > system calls or versioning. In theory clone4 could have had this special case for "userspace passed extra parameters this kernel doesn't understand, but they're all zero", but since this is a brand new ABI, i
Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone
On Thu, Apr 23, 2015 at 08:24:38AM +0200, Ingo Molnar wrote: * Josh Triplett j...@joshtriplett.org wrote: On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote: On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett j...@joshtriplett.org wrote: I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing this Ugh, I absolutely detesrt that patch. Don't make random crazy function signatures that depend on some config option. That's just evil. The patch is a mess of #ifdef's and should be shot in the head and staked with a silver stake to make sure it never re-appears. Either: (a) make the change for every architecture (b) have side-by-side interfaces. With different names! ...that's exactly what I did. They're called copy_thread and copy_thread_tls; I very intentionally did not conditionally change the signature of copy_thread, for exactly that reason. Those functions are implemented in architecture-specific code, so the config option just specifies which of the two functions the architecture provides. *sys_clone* has different function signatures based on config options, but I didn't touch that other than fixing the type of the tls argument. That's historical baggage that we can't throw away without breaking userspace. So you want to add a new clone() parameter. I strongly suspect that you won't be the last person who wants to do this. So why not leave the compatibility baggage alone and introduce a new clean clone syscall with a flexible, backwards and forwards compatible parameter ABI? ...that's also exactly what I did, in the clone4 patch series, which uses exactly the arg-structure approach you're describing. :) Take a look at the clone4 patch series (v2). We'll be updating it to v3 to address some feedback, and we're hoping to aim for the 4.2 merge window. Something like: SYSCALL_DEFINE1(clone_params, struct clone_params __user *, params) clone4 passes the size outside the params structure, since otherwise you'd need to copy the first few bytes to know how much more to copy. clone4 also passes flags outside the params structure, to allow for a potential future flag that might indicate a completely different interpretation of the params structure. But otherwise, yes, clone4 moves all the parameters into a structure. (Differences between clone4_args and your clone_params structure: size and flags passed outside, type of the tls parameter is unsigned long since it's pointer-sized, and the structure includes the stack_size argument needed on some architectures.) The only real cost of this approach is that this parameter structure has to be copied (once): should be fast as it fits into a single cache line and is a constant size copy. Also note how this parameter block can be passed down by const reference from that point on, instead of copying/shuffling 5-6 parameters into increasingly long function parameter lists. So it might in fact turn out to be slightly faster as well. Agreed. Note how easily extensible it is: a new field can be added by appending to the structure, and compatibility is achieved in the kernel without explicit versioning, by checking params-size: params-size == sizeof(*params): Good, kernel and userspace ABI version matches. This is a simple check and the most typical situation - it will be the fastest as well. params-size sizeof(*params): Old binary calls into new kernel. Missing fields are set to 0. Binaries will be forward compatible without any versioning hacks. I combined these two cases by just copying the userspace struct over a pre-zeroed params structure; then any fields not copied over will remain zero. params-size sizeof(*params): New user-space calls into old kernel. System call can still be serviced if the new field(s) are all zero. We return -ENOSYS if any field is non-zero. (i.e. if new user-space tries to make use of a new syscall feature that this kernel has not implemented yet.) This way user-space can figure out whether a particular new parameter is supported by a kernel, without having to add new system calls or versioning. In theory clone4 could have had this special case for userspace passed extra parameters this kernel doesn't understand, but they're all zero, but since this is a brand new ABI, it seems far easier for userspace to simply pass only the size needed for its non-zero parameters, and then the kernel can reject structures larger than it expects. Only pass the new version of the args structure if you need to pass non-zero values for the new fields in that version. Since clone4 doesn't even copy the structure if the size exceeds what it understands, that also avoids additional special cases such as the user passing in an obscenely large size. Also note how 'fool proof' this kind
Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone
On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote: > On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett wrote: > > > > I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing > > this > > Ugh, I absolutely detesrt that patch. > > Don't make random crazy function signatures that depend on some config > option. That's just evil. The patch is a mess of #ifdef's and should > be shot in the head and staked with a silver stake to make sure it > never re-appears. > > Either: > > (a) make the change for every architecture > > (b) have side-by-side interfaces. With different names! ...that's exactly what I did. They're called copy_thread and copy_thread_tls; I very intentionally did not conditionally change the signature of copy_thread, for exactly that reason. Those functions are implemented in architecture-specific code, so the config option just specifies which of the two functions the architecture provides. *sys_clone* has different function signatures based on config options, but I didn't touch that other than fixing the type of the tls argument. That's historical baggage that we can't throw away without breaking userspace. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone
On Wed, Apr 22, 2015 at 09:54:24AM -0700, Andy Lutomirski wrote: > On Wed, Apr 22, 2015 at 9:40 AM, Denys Vlasenko wrote: > > Really swap arguments #4 and #5 in stub32_clone instead of "optimizing" > > it into a move. > > > > Yes, tls_val is currently unused. Yes, on some CPUs XCHG is a little bit > > more expensive than MOV. But a cycle or two on an expensive syscall like > > clone() is way below noise floor, and obfuscation of logic introduced > > by this optimization is simply not worth it. > > Ditto re: Josh's patch. I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing this, but I'd like to see the final version of Denys' comment added on top of it (with an update to the type and name of the tls argument to match the changes to sys_clone). Denys, would you consider submitting a patch adding your comment on top of the two-patch series I just sent? Thanks, Josh Triplett > --Andy > > > > > Signed-off-by: Denys Vlasenko > > CC: Linus Torvalds > > CC: Steven Rostedt > > CC: Ingo Molnar > > CC: Borislav Petkov > > CC: "H. Peter Anvin" > > CC: Andy Lutomirski > > CC: Oleg Nesterov > > CC: Frederic Weisbecker > > CC: Alexei Starovoitov > > CC: Will Drewry > > CC: Kees Cook > > CC: x...@kernel.org > > CC: linux-kernel@vger.kernel.org > > --- > > arch/x86/ia32/ia32entry.S | 6 ++ > > 1 file changed, 2 insertions(+), 4 deletions(-) > > > > diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S > > index 8e72256..0c302d0 100644 > > --- a/arch/x86/ia32/ia32entry.S > > +++ b/arch/x86/ia32/ia32entry.S > > @@ -567,11 +567,9 @@ GLOBAL(stub32_clone) > > * 32-bit clone API is clone(..., int tls_val, int *child_tidptr). > > * 64-bit clone API is clone(..., int *child_tidptr, int tls_val). > > * Native 64-bit kernel's sys_clone() implements the latter. > > -* We need to swap args here. But since tls_val is in fact ignored > > -* by sys_clone(), we can get away with an assignment > > -* (arg4 = arg5) instead of a full swap: > > +* We need to swap args here: > > */ > > - mov %r8, %rcx > > + xchg%r8, %rcx > > jmp ia32_ptregs_common > > > > ALIGN > > -- > > 1.8.1.4 > > > > > > -- > Andy Lutomirski > AMA Capital Management, LLC -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone
On Wed, Apr 22, 2015 at 09:54:24AM -0700, Andy Lutomirski wrote: On Wed, Apr 22, 2015 at 9:40 AM, Denys Vlasenko dvlas...@redhat.com wrote: Really swap arguments #4 and #5 in stub32_clone instead of optimizing it into a move. Yes, tls_val is currently unused. Yes, on some CPUs XCHG is a little bit more expensive than MOV. But a cycle or two on an expensive syscall like clone() is way below noise floor, and obfuscation of logic introduced by this optimization is simply not worth it. Ditto re: Josh's patch. I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing this, but I'd like to see the final version of Denys' comment added on top of it (with an update to the type and name of the tls argument to match the changes to sys_clone). Denys, would you consider submitting a patch adding your comment on top of the two-patch series I just sent? Thanks, Josh Triplett --Andy Signed-off-by: Denys Vlasenko dvlas...@redhat.com CC: Linus Torvalds torva...@linux-foundation.org CC: Steven Rostedt rost...@goodmis.org CC: Ingo Molnar mi...@kernel.org CC: Borislav Petkov b...@alien8.de CC: H. Peter Anvin h...@zytor.com CC: Andy Lutomirski l...@amacapital.net CC: Oleg Nesterov o...@redhat.com CC: Frederic Weisbecker fweis...@gmail.com CC: Alexei Starovoitov a...@plumgrid.com CC: Will Drewry w...@chromium.org CC: Kees Cook keesc...@chromium.org CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/ia32/ia32entry.S | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 8e72256..0c302d0 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -567,11 +567,9 @@ GLOBAL(stub32_clone) * 32-bit clone API is clone(..., int tls_val, int *child_tidptr). * 64-bit clone API is clone(..., int *child_tidptr, int tls_val). * Native 64-bit kernel's sys_clone() implements the latter. -* We need to swap args here. But since tls_val is in fact ignored -* by sys_clone(), we can get away with an assignment -* (arg4 = arg5) instead of a full swap: +* We need to swap args here: */ - mov %r8, %rcx + xchg%r8, %rcx jmp ia32_ptregs_common ALIGN -- 1.8.1.4 -- Andy Lutomirski AMA Capital Management, LLC -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] x86/asm/entry/32: Remove unnecessary optimization in stub32_clone
On Wed, Apr 22, 2015 at 11:22:02AM -0700, Linus Torvalds wrote: On Wed, Apr 22, 2015 at 10:10 AM, Josh Triplett j...@joshtriplett.org wrote: I do think my two-patch HAVE_COPY_THREAD_TLS series should go in fixing this Ugh, I absolutely detesrt that patch. Don't make random crazy function signatures that depend on some config option. That's just evil. The patch is a mess of #ifdef's and should be shot in the head and staked with a silver stake to make sure it never re-appears. Either: (a) make the change for every architecture (b) have side-by-side interfaces. With different names! ...that's exactly what I did. They're called copy_thread and copy_thread_tls; I very intentionally did not conditionally change the signature of copy_thread, for exactly that reason. Those functions are implemented in architecture-specific code, so the config option just specifies which of the two functions the architecture provides. *sys_clone* has different function signatures based on config options, but I didn't touch that other than fixing the type of the tls argument. That's historical baggage that we can't throw away without breaking userspace. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit
For 32-bit userspace on a 64-bit kernel, this requires modifying stub32_clone to actually swap the appropriate arguments to match CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls broken. Signed-off-by: Josh Triplett Signed-off-by: Thiago Macieira Acked-by: Andy Lutomirski --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 +++--- arch/x86/kernel/process_64.c | 8 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b7d31ca..4960b0d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 + select HAVE_COPY_THREAD_TLS select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebca..0286735 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -487,7 +487,7 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip),%rax - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f9..ead28ff 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43..c69cabc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(>thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Signed-off-by: Josh Triplett Signed-off-by: Thiago Macieira Acked-by: Andy Lutomirski --- arch/Kconfig | 7 ++ include/linux/sched.h| 14 include/linux/syscalls.h | 6 +++--- kernel/fork.c| 55 +++- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a..4834a58 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/include/linux/sched.h b/include/linux/sched.h index a419b65..2cc88c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38..bb51bec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index cf65139..b3dadf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, - int trace
[PATCH 0/2] clone: Support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter "tls" that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. Josh Triplett (2): clone: Support passing tls argument via C rather than pt_regs magic x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit arch/Kconfig | 7 ++ arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 ++--- arch/x86/kernel/process_64.c | 8 +++ include/linux/sched.h| 14 +++ include/linux/syscalls.h | 6 ++--- kernel/fork.c| 55 +--- 8 files changed, 69 insertions(+), 30 deletions(-) -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] clone: Support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a pile of special cases, historical curiosities, and architecture-specific calling conventions. In particular, clone with CLONE_SETTLS accepts a parameter tls that the C entry point completely ignores and some assembly entry points overwrite; instead, the low-level arch-specific code pulls the tls parameter out of the arch-specific register captured as part of pt_regs on entry to the kernel. That's a massive hack, and it makes the arch-specific code only work when called via the specific existing syscall entry points; because of this hack, any new clone-like system call would have to accept an identical tls argument in exactly the same arch-specific position, rather than providing a unified system call entry point across architectures. The first patch allows architectures to handle the tls argument via normal C parameter passing, if they opt in by selecting HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt into this. These two patches came out of the clone4 series, which isn't ready for this merge window, but these first two cleanup patches were entirely uncontroversial and have acks. I'd like to go ahead and submit these two so that other architectures can begin building on top of this and opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and send these through the next merge window (along with v3 of clone4) if anyone would prefer that. Josh Triplett (2): clone: Support passing tls argument via C rather than pt_regs magic x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit arch/Kconfig | 7 ++ arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 ++--- arch/x86/kernel/process_64.c | 8 +++ include/linux/sched.h| 14 +++ include/linux/syscalls.h | 6 ++--- kernel/fork.c| 55 +--- 8 files changed, 69 insertions(+), 30 deletions(-) -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic
clone with CLONE_SETTLS accepts an argument to set the thread-local storage area for the new thread. sys_clone declares an int argument tls_val in the appropriate point in the argument list (based on the various CLONE_BACKWARDS variants), but doesn't actually use or pass along that argument. Instead, sys_clone calls do_fork, which calls copy_process, which calls the arch-specific copy_thread, and copy_thread pulls the corresponding syscall argument out of the pt_regs captured at kernel entry (knowing what argument of clone that architecture passes tls in). Apart from being awful and inscrutable, that also only works because only one code path into copy_thread can pass the CLONE_SETTLS flag, and that code path comes from sys_clone with its architecture-specific argument-passing order. This prevents introducing a new version of the clone system call without propagating the same architecture-specific position of the tls argument. However, there's no reason to pull the argument out of pt_regs when sys_clone could just pass it down via C function call arguments. Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into, and a new copy_thread_tls that accepts the tls parameter as an additional unsigned long (syscall-argument-sized) argument. Change sys_clone's tls argument to an unsigned long (which does not change the ABI), and pass that down to copy_thread_tls. Architectures that don't opt into copy_thread_tls will continue to ignore the C argument to sys_clone in favor of the pt_regs captured at kernel entry, and thus will be unable to introduce new versions of the clone syscall. Signed-off-by: Josh Triplett j...@joshtriplett.org Signed-off-by: Thiago Macieira thiago.macie...@intel.com Acked-by: Andy Lutomirski l...@kernel.org --- arch/Kconfig | 7 ++ include/linux/sched.h| 14 include/linux/syscalls.h | 6 +++--- kernel/fork.c| 55 +++- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 05d7a8a..4834a58 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK This spares a stack switch and improves cache usage on softirq processing. +config HAVE_COPY_THREAD_TLS + bool + help + Architecture provides copy_thread_tls to accept tls argument via + normal C parameter passing, rather than extracting the syscall + argument from pt_regs. + # # ABI hall of shame # diff --git a/include/linux/sched.h b/include/linux/sched.h index a419b65..2cc88c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 76d1e38..bb51bec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd); asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int, +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long, int __user *); #else #ifdef CONFIG_CLONE_BACKWARDS3 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *, - int __user *, int); + int __user *, unsigned long); #else asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, - int __user *, int); + int __user *, unsigned long); #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index cf65139..b3dadf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_size, int __user *child_tidptr, struct pid *pid
[PATCH 2/2] x86: Opt into HAVE_COPY_THREAD_TLS, for both 32-bit and 64-bit
For 32-bit userspace on a 64-bit kernel, this requires modifying stub32_clone to actually swap the appropriate arguments to match CONFIG_CLONE_BACKWARDS, rather than just leaving the C argument for tls broken. Signed-off-by: Josh Triplett j...@joshtriplett.org Signed-off-by: Thiago Macieira thiago.macie...@intel.com Acked-by: Andy Lutomirski l...@kernel.org --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S| 2 +- arch/x86/kernel/process_32.c | 6 +++--- arch/x86/kernel/process_64.c | 8 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b7d31ca..4960b0d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 + select HAVE_COPY_THREAD_TLS select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebca..0286735 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -487,7 +487,7 @@ GLOBAL(\label) ALIGN GLOBAL(stub32_clone) leaq sys_clone(%rip),%rax - mov %r8, %rcx + xchg %r8, %rcx jmp ia32_ptregs_common ALIGN diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f9..ead28ff 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -129,8 +129,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -185,7 +185,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs-si, 0); + (struct user_desc __user *)tls, 0); if (err p-thread.io_bitmap_ptr) { kfree(p-thread.io_bitmap_ptr); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43..c69cabc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -151,8 +151,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(t-thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -209,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs-si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs-r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] x86: Align jump targets to 1 byte boundaries
On Fri, Apr 10, 2015 at 07:10:08AM -0700, Paul E. McKenney wrote: > On Fri, Apr 10, 2015 at 02:08:46PM +0200, Ingo Molnar wrote: > > * Ingo Molnar wrote: [...] > > Btw., totally off topic, the following NOP caught my attention: > > > > > 5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) > > > > That's a dead NOP that boats the function a bit, added for the 16 byte > > alignment of one of the jump targets. > > > > I realize that x86 CPU manufacturers recommend 16-byte jump target > > alignments (it's in the Intel optimization manual), but the cost of > > that is very significant: > > > > text data bss dec filename > > 125663911617840 108953615273767 vmlinux.align.16-byte > > 122249511617840 108953614932327 vmlinux.align.1-byte > > > > By using 1 byte jump target alignment (i.e. no alignment at all) we > > get an almost 3% reduction in kernel size (!) - and a probably similar > > reduction in I$ footprint. > > > > So I'm wondering, is the 16 byte jump target optimization suggestion > > really worth this price? The patch below boots fine and I've not > > measured any noticeable slowdown, but I've not tried hard. > > Good point, adding Josh Triplett on CC. I suspect that he might be > interested. ;-) Quite interested, yes. Even if there *are* benchmarks to support keeping the optimization (which wouldn't surprise me), it'd be nice to have a Kconfig option to enable the jump-target optimization. (With 'y' meaning "pad jump targets to 16 bytes", so that allnoconfig and tinyconfig automatically don't.) - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] x86: Align jump targets to 1 byte boundaries
On Fri, Apr 10, 2015 at 07:10:08AM -0700, Paul E. McKenney wrote: On Fri, Apr 10, 2015 at 02:08:46PM +0200, Ingo Molnar wrote: * Ingo Molnar mi...@kernel.org wrote: [...] Btw., totally off topic, the following NOP caught my attention: 5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) That's a dead NOP that boats the function a bit, added for the 16 byte alignment of one of the jump targets. I realize that x86 CPU manufacturers recommend 16-byte jump target alignments (it's in the Intel optimization manual), but the cost of that is very significant: text data bss dec filename 125663911617840 108953615273767 vmlinux.align.16-byte 122249511617840 108953614932327 vmlinux.align.1-byte By using 1 byte jump target alignment (i.e. no alignment at all) we get an almost 3% reduction in kernel size (!) - and a probably similar reduction in I$ footprint. So I'm wondering, is the 16 byte jump target optimization suggestion really worth this price? The patch below boots fine and I've not measured any noticeable slowdown, but I've not tried hard. Good point, adding Josh Triplett on CC. I suspect that he might be interested. ;-) Quite interested, yes. Even if there *are* benchmarks to support keeping the optimization (which wouldn't surprise me), it'd be nice to have a Kconfig option to enable the jump-target optimization. (With 'y' meaning pad jump targets to 16 bytes, so that allnoconfig and tinyconfig automatically don't.) - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
On Wed, Apr 01, 2015 at 09:24:20AM +0200, Jonathan Corbet wrote: > On Tue, 31 Mar 2015 15:02:24 -0700 > j...@joshtriplett.org wrote: > > > > This would appear to assume that a clonefd_info structure is the only > > > thing that will ever be read from this descriptor. It seems to me that > > > there is the potential for, someday, wanting to be able to read and write > > > other things as well. Should this structure be marked with type and > > > length fields so that other structures could be added in the future? > > > > I don't think it makes sense for a caller to get an arbitrary structure > > on read(), and have to figure out what they got and ignore something > > they don't understand. Instead, I think it makes more sense for the > > caller to say "Hey, here's a flag saying I understand the new thing, go > > ahead and give me the new thing". So, for instance, if you want to > > receive SIGSTOP/SIGCONT messages for child processes through this > > descriptor, we could add a flag for that. > > The flag is fine, but, once we have set that flag saying we want those > messages, how do we know which type of structure we've gotten? That's > the piece of the puzzle I'm missing, sorry if I'm being overly slow. If you pass a flag saying you can handle a new set of potential structures, those structures can then include any necessary disambiguating flags/IDs/etc. No need for them to match the current clonefd_info structure if userspace has opted into a new version. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
On Wed, Apr 01, 2015 at 09:24:20AM +0200, Jonathan Corbet wrote: On Tue, 31 Mar 2015 15:02:24 -0700 j...@joshtriplett.org wrote: This would appear to assume that a clonefd_info structure is the only thing that will ever be read from this descriptor. It seems to me that there is the potential for, someday, wanting to be able to read and write other things as well. Should this structure be marked with type and length fields so that other structures could be added in the future? I don't think it makes sense for a caller to get an arbitrary structure on read(), and have to figure out what they got and ignore something they don't understand. Instead, I think it makes more sense for the caller to say Hey, here's a flag saying I understand the new thing, go ahead and give me the new thing. So, for instance, if you want to receive SIGSTOP/SIGCONT messages for child processes through this descriptor, we could add a flag for that. The flag is fine, but, once we have set that flag saying we want those messages, how do we know which type of structure we've gotten? That's the piece of the puzzle I'm missing, sorry if I'm being overly slow. If you pass a flag saying you can handle a new set of potential structures, those structures can then include any necessary disambiguating flags/IDs/etc. No need for them to match the current clonefd_info structure if userspace has opted into a new version. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd
On Mon, Apr 06, 2015 at 05:30:35PM +0900, Sergey Senozhatsky wrote: > On (03/15/15 01:00), Josh Triplett wrote: > [..] > > + > > +/* Handle the CLONE_FD case for copy_process. */ > > +int clonefd_do_clone(u64 clone_flags, struct task_struct *p, > > +struct clone4_args *args, struct clonefd_setup *setup) > > +{ > > + int flags; > > + struct file *file; > > + int fd; > > + > > + p->clonefd = !!(clone_flags & CLONE_FD); > > + if (!p->clonefd) > > + return 0; > > + > > + if (args->clonefd_flags & ~(O_CLOEXEC | O_NONBLOCK)) > > + return -EINVAL; > > + > > + init_waitqueue_head(>clonefd_wqh); > > + > > + get_task_struct(p); > > + flags = O_RDONLY | FMODE_ATOMIC_POS | args->clonefd_flags; > > + file = anon_inode_getfile("[process]", _fops, p, flags); > > + if (IS_ERR(file)) { > > + put_task_struct(p); > > + return PTR_ERR(file); > > + } > > + > > + fd = get_unused_fd_flags(flags); > > + if (fd < 0) { > > + put_task_struct(p); ? No, once anon_inode_getfile has succeeded, the file owns the reference to the task_struct, so fput(file) will call the release function which calls put_task_struct. Only the failure case for anon_inode_getfile needs to call put_task_struct directly. > > + fput(file); > > + return fd; > > + } - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd
On Mon, Apr 06, 2015 at 05:30:35PM +0900, Sergey Senozhatsky wrote: On (03/15/15 01:00), Josh Triplett wrote: [..] + +/* Handle the CLONE_FD case for copy_process. */ +int clonefd_do_clone(u64 clone_flags, struct task_struct *p, +struct clone4_args *args, struct clonefd_setup *setup) +{ + int flags; + struct file *file; + int fd; + + p-clonefd = !!(clone_flags CLONE_FD); + if (!p-clonefd) + return 0; + + if (args-clonefd_flags ~(O_CLOEXEC | O_NONBLOCK)) + return -EINVAL; + + init_waitqueue_head(p-clonefd_wqh); + + get_task_struct(p); + flags = O_RDONLY | FMODE_ATOMIC_POS | args-clonefd_flags; + file = anon_inode_getfile([process], clonefd_fops, p, flags); + if (IS_ERR(file)) { + put_task_struct(p); + return PTR_ERR(file); + } + + fd = get_unused_fd_flags(flags); + if (fd 0) { + put_task_struct(p); ? No, once anon_inode_getfile has succeeded, the file owns the reference to the task_struct, so fput(file) will call the release function which calls put_task_struct. Only the failure case for anon_inode_getfile needs to call put_task_struct directly. + fput(file); + return fd; + } - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd
On Mon, Mar 23, 2015 at 05:38:45PM +, David Drysdale wrote: > On Sun, Mar 15, 2015 at 8:00 AM, Josh Triplett wrote: > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > index 9daa017..1dc680b 100644 > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -1374,6 +1374,11 @@ struct task_struct { > > > > unsigned autoreap:1; /* Do not become a zombie on exit */ > > > > +#ifdef CONFIG_CLONEFD > > + unsigned clonefd:1; /* Notify clonefd_wqh on exit */ > > + wait_queue_head_t clonefd_wqh; > > +#endif > > + > > unsigned long atomic_flags; /* Flags needing atomic access. */ > > > > struct restart_block restart_block; > > Idle thought: are there any concerns about the occupancy > impact of adding a wait_queue_head to every task_struct, > whether it has a clonefd or not? > > I guess we could reduce the size somewhat by just > storing a struct file *clonefd_file in the task, and then have > a separate structure (with the wqh and a task_struct*) referenced > by file->private_data. Not sure whether the added complication > would be worthwhile, though. My original patches did exactly that (minus the reference back to the task_struct). However, there are a couple of problems with that approach. First, it assumes that a task_struct has only a single file referencing it, but in the future I'd like to support obtaining a clonefd for an existing task. Second, the task_struct really shouldn't have a reference to the actual struct file, when it only needs the wait_queue_head_t. Also, AFAICT a wait_queue_head_t is normally (in the absence of kernel lock debugging options) the size of two pointers. Adding an indirection and an extra allocation to change that to the size of one pointer seems iffy, especially when looking at the rest of what's directly in task_struct that's far larger. > > --- /dev/null > > +++ b/kernel/clonefd.c > > @@ -0,0 +1,121 @@ > > +/* > > + * Support functions for CLONE_FD > > + * > > + * Copyright (c) 2015 Intel Corporation > > + * Original authors: Josh Triplett > > + * Thiago Macieira > > + */ > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include "clonefd.h" > > + > > +static int clonefd_release(struct inode *inode, struct file *file) > > +{ > > + put_task_struct(file->private_data); > > + return 0; > > +} > > + > > +static unsigned int clonefd_poll(struct file *file, poll_table *wait) > > +{ > > + struct task_struct *p = file->private_data; > > + poll_wait(file, >clonefd_wqh, wait); > > + return p->exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0; > > +} > > + > > +static ssize_t clonefd_read(struct file *file, char __user *buf, size_t > > count, loff_t *ppos) > > +{ > > + struct task_struct *p = file->private_data; > > + int ret = 0; > > + > > + /* EOF after first read */ > > + if (*ppos) > > + return 0; > > + > > + if (file->f_flags & O_NONBLOCK) > > + ret = -EAGAIN; > > + else > > + ret = wait_event_interruptible(p->clonefd_wqh, > > p->exit_state); > > + > > + if (p->exit_state) { > > + struct clonefd_info info = {}; > > + cputime_t utime, stime; > > + task_exit_code_status(p->exit_code, , > > ); > > + info.code &= ~__SI_MASK; > > + task_cputime(p, , ); > > + info.utime = cputime_to_clock_t(utime + p->signal->utime); > > + info.stime = cputime_to_clock_t(stime + p->signal->stime); > > + ret = simple_read_from_buffer(buf, count, ppos, , > > sizeof(info)); > > + } > > + return ret; > > +} > > + > > +static struct file_operations clonefd_fops = { > > + .release = clonefd_release, > > + .poll = clonefd_poll, > > + .read = clonefd_read, > > + .llseek = no_llseek, > > +}; > > It might be nice to include a show_fdinfo() implementation that shows > (say) the pid that the clonefd refers to. E.g. something like: > > static void clonefd_show_fdinfo(struct seq_file *m, struct file *file) > { > struct task_struct *p = file->private_data; > > seq_printf(m, "tid:\t%d\n", task_tgid_vnr(p)); > } I thought about that, but that would add a couple of additional ifdefs (C
Re: [PATCH v2 7/7] clone4: Add a CLONE_FD flag to get task exit notification via fd
On Mon, Mar 23, 2015 at 05:38:45PM +, David Drysdale wrote: On Sun, Mar 15, 2015 at 8:00 AM, Josh Triplett j...@joshtriplett.org wrote: diff --git a/include/linux/sched.h b/include/linux/sched.h index 9daa017..1dc680b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,6 +1374,11 @@ struct task_struct { unsigned autoreap:1; /* Do not become a zombie on exit */ +#ifdef CONFIG_CLONEFD + unsigned clonefd:1; /* Notify clonefd_wqh on exit */ + wait_queue_head_t clonefd_wqh; +#endif + unsigned long atomic_flags; /* Flags needing atomic access. */ struct restart_block restart_block; Idle thought: are there any concerns about the occupancy impact of adding a wait_queue_head to every task_struct, whether it has a clonefd or not? I guess we could reduce the size somewhat by just storing a struct file *clonefd_file in the task, and then have a separate structure (with the wqh and a task_struct*) referenced by file-private_data. Not sure whether the added complication would be worthwhile, though. My original patches did exactly that (minus the reference back to the task_struct). However, there are a couple of problems with that approach. First, it assumes that a task_struct has only a single file referencing it, but in the future I'd like to support obtaining a clonefd for an existing task. Second, the task_struct really shouldn't have a reference to the actual struct file, when it only needs the wait_queue_head_t. Also, AFAICT a wait_queue_head_t is normally (in the absence of kernel lock debugging options) the size of two pointers. Adding an indirection and an extra allocation to change that to the size of one pointer seems iffy, especially when looking at the rest of what's directly in task_struct that's far larger. --- /dev/null +++ b/kernel/clonefd.c @@ -0,0 +1,121 @@ +/* + * Support functions for CLONE_FD + * + * Copyright (c) 2015 Intel Corporation + * Original authors: Josh Triplett j...@joshtriplett.org + * Thiago Macieira thi...@macieira.org + */ +#include linux/anon_inodes.h +#include linux/file.h +#include linux/fs.h +#include linux/poll.h +#include linux/slab.h +#include clonefd.h + +static int clonefd_release(struct inode *inode, struct file *file) +{ + put_task_struct(file-private_data); + return 0; +} + +static unsigned int clonefd_poll(struct file *file, poll_table *wait) +{ + struct task_struct *p = file-private_data; + poll_wait(file, p-clonefd_wqh, wait); + return p-exit_state ? (POLLIN | POLLRDNORM | POLLHUP) : 0; +} + +static ssize_t clonefd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct task_struct *p = file-private_data; + int ret = 0; + + /* EOF after first read */ + if (*ppos) + return 0; + + if (file-f_flags O_NONBLOCK) + ret = -EAGAIN; + else + ret = wait_event_interruptible(p-clonefd_wqh, p-exit_state); + + if (p-exit_state) { + struct clonefd_info info = {}; + cputime_t utime, stime; + task_exit_code_status(p-exit_code, info.code, info.status); + info.code = ~__SI_MASK; + task_cputime(p, utime, stime); + info.utime = cputime_to_clock_t(utime + p-signal-utime); + info.stime = cputime_to_clock_t(stime + p-signal-stime); + ret = simple_read_from_buffer(buf, count, ppos, info, sizeof(info)); + } + return ret; +} + +static struct file_operations clonefd_fops = { + .release = clonefd_release, + .poll = clonefd_poll, + .read = clonefd_read, + .llseek = no_llseek, +}; It might be nice to include a show_fdinfo() implementation that shows (say) the pid that the clonefd refers to. E.g. something like: static void clonefd_show_fdinfo(struct seq_file *m, struct file *file) { struct task_struct *p = file-private_data; seq_printf(m, tid:\t%d\n, task_tgid_vnr(p)); } I thought about that, but that would add a couple of additional ifdefs (CONFIG_PROC_FS), for an informational file of minimal value. More importantly, I don't want to add that until after adding an ioctl or similar to programmatically obtain the pid from a clonefd; otherwise, someone might try to use fdinfo as the API to do so, which would be all kinds of awful. So I'd prefer to add fdinfo in a future extension of clonefd, rather than in the initial patch series. + +/* Do process exit notification for clonefd. */ +void clonefd_do_notify(struct task_struct *p) +{ + if (p-clonefd) + wake_up_all(p-clonefd_wqh); +} + +/* Handle the CLONE_FD case for copy_process. */ +int clonefd_do_clone(u64 clone_flags, struct task_struct
Re: [PATCH] reiserfs: kstrdup() memory handling
On Sun, Mar 22, 2015 at 08:12:46PM -0400, Sanidhya Kashyap wrote: > On Sat, Mar 21, 2015 at 1:15 PM, Josh Triplett wrote: > > On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote: > >> Checking for ENOMEM even for new_opts in reiserfs_remount function as > >> there is a possibility of nothing being allocated. > > > > You don't need to add a new label; kfree(NULL) is a no-op. > > > > Shall I resubmit the patch? I would suggest doing so, yes, after you see if anyone else has feedback on it. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] reiserfs: kstrdup() memory handling
On Sun, Mar 22, 2015 at 08:12:46PM -0400, Sanidhya Kashyap wrote: On Sat, Mar 21, 2015 at 1:15 PM, Josh Triplett j...@joshtriplett.org wrote: On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote: Checking for ENOMEM even for new_opts in reiserfs_remount function as there is a possibility of nothing being allocated. You don't need to add a new label; kfree(NULL) is a no-op. Shall I resubmit the patch? I would suggest doing so, yes, after you see if anyone else has feedback on it. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] reiserfs: kstrdup() memory handling
On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote: > Checking for ENOMEM even for new_opts in reiserfs_remount function as > there is a possibility of nothing being allocated. You don't need to add a new label; kfree(NULL) is a no-op. > Signed-off-by: Sanidhya Kashyap > --- > fs/reiserfs/super.c | 6 ++ > 1 file changed, 6 insertions(+) > > diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c > index 71fbbe3..bf9bc66 100644 > --- a/fs/reiserfs/super.c > +++ b/fs/reiserfs/super.c > @@ -1397,6 +1397,11 @@ static int reiserfs_remount(struct super_block *s, int > *mount_flags, char *arg) > int i; > #endif > > + if (!new_opts) { > + err = -ENOMEM; > + goto out_err_no_kfree; > + } > + > sync_filesystem(s); > reiserfs_write_lock(s); > > @@ -1549,6 +1554,7 @@ out_err_unlock: > reiserfs_write_unlock(s); > out_err: > kfree(new_opts); > +out_err_no_kfree: > return err; > } > > -- > 2.1.0 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] reiserfs: kstrdup() memory handling
On Sat, Mar 21, 2015 at 01:00:13PM -0400, Sanidhya Kashyap wrote: Checking for ENOMEM even for new_opts in reiserfs_remount function as there is a possibility of nothing being allocated. You don't need to add a new label; kfree(NULL) is a no-op. Signed-off-by: Sanidhya Kashyap sanidhya.gat...@gmail.com --- fs/reiserfs/super.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 71fbbe3..bf9bc66 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1397,6 +1397,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif + if (!new_opts) { + err = -ENOMEM; + goto out_err_no_kfree; + } + sync_filesystem(s); reiserfs_write_lock(s); @@ -1549,6 +1554,7 @@ out_err_unlock: reiserfs_write_unlock(s); out_err: kfree(new_opts); +out_err_no_kfree: return err; } -- 2.1.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH v13] sys_membarrier(): system/process-wide memory barrier (x86)
On Tue, Mar 17, 2015 at 07:01:23PM +, Mathieu Desnoyers wrote: > - Original Message - > > On Tue, Mar 17, 2015 at 10:57:50AM -0700, j...@joshtriplett.org wrote: > > > On Tue, Mar 17, 2015 at 06:30:35PM +0100, Peter Zijlstra wrote: > > > > On Tue, Mar 17, 2015 at 01:22:02PM -0400, Mathieu Desnoyers wrote: > > > > > Here is an implementation of a new system call, sys_membarrier(), > > > > > which > > > > > executes a memory barrier on either all running threads of the current > > > > > process (MEMBARRIER_PRIVATE) issues a memory barrier on all threads > > > > > running on the system (~MEMBARRIER_PRIVATE). Both are currently > > > > > implemented by calling synchronize_sched(). > > > > > > > > Then why bother with the flag? > > > > > > Semantically, MEMBARRIER_PRIVATE is allowed to avoid issuing a barrier > > > on CPUs not running the current process if it can, while > > > ~MEMBARRIER_PRIVATE may not. (The latter would be useful for > > > applications such as system-wide tracing.) That they're currently both > > > implemented the same way doesn't mean they're semantically equivalent. > > > > Sure; but why bother with pointless fluff like that? We can always > > introduce the private flag if and when it starts to make sense having > > it. > > Without the expedited implementation, the only usefulness of the > private flag is to skip synchronize_sched() if called from a > single-threaded process. > > We could easily argue that if a process is using sys_membarrier in > the first place, it's very likely that it is multithreaded. So I > agree that we can drop the flag for now, and add it later on, > e.g. when adding the expedited mode. > > I am tempted to leave the "flags" argument in place though, along > with the "MEMBARRIER_QUERY" flag. Thoughts ? You should definitely *always* supply a flags argument with any new syscall, even if you have no flags yet. As for QUERY, I assume that exists because the more expensive forms of barriers (e.g. expedited) are sufficiently invasive that you don't want to trigger one when not needed just to test if they work? If so, then yeah, having that flag seems fine. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH v13] sys_membarrier(): system/process-wide memory barrier (x86)
On Tue, Mar 17, 2015 at 07:01:23PM +, Mathieu Desnoyers wrote: - Original Message - On Tue, Mar 17, 2015 at 10:57:50AM -0700, j...@joshtriplett.org wrote: On Tue, Mar 17, 2015 at 06:30:35PM +0100, Peter Zijlstra wrote: On Tue, Mar 17, 2015 at 01:22:02PM -0400, Mathieu Desnoyers wrote: Here is an implementation of a new system call, sys_membarrier(), which executes a memory barrier on either all running threads of the current process (MEMBARRIER_PRIVATE) issues a memory barrier on all threads running on the system (~MEMBARRIER_PRIVATE). Both are currently implemented by calling synchronize_sched(). Then why bother with the flag? Semantically, MEMBARRIER_PRIVATE is allowed to avoid issuing a barrier on CPUs not running the current process if it can, while ~MEMBARRIER_PRIVATE may not. (The latter would be useful for applications such as system-wide tracing.) That they're currently both implemented the same way doesn't mean they're semantically equivalent. Sure; but why bother with pointless fluff like that? We can always introduce the private flag if and when it starts to make sense having it. Without the expedited implementation, the only usefulness of the private flag is to skip synchronize_sched() if called from a single-threaded process. We could easily argue that if a process is using sys_membarrier in the first place, it's very likely that it is multithreaded. So I agree that we can drop the flag for now, and add it later on, e.g. when adding the expedited mode. I am tempted to leave the flags argument in place though, along with the MEMBARRIER_QUERY flag. Thoughts ? You should definitely *always* supply a flags argument with any new syscall, even if you have no flags yet. As for QUERY, I assume that exists because the more expensive forms of barriers (e.g. expedited) are sufficiently invasive that you don't want to trigger one when not needed just to test if they work? If so, then yeah, having that flag seems fine. - Josh Triplett -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH] sys_membarrier(): system/process-wide memory barrier (x86) (v12)
On Sun, Mar 15, 2015 at 03:24:19PM -0400, Mathieu Desnoyers wrote: > Here is an implementation of a new system call, sys_membarrier(), which > executes a memory barrier on either all running threads of the current > process (MEMBARRIER_PRIVATE_FLAG) or calls synchronize_sched() to issue > a memory barrier on all threads running on the system. It can be used to > distribute the cost of user-space memory barriers asymmetrically by > transforming pairs of memory barriers into pairs consisting of > sys_membarrier() and a compiler barrier. For synchronization primitives > that distinguish between read-side and write-side (e.g. userspace RCU, > rwlocks), the read-side can be accelerated significantly by moving the > bulk of the memory barrier overhead to the write-side. >From a quick review, this seems quite reasonable (as it did 5 years ago). One request: Could you please add a config option (default y) in the EXPERT menu to disable this? You actually seem to already have it marked as a cond_syscall. Also, a very minor nit: flags in kernel APIs aren't typically named with a _FLAG suffix. With the syscall made optional, and with or without that naming nit fixed: Reviewed-by: Josh Triplett - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 5/7] clone4: Add a CLONE_AUTOREAP flag to automatically reap the child process
On Sun, Mar 15, 2015 at 08:55:06PM +0100, Oleg Nesterov wrote: > On 03/15, Josh Triplett wrote: > > On Sun, Mar 15, 2015 at 03:52:23PM +0100, Oleg Nesterov wrote: > > > On 03/15, Josh Triplett wrote: > > > > Add a CLONE_AUTOREAP flag to request this behavior unconditionally, > > > > > > Yes, CLONE_AUTOREAP is much better. And I agree (mostly) with that > > > we should rely on do_notify_parent(). > > > > > > Howver the patch still doesn't look right. First of all, ->autoreap > > > should be per-process, not per-thread. > > > > Ah, you're thinking of the case where the parent process launches a > > ... > > Not really, although we probably need more sanity checks. > > It should be per-process simply because this "autoreap" affects the whole > process. And the sub-threads are already "autoreap". And these 2 autoreap's > semantics differ, we should not confuse them. Will the approach I suggested, of having clones with CLONE_THREAD inherit the autoreap value rather than setting it from CLONE_AUTOREAP, implement the semantics you're looking for? Also, are you suggesting that CLONE_AUTOREAP with CLONE_THREAD should produce -EINVAL, or just that it should be ignored? > > (As an aside, what *is* the use case for CLONE_PARENT without > > CLONE_THREAD?) > > To me CLONE_PARENT is another historical mistake and the source of misc > problems ;) I kinda figured. :) > > > And there are ptrace/mt issues, > > > it seems. Just for example, we should avoid EXIT_TRACE if autoreap in > > > wait_task_zombie() even if we are going to re-notify parent. > > > > I don't see how EXIT_TRACE can happen in wait_task_zombie if autoreap is > > set. wait_task_zombie does a cmpxchg with exit_state and doesn't > > proceed unless exit_state was EXIT_ZOMBIE, and I don't see how we can > > ever reach the EXIT_ZOMBIE state if autoreap. > > Because you again forgot about ptrace ;) > > Josh. Let me try to summarise this later when I have time. Again, I am > not sure, perhaps this is even simpler than I currently think. And let > me apologize in advance, most probably I will be busy tomorrow. I look forward to your later review and feedback. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 5/7] clone4: Add a CLONE_AUTOREAP flag to automatically reap the child process
On Sun, Mar 15, 2015 at 03:52:23PM +0100, Oleg Nesterov wrote: > On 03/15, Josh Triplett wrote: > > Add a CLONE_AUTOREAP flag to request this behavior unconditionally, > > Yes, CLONE_AUTOREAP is much better. And I agree (mostly) with that > we should rely on do_notify_parent(). > > Howver the patch still doesn't look right. First of all, ->autoreap > should be per-process, not per-thread. Ah, you're thinking of the case where the parent process launches a child with CLONE_AUTOREAP, that child process launches siblings with CLONE_THREAD and without CLONE_AUTOREAP, and one of those siblings is the last to exit? That seems easy enough to handle: instead of setting ->autoreap unconditionally in copy_process, I can set it only in the non-CLONE_THREAD case, and otherwise let it inherit. Then every task in the group will have the same value for autoreap. (As an aside, what *is* the use case for CLONE_PARENT without CLONE_THREAD?) > And there are ptrace/mt issues, > it seems. Just for example, we should avoid EXIT_TRACE if autoreap in > wait_task_zombie() even if we are going to re-notify parent. I don't see how EXIT_TRACE can happen in wait_task_zombie if autoreap is set. wait_task_zombie does a cmpxchg with exit_state and doesn't proceed unless exit_state was EXIT_ZOMBIE, and I don't see how we can ever reach the EXIT_ZOMBIE state if autoreap. > EXCEPT: do we really want SIGCHLD from the exiting child? I think we > do not. I won't really argue though, but this should be discussed and > documented. IIUC, with your patch it is still sent. I think we do, yes. The caller of clone can already specify what signal they want, including no signal at all. If they specify a signal (SIGCHLD or otherwise) along with CLONE_AUTOREAP, we can send that signal. I don't think that causes any particular problem. That's the same semantic you'd get if you have a SIGCHLD handler with SA_NOCLDWAIT: you'd still get the signal, even though you don't need to (and can't) wait on the child process. > Josh, please give me some time to think and re-check, I'll write another > email next week. I am not sure this is really needed, but it seems to > me that we need the preparation patch to make this change clear/simple. I'd appreciate any feedback you can offer on this series, including any potential subtle interactions with ptrace. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/6] CLONE_FD: Task exit notification via file descriptor
On Sun, Mar 15, 2015 at 10:18:05AM +, David Drysdale wrote: > On Sat, Mar 14, 2015 at 7:29 PM, Josh Triplett wrote: > > On Sat, Mar 14, 2015 at 12:03:12PM -0700, Thiago Macieira wrote: > >> On Friday 13 March 2015 18:11:32 Thiago Macieira wrote: > >> > On Friday 13 March 2015 14:51:47 Andy Lutomirski wrote: > >> > > In any event, we should find out what FreeBSD does in response to > >> > > read(2) on the fd. > >> > > >> > I've just successfully installed FreeBSD and compiled qtbase (main > >> > package > >> > of Qt 5) on it. > >> > > >> > I'll test pdfork during the weekend and report its behaviour. > >> > >> Here are my findings about pdfork. > >> > >> Source: http://fxr.watson.org/fxr/source/kern/sys_procdesc.c?v=FREEBSD10 > >> Qt adaptations: https://codereview.qt-project.org/108561 > >> > >> Processes created with pdfork() are normal processes that still send > >> SIGCHLD > >> to their parents. The only difference is that you get the extra file > >> descriptor > >> that can be passed to the pdgetpid() system call and works on > >> select()/poll(). > >> Trying to read from that file descriptor will result in EOPNOTSUPP. > > > > OK, since read() doesn't work on a pdfork() file descriptor, we don't > > have to worry about compatibility with pdfork()'s read result. > > > > However, if the expectation is that pdfork()ed child processes still > > send SIGCHLD, then I don't see how we can be compatible there, nor do I > > think we want to; as you mention below, that breaks the ability to > > encapsulate management of the created process entirely within a library. > > I didn't think that was the case -- my understanding was that pdfork()ed > children would not generate SIGCHLD (and that does seem to be the > case with a quick test program). Well, either way, v2 of this series is capable of producing either behavior. You can have a clonefd and still receive SIGCHLD or any other signal, or none at all, and you can decide independently from that if you want autoreaping or waiting. > As an aside, I do think there are some aspects of FreeBSD's process > descriptors that aren't quite right yet, particularly their interaction with > waitpid(-1, ...) -- IIRC pdfork()ed children are visible to it, but I'd expect > them not to be (to allow libraries to use sub-processes invisibly to the > programs using them). There's a thread at: > https://lists.cam.ac.uk/pipermail/cl-capsicum-discuss/2014-March/thread.html > but I'm not sure that anything came of that discussion. As long as you don't use the Linux-specific flags __WALL or __WCLONE, a process created with clone will be invisible to wait if it has an exit signal other than SIGCHLD. That's true independent of this patch series. So you can decide if you want processes visible to wait or not. > As it happens, I'm meeting Robert Watson (one of the progenitors > of Capsicum/process descriptors) tomorrow, so I'll chase further. Sounds good. > >> Since they've never implemented pdwait4() (it's not even declared in the > >> headers), the only way to reap a child if you only have the file > >> descriptor is > >> to first pdgetpid() and then call wait4() or wait6(). > > > > Which suggests that we shouldn't try to implement pdwait4() in glibc > > until FreeBSD implements it in their kernel, since we won't know the > > exact semantics they expect. > > By the way, I should point out one part of the FreeBSD design > which might help explain some of the semantics. > > Process descriptors are particularly designed to be used with > Capsicum, which is a security framework where file descriptors > get extra rights associated with them, and the kernel polices > the use of those rights (e.g. you need CAP_READ for read(2) > operations; normal file descriptors implicitly have all of the > rights for back-compatibility). > https://www.freebsd.org/cgi/man.cgi?query=capsicum=4 > > Capsicum also includes 'capability mode', where system calls > that access global namespaces are disabled -- including the > pid namespace. > > So process descriptors are the only way to manipulate child > processes when a program is in capability mode -- and this > means that pdkill() is then genuinely needed over and above > kill(pdgetpid(),...). Thanks for the explanation. I've seen some details about Capsicum, and I found it quite interesting. I'm particularly interested in the notion of getting rid of global namespaces in favor of descriptors or similar mechanisms that you need specific rights to. Does Capsicum do anything t
Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
On Sun, Mar 15, 2015 at 12:59:17AM -0700, Josh Triplett wrote: > This patch series also introduces a clone flag CLONE_AUTOREAP, which causes > the > kernel to automatically reap the child process when it exits, just as it does > for processes using SIGCHLD when the parent has SIGCHLD ignored or marked as > SA_NOCLDSTOP. Typo: s/SA_NOCLDSTOP/SA_NOCLDWAIT/. - Josh Triplett -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 4/7] kernel/fork.c: Pass arguments to _do_fork and copy_process using clone4_args
Rather than continuing to add arguments to _do_fork and copy_process for future clone4 extensions, with corresponding churn in every caller, pass the arguments using the clone4_args structure instead. This allows clone4 to avoid unpacking the arguments, and allows other callers to use C99 structure initializers to only initialize the arguments they care about. Future extensions to clone4_args will thus not need to touch clone4, fork, vfork, or other callers of _do_fork. Signed-off-by: Josh Triplett Signed-off-by: Thiago Macieira --- kernel/fork.c | 77 +++ 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 8a21f9e..db9012a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1188,12 +1188,9 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * flags). The actual kick-off is left to the caller. */ static struct task_struct *copy_process(u64 clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *child_tidptr, + struct clone4_args *args, struct pid *pid, - int trace, - unsigned long tls) + int trace) { int retval; struct task_struct *p; @@ -1405,7 +1402,7 @@ static struct task_struct *copy_process(u64 clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); + retval = copy_thread_tls(clone_flags, args->stack_start, args->stack_size, p, args->tls); if (retval) goto bad_fork_cleanup_io; @@ -1416,11 +1413,11 @@ static struct task_struct *copy_process(u64 clone_flags, goto bad_fork_cleanup_io; } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->ctid : NULL; /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->ctid : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1617,7 +1614,8 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, _struct_pid, 0, 0); + struct clone4_args args = {}; + task = copy_process(CLONE_VM, , _struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1632,13 +1630,7 @@ struct task_struct *fork_idle(int cpu) * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -static long _do_fork( - u64 clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - unsigned long tls) +static long _do_fork(u64 clone_flags, struct clone4_args *args) { struct task_struct *p; int trace = 0; @@ -1662,8 +1654,7 @@ static long _do_fork( trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, -child_tidptr, NULL, trace, tls); + p = copy_process(clone_flags, args, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1678,7 +1669,7 @@ static long _do_fork( nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); + put_user(nr, args->ptid); if (clone_flags & CLONE_VFORK) { p->vfork_done = @@ -1722,9 +1713,13 @@ long do_fork(unsigned long clone_flags, int __user *parent_tidptr, int __user *child_tidptr) { - return _do_fork(squelch_clone_flags(clone_flags), - stack_start, stack_size, - parent_tidptr, child_tidptr, 0); + struct clone4_args kargs = { + .ptid = parent_tidptr, + .ctid = child_tidptr, + .stack_start = stack_start, + .stack_start = stack_size, + }; + return _do_fork(squelch_clone_flags(clone_flags), );
[PATCH v2 man-pages] clone4.2: New manpage documenting clone4(2)
Also includes new cross-reference from clone.2. Signed-off-by: Josh Triplett --- man2/clone.2 | 1 + man2/clone4.2 | 345 ++ 2 files changed, 346 insertions(+) create mode 100644 man2/clone4.2 diff --git a/man2/clone.2 b/man2/clone.2 index 752c01e..7013885 100644 --- a/man2/clone.2 +++ b/man2/clone.2 @@ -1209,6 +1209,7 @@ main(int argc, char *argv[]) } .fi .SH SEE ALSO +.BR clone4 (2), .BR fork (2), .BR futex (2), .BR getpid (2), diff --git a/man2/clone4.2 b/man2/clone4.2 new file mode 100644 index 000..f237ebc --- /dev/null +++ b/man2/clone4.2 @@ -0,0 +1,345 @@ +.\" Based on clone.2: +.\" Copyright (c) 1992 Drew Eckhardt , March 28, 1992 +.\" and Copyright (c) Michael Kerrisk, 2001, 2002, 2005, 2013 +.\" +.\" %%%LICENSE_START(GPL_NOVERSION_ONELINE) +.\" May be distributed under the GNU General Public License. +.\" %%%LICENSE_END +.TH CLONE4 2 2015-03-14 "Linux" "Linux Programmer's Manual" +.SH NAME +clone4 \- create a child process +.SH SYNOPSIS +.nf +/* Prototype for the glibc wrapper function */ + +.B #define _GNU_SOURCE +.B #include + +.BI "int clone4(uint64_t " flags , +.BI " size_t " args_size , +.BI " struct clone4_args *" args , +.BI " int (*" "fn" ")(void *), void *" arg ); + +/* Prototype for the raw system call */ + +.BI "int clone4(unsigned " flags_high ", unsigned " flags_low , +.BI " unsigned long " args_size , +.BI " struct clone4_args *" args ); + +struct clone4_args { +pid_t *ptid; +pid_t *ctid; +unsigned long stack_start; +unsigned long stack_size; +unsigned long tls; +int *clonefd; +unsigned clonefd_flags; +}; + +.SH DESCRIPTION +.BR clone4 () +creates a new process, similar to +.BR clone (2) +and +.BR fork (2). +.BR clone4 () +supports additional flags that +.BR clone (2) +does not, and accepts arguments via an extensible structure. + +.I args +points to a +.I clone4_args +structure, and +.I args_size +must contain the size of that structure, as understood by the caller. If the +caller passes a shorter structure than the kernel expects, the remaining fields +will default to 0. If the caller passes a larger structure than the kernel +expects (such as one from a newer kernel), +.BR clone4 () +will return +.BR EINVAL . +The +.I clone4_args +structure may gain additional fields at the end in the future, and callers must +only pass a size that encompasses the number of fields they understand. If the +caller passes 0 for +.IR args_size , +.I args +is ignored and may be NULL. + +In the +.I clone4_args +structure, +.IR ptid , +.IR ctid , +.IR stack_start , +.IR stack_size , +and +.I tls +have the same semantics as they do with +.BR clone (2) +and +.BR clone2 (2). + +In the glibc wrapper, +.I fn +and +.I arg +have the same semantics as they do with +.BR clone (2). +As with +.BR clone (2), +the underlying system call works more like +.BR fork (2), +returning 0 in the child process; the glibc wrapper simplifies thread execution +by calling +.IR fn ( arg ) +and exiting the child when that function exits. + +The 64-bit +.I flags +argument (split into the 32-bit +.I flags_high +and +.I flags_low +arguments in the kernel interface for portability across architectures) +accepts all the same flags as +.BR clone (2), +with the exception of the obsolete +.BR CLONE_PID , +.BR CLONE_DETACHED , +and +.BR CLONE_STOPPED . +In addition, +.I flags +accepts the following flags: + +.TP +.B CLONE_AUTOREAP +When the new process exits, immediately reap it, rather than keeping it around +as a "zombie" until a call to +.BR waitpid (2) +or similar. Without this flag, the kernel will automatically reap a process if +its exit signal is set to +.BR SIGCHLD , +and if the parent process has +.B SIGCHLD +set to +.B SIG_IGN +or has a +.B SIGCHLD +handler installed with +.B SA_NOCLDWAIT +(see +.BR sigaction (2)). +.B CLONE_AUTOREAP +allows the calling process to enable automatic reaping with an exit signal +other than +.B SIGCHLD +(including 0 to disable the exit signal), and does not depend on the +configuration of process-wide signal handling. + +.TP +.B CLONE_FD +Return a file descriptor associated with the new process, storing it in +location +.I clonefd +in the parent's address space. When the new process exits, the file descriptor +will become available for reading. + +Unlike using +.BR signalfd (2) +for the +.B SIGCHLD +signal, +the file descriptor returned by +.BR clone4 () +with the +.B CLONE_FD +flag works even with +.B SIGCHLD +unblocked in one or more threads of the parent process, allowing the process to +have different handlers for different child processes, such as those created by +a library, without introducing race conditions around process-wide signal +handling. + +.I clonefd_