Re: [PATCH v10 26/26] x86/cet/shstk: Add arch_prctl functions for shadow stack

2020-05-22 Thread Eugene Syromiatnikov
On Fri, May 22, 2020 at 10:17:43AM -0700, Yu-cheng Yu wrote:
> On Thu, 2020-05-21 at 15:42 -0700, Kees Cook wrote:
> > On Wed, Apr 29, 2020 at 03:07:32PM -0700, Yu-cheng Yu wrote:
> [...]
> > > +
> > > +int prctl_cet(int option, u64 arg2)
> > > +{
> > > + struct cet_status *cet;
> > > +
> > > + if (!IS_ENABLED(CONFIG_X86_INTEL_CET))
> > > + return -EINVAL;
> > 
> > Using -EINVAL here means userspace can't tell the difference between an
> > old kernel and a kernel not built with CONFIG_X86_INTEL_CET. Perhaps
> > -ENOTSUPP?
> 
> Looked into this.  The kernel and GLIBC are not in sync.  So maybe we still 
> use
> EINVAL here?
> 
> Yu-cheng
> 
> 
> 
> In kernel:
> --
> 
> #define EOPNOTSUPP95
> #define ENOTSUPP  524
> 
> In GLIBC:
> -
> 
> printf("ENOTSUP=%d\n", ENOTSUP);
> printf("EOPNOTSUPP=%d\n", EOPNOTSUPP);
> printf("%s=524\n", strerror(524));
>  
> ENOTSUP=95
> EOPNOTSUPP=95
> Unknown error 524=524

EOPNOTSUPP/ENOTSUP/ENOTSUPP is actually a mess, it's summarized recently
by Michael Kerrisk[1].  From the kernel's point of view, I think it
would be reasonable to return EOPNOTSUPP, and expect that the userspace
would use ENOTSUP to match against it.

[1] 
https://lore.kernel.org/linux-man/cb4c685b-6c5d-9c16-aade-0c95e57de...@gmail.com/



Re: [PATCH v2] uapi, posix-timers: provide clockid-related macros and functions to UAPI

2020-05-12 Thread Eugene Syromiatnikov
On Tue, May 12, 2020 at 10:58:16PM +0300, Sergey Organov wrote:
> Eugene Syromiatnikov  writes:
> 
> > As of now, there is no interface exposed for converting pid/fd into
> > clockid and vice versa; linuxptp, for example, has been carrying these
> > definitions in missing.h header for quite some time[1].
> >
> > [1] https://sourceforge.net/p/linuxptp/code/ci/af380e86/tree/missing.h
> >
> > Signed-off-by: Eugene Syromiatnikov 
> > ---
> > Changes since v1[1]:
> >  * Actually tried to build with the patch and fixed the build error
> >reported by kbuild test robot[2].
> >
> > [1] https://lkml.org/lkml/2019/9/20/698
> > [2] https://lkml.org/lkml/2019/9/22/13
> > ---
> >  include/linux/posix-timers.h | 47 
> > +--
> >  include/uapi/linux/time.h| 48 
> > 
> >  2 files changed, 49 insertions(+), 46 deletions(-)
> 
> Was this patch applied, rejected, lost?
> 
> I can't find it in the current master.

IIRC, it was ignored.



[PATCH v2] uapi, posix-timers: provide clockid-related macros and functions to UAPI

2019-09-23 Thread Eugene Syromiatnikov
As of now, there is no interface exposed for converting pid/fd into
clockid and vice versa; linuxptp, for example, has been carrying these
definitions in missing.h header for quite some time[1].

[1] https://sourceforge.net/p/linuxptp/code/ci/af380e86/tree/missing.h

Signed-off-by: Eugene Syromiatnikov 
---
Changes since v1[1]:
 * Actually tried to build with the patch and fixed the build error
   reported by kbuild test robot[2].

[1] https://lkml.org/lkml/2019/9/20/698
[2] https://lkml.org/lkml/2019/9/22/13
---
 include/linux/posix-timers.h | 47 +--
 include/uapi/linux/time.h| 48 
 2 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3d10c84..ddc7e6e6 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,58 +4,13 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
 struct kernel_siginfo;
 struct task_struct;
 
-/*
- * Bit fields within a clockid:
- *
- * The most significant 29 bits hold either a pid or a file descriptor.
- *
- * Bit 2 indicates whether a cpu clock refers to a thread or a process.
- *
- * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
- *
- * A clockid is invalid if bits 2, 1, and 0 are all set.
- */
-#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
-#define CPUCLOCK_PERTHREAD(clock) \
-   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-
-#define CPUCLOCK_PERTHREAD_MASK4
-#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
-#define CPUCLOCK_CLOCK_MASK3
-#define CPUCLOCK_PROF  0
-#define CPUCLOCK_VIRT  1
-#define CPUCLOCK_SCHED 2
-#define CPUCLOCK_MAX   3
-#define CLOCKFDCPUCLOCK_MAX
-#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
-
-static inline clockid_t make_process_cpuclock(const unsigned int pid,
-   const clockid_t clock)
-{
-   return ((~pid) << 3) | clock;
-}
-static inline clockid_t make_thread_cpuclock(const unsigned int tid,
-   const clockid_t clock)
-{
-   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
-}
-
-static inline clockid_t fd_to_clockid(const int fd)
-{
-   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
-}
-
-static inline int clockid_to_fd(const clockid_t clk)
-{
-   return ~(clk >> 3);
-}
-
 #ifdef CONFIG_POSIX_TIMERS
 
 /**
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 958932e..58a78a7 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -70,4 +70,52 @@ struct itimerval {
  */
 #define TIMER_ABSTIME  0x01
 
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
+#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+   (((clock) & (__kernel_clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+
+#define CPUCLOCK_PERTHREAD_MASK4
+#define CPUCLOCK_WHICH(clock)  \
+   ((clock) & (__kernel_clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK3
+#define CPUCLOCK_PROF  0
+#define CPUCLOCK_VIRT  1
+#define CPUCLOCK_SCHED 2
+#define CPUCLOCK_MAX   3
+#define CLOCKFDCPUCLOCK_MAX
+#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
+
+static inline __kernel_clockid_t make_process_cpuclock(const unsigned int pid,
+   const clockid_t clock)
+{
+   return ((~pid) << 3) | clock;
+}
+static inline __kernel_clockid_t make_thread_cpuclock(const unsigned int tid,
+   const clockid_t clock)
+{
+   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
+}
+
+static inline __kernel_clockid_t fd_to_clockid(const int fd)
+{
+   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
+}
+
+static inline int clockid_to_fd(const __kernel_clockid_t clk)
+{
+   return ~(clk >> 3);
+}
+
+
 #endif /* _UAPI_LINUX_TIME_H */
-- 
2.1.4



[PATCH] uapi, posix-timers: provide clockid-related macros and functions to UAPI

2019-09-20 Thread Eugene Syromiatnikov
As of now, there is no interface exposed for converting pid/fd into
clockid and vice versa; linuxptp, for example, has been carrying these
definitions in missing.h header for quite some time[1].

[1] https://sourceforge.net/p/linuxptp/code/ci/af380e86/tree/missing.h

Signed-off-by: Eugene Syromiatnikov 
---
 include/linux/posix-timers.h | 47 +---
 include/uapi/linux/time.h| 47 
 2 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3d10c84..ddc7e6e6 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,58 +4,13 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
 struct kernel_siginfo;
 struct task_struct;
 
-/*
- * Bit fields within a clockid:
- *
- * The most significant 29 bits hold either a pid or a file descriptor.
- *
- * Bit 2 indicates whether a cpu clock refers to a thread or a process.
- *
- * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
- *
- * A clockid is invalid if bits 2, 1, and 0 are all set.
- */
-#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
-#define CPUCLOCK_PERTHREAD(clock) \
-   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-
-#define CPUCLOCK_PERTHREAD_MASK4
-#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
-#define CPUCLOCK_CLOCK_MASK3
-#define CPUCLOCK_PROF  0
-#define CPUCLOCK_VIRT  1
-#define CPUCLOCK_SCHED 2
-#define CPUCLOCK_MAX   3
-#define CLOCKFDCPUCLOCK_MAX
-#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
-
-static inline clockid_t make_process_cpuclock(const unsigned int pid,
-   const clockid_t clock)
-{
-   return ((~pid) << 3) | clock;
-}
-static inline clockid_t make_thread_cpuclock(const unsigned int tid,
-   const clockid_t clock)
-{
-   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
-}
-
-static inline clockid_t fd_to_clockid(const int fd)
-{
-   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
-}
-
-static inline int clockid_to_fd(const clockid_t clk)
-{
-   return ~(clk >> 3);
-}
-
 #ifdef CONFIG_POSIX_TIMERS
 
 /**
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 958932e..41a004c 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -70,4 +70,51 @@ struct itimerval {
  */
 #define TIMER_ABSTIME  0x01
 
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
+#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+
+#define CPUCLOCK_PERTHREAD_MASK4
+#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK3
+#define CPUCLOCK_PROF  0
+#define CPUCLOCK_VIRT  1
+#define CPUCLOCK_SCHED 2
+#define CPUCLOCK_MAX   3
+#define CLOCKFDCPUCLOCK_MAX
+#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
+
+static inline clockid_t make_process_cpuclock(const unsigned int pid,
+   const clockid_t clock)
+{
+   return ((~pid) << 3) | clock;
+}
+static inline clockid_t make_thread_cpuclock(const unsigned int tid,
+   const clockid_t clock)
+{
+   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
+}
+
+static inline clockid_t fd_to_clockid(const int fd)
+{
+   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
+}
+
+static inline int clockid_to_fd(const clockid_t clk)
+{
+   return ~(clk >> 3);
+}
+
+
 #endif /* _UAPI_LINUX_TIME_H */
-- 
2.1.4



[PATCH v3 3/3] drivers/md/raid5-ppl.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2019-09-20 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
---
 drivers/md/raid5-ppl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 18a4064..cab5b13 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1404,7 +1404,7 @@ int ppl_init_log(struct r5conf *conf)
atomic64_set(_conf->seq, 0);
INIT_LIST_HEAD(_conf->no_mem_stripes);
spin_lock_init(_conf->no_mem_stripes_lock);
-   ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET;
+   ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET;
 
if (!mddev->external) {
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, 
sizeof(mddev->uuid));
-- 
2.1.4



[PATCH v3 2/3] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2019-09-20 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 223e97a..2f71f6f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1134,7 +1134,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1187,7 +1187,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH v3 0/3] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2019-09-20 Thread Eugene Syromiatnikov
Hello.

This is a small fix of a typo (or, more specifically, some remnant of
the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
with "H" is used in man page and everywhere else, it's probably worth
to make the name used in the fcntl.h UAPI header in line with it.
The two follow-up patches update usage sites of this constant in kernel
to use the new spelling.

The old name is retained as it is part of UAPI now.

Changes since v2[1]:
 * Updated RWF_WRITE_LIFE_NOT_SET constant usage
   in drivers/md/raid5-ppl.c:ppl_init_log().

Changes since v1[2]:
 * Changed format of the commit ID in the commit message of the first patch.
 * Removed bogus Signed-off-by that snuck into the resend of the series.

[1] https://lkml.org/lkml/2018/10/30/34
[2] https://lkml.org/lkml/2018/10/26/88

Eugene Syromiatnikov (3):
  fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint name
  drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET
  drivers/md/raid5-ppl.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

 drivers/md/raid5-ppl.c   | 2 +-
 drivers/md/raid5.c   | 4 ++--
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 5 files changed, 20 insertions(+), 6 deletions(-)

-- 
2.1.4



[PATCH v3 1/3] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint name

2019-09-20 Thread Eugene Syromiatnikov
According to commit message in the original commit c75b1d9421f8 ("fs:
add fcntl() interface for setting/getting write life time hints"),
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in the early versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3d40771..41b6438 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 1d33835..1f97b33 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -58,7 +58,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -66,6 +66,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling is remained from the first
+ * versions of the patch set that introduced the feature, see commit
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 1d33835..1f97b33 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -58,7 +58,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -66,6 +66,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling is remained from the first
+ * versions of the patch set that introduced the feature, see commit
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



[PATCH net v2 1/3] uapi, net/smc: move protocol constant definitions to UAPI

2019-09-20 Thread Eugene Syromiatnikov
SMCPROTO_* constants are expected to be used by userspace[1],
but they were defined in a private header instead.

[1] http://manpages.ubuntu.com/manpages/cosmic/man7/af_smc.7.html

Fixes: ac7138746e14 ("smc: establish new socket family")
Fixes: aaa4d33f6da1 ("net/smc: enable ipv6 support for smc")
Signed-off-by: Eugene Syromiatnikov 
---
 include/uapi/linux/smc.h | 7 ++-
 net/smc/smc.h| 4 +---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index 0e11ca4..10561f8 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -2,7 +2,8 @@
 /*
  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
  *
- *  Definitions for generic netlink based configuration of an SMC-R PNET table
+ *  Definitions for SMC protocol and generic netlink based configuration
+ *  of an SMC-R PNET table
  *
  *  Copyright IBM Corp. 2016
  *
@@ -12,6 +13,10 @@
 #ifndef _UAPI_LINUX_SMC_H_
 #define _UAPI_LINUX_SMC_H_
 
+/* AF_SMC socket protocols */
+#define SMCPROTO_SMC   0   /* SMC protocol, IPv4 */
+#define SMCPROTO_SMC6  1   /* SMC protocol, IPv6 */
+
 /* Netlink SMC_PNETID attributes */
 enum {
SMC_PNETID_UNSPEC,
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 878313f..e60effc 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -15,12 +15,10 @@
 #include 
 #include  /* __aligned */
 #include 
+#include 
 
 #include "smc_ib.h"
 
-#define SMCPROTO_SMC   0   /* SMC protocol, IPv4 */
-#define SMCPROTO_SMC6  1   /* SMC protocol, IPv6 */
-
 extern struct proto smc_proto;
 extern struct proto smc_proto6;
 
-- 
2.1.4



[PATCH net v2 3/3] uapi, net/smc: provide socket state constants in UAPI

2019-09-20 Thread Eugene Syromiatnikov
As socket state itself is already exposed via sock_diag interface.

Fixes: ac7138746e14 ("smc: establish new socket family")
Fixes: b38d732477e4 ("smc: socket closing and linkgroup cleanup")
Signed-off-by: Eugene Syromiatnikov 
---
 include/uapi/linux/smc_diag.h | 17 +
 net/smc/smc.h | 18 +-
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h
index 8cb3a6f..6798ec0 100644
--- a/include/uapi/linux/smc_diag.h
+++ b/include/uapi/linux/smc_diag.h
@@ -31,6 +31,23 @@ struct smc_diag_msg {
__aligned_u64   diag_inode;
 };
 
+enum smc_state {   /* possible states of an SMC socket */
+   SMC_ACTIVE  = 1,
+   SMC_INIT= 2,
+   SMC_CLOSED  = 7,
+   SMC_LISTEN  = 10,
+   /* normal close */
+   SMC_PEERCLOSEWAIT1  = 20,
+   SMC_PEERCLOSEWAIT2  = 21,
+   SMC_APPFINCLOSEWAIT = 24,
+   SMC_APPCLOSEWAIT1   = 22,
+   SMC_APPCLOSEWAIT2   = 23,
+   SMC_PEERFINCLOSEWAIT= 25,
+   /* abnormal close */
+   SMC_PEERABORTWAIT   = 26,
+   SMC_PROCESSABORT= 27,
+};
+
 /* Mode of a connection */
 enum {
SMC_DIAG_MODE_SMCR,
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e60effc..7eaef72 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -16,6 +16,7 @@
 #include  /* __aligned */
 #include 
 #include 
+#include 
 
 #include "smc_ib.h"
 
@@ -26,23 +27,6 @@ extern struct proto smc_proto6;
 #define KERNEL_HAS_ATOMIC64
 #endif
 
-enum smc_state {   /* possible states of an SMC socket */
-   SMC_ACTIVE  = 1,
-   SMC_INIT= 2,
-   SMC_CLOSED  = 7,
-   SMC_LISTEN  = 10,
-   /* normal close */
-   SMC_PEERCLOSEWAIT1  = 20,
-   SMC_PEERCLOSEWAIT2  = 21,
-   SMC_APPFINCLOSEWAIT = 24,
-   SMC_APPCLOSEWAIT1   = 22,
-   SMC_APPCLOSEWAIT2   = 23,
-   SMC_PEERFINCLOSEWAIT= 25,
-   /* abnormal close */
-   SMC_PEERABORTWAIT   = 26,
-   SMC_PROCESSABORT= 27,
-};
-
 struct smc_link_group;
 
 struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
-- 
2.1.4



[PATCH net v2 2/3] uapi, net/smc: provide fallback diagnostic codes in UAPI

2019-09-20 Thread Eugene Syromiatnikov
Since the values to which these codes are corresponding to are already
exposed via sock_diag interface.

Signed-off-by: Eugene Syromiatnikov 
---
 include/uapi/linux/smc.h | 25 +
 net/smc/smc_clc.h| 22 --
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index 10561f8..9fbf365 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -17,6 +17,31 @@
 #define SMCPROTO_SMC   0   /* SMC protocol, IPv4 */
 #define SMCPROTO_SMC6  1   /* SMC protocol, IPv6 */
 
+/* Diagnostic codes */
+#define SMC_CLC_DECL_MEM   0x0101  /* insufficient memory resources  */
+#define SMC_CLC_DECL_TIMEOUT_CL0x0201  /* timeout w4 QP confirm 
link */
+#define SMC_CLC_DECL_TIMEOUT_AL0x0202  /* timeout w4 QP add link   
  */
+#define SMC_CLC_DECL_CNFERR0x0300  /* configuration error*/
+#define SMC_CLC_DECL_PEERNOSMC 0x0301  /* peer did not indicate SMC  */
+#define SMC_CLC_DECL_IPSEC 0x0302  /* IPsec usage*/
+#define SMC_CLC_DECL_NOSMCDEV  0x0303  /* no SMC device found (R or D)   */
+#define SMC_CLC_DECL_NOSMCDDEV 0x03030001  /* no SMC-D device found  */
+#define SMC_CLC_DECL_NOSMCRDEV 0x03030002  /* no SMC-R device found  */
+#define SMC_CLC_DECL_SMCDNOTALK0x03030003  /* SMC-D dev can't talk to 
peer   */
+#define SMC_CLC_DECL_MODEUNSUPP0x0304  /* smc modes do not match 
(R or D)*/
+#define SMC_CLC_DECL_RMBE_EC   0x0305  /* peer has eyecatcher in RMBE*/
+#define SMC_CLC_DECL_OPTUNSUPP 0x0306  /* fastopen sockopt not supported */
+#define SMC_CLC_DECL_DIFFPREFIX0x0307  /* IP prefix / subnet 
mismatch*/
+#define SMC_CLC_DECL_GETVLANERR0x0308  /* err to get vlan id of ip 
device*/
+#define SMC_CLC_DECL_ISMVLANERR0x0309  /* err to reg vlan id on 
ism dev  */
+#define SMC_CLC_DECL_SYNCERR   0x0400  /* synchronization error  */
+#define SMC_CLC_DECL_PEERDECL  0x0500  /* peer declined during handshake */
+#define SMC_CLC_DECL_INTERR0x0999  /* internal error */
+#define SMC_CLC_DECL_ERR_RTOK  0x09990001  /*   rtoken handling failed   */
+#define SMC_CLC_DECL_ERR_RDYLNK0x09990002  /*   ib ready link failed   
  */
+#define SMC_CLC_DECL_ERR_REGRMB0x09990003  /*   reg rmb failed 
  */
+
+
 /* Netlink SMC_PNETID attributes */
 enum {
SMC_PNETID_UNSPEC,
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index ca20927..db6a78f 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -28,28 +28,6 @@
 #define SMC_TYPE_B 3   /* SMC-R and SMC-D*/
 #define CLC_WAIT_TIME  (6 * HZ)/* max. wait time on clcsock  */
 #define CLC_WAIT_TIME_SHORTHZ  /* short wait time on clcsock */
-#define SMC_CLC_DECL_MEM   0x0101  /* insufficient memory resources  */
-#define SMC_CLC_DECL_TIMEOUT_CL0x0201  /* timeout w4 QP confirm 
link */
-#define SMC_CLC_DECL_TIMEOUT_AL0x0202  /* timeout w4 QP add link   
  */
-#define SMC_CLC_DECL_CNFERR0x0300  /* configuration error*/
-#define SMC_CLC_DECL_PEERNOSMC 0x0301  /* peer did not indicate SMC  */
-#define SMC_CLC_DECL_IPSEC 0x0302  /* IPsec usage*/
-#define SMC_CLC_DECL_NOSMCDEV  0x0303  /* no SMC device found (R or D)   */
-#define SMC_CLC_DECL_NOSMCDDEV 0x03030001  /* no SMC-D device found  */
-#define SMC_CLC_DECL_NOSMCRDEV 0x03030002  /* no SMC-R device found  */
-#define SMC_CLC_DECL_SMCDNOTALK0x03030003  /* SMC-D dev can't talk to 
peer   */
-#define SMC_CLC_DECL_MODEUNSUPP0x0304  /* smc modes do not match 
(R or D)*/
-#define SMC_CLC_DECL_RMBE_EC   0x0305  /* peer has eyecatcher in RMBE*/
-#define SMC_CLC_DECL_OPTUNSUPP 0x0306  /* fastopen sockopt not supported */
-#define SMC_CLC_DECL_DIFFPREFIX0x0307  /* IP prefix / subnet 
mismatch*/
-#define SMC_CLC_DECL_GETVLANERR0x0308  /* err to get vlan id of ip 
device*/
-#define SMC_CLC_DECL_ISMVLANERR0x0309  /* err to reg vlan id on 
ism dev  */
-#define SMC_CLC_DECL_SYNCERR   0x0400  /* synchronization error  */
-#define SMC_CLC_DECL_PEERDECL  0x0500  /* peer declined during handshake */
-#define SMC_CLC_DECL_INTERR0x0999  /* internal error */
-#define SMC_CLC_DECL_ERR_RTOK  0x09990001  /*   rtoken handling failed   */
-#define SMC_CLC_DECL_ERR_RDYLNK0x09990002  /*   ib ready link failed   
  */
-#define SMC_CLC_DECL_ERR_REGRMB0x09990003  /*   reg rmb failed 
  */
 
 struct smc_clc_msg_hdr {   /* header1 of clc messages */
u8 eyecatcher[4];   /* eye catcher */
-- 
2.1.4



[PATCH net v2 0/3] net/smc: move some definitions to UAPI

2019-09-20 Thread Eugene Syromiatnikov
Hello.

As of now, it's a bit difficult to use SMC protocol, as significant part
of definitions related to it are defined in private headers and are not
part of UAPI. The following commits move some definitions to UAPI,
making them readily available to the user space.

Changes since v1[1]:
 * Patch "provide fallback diagnostic codes in UAPI" is updated
   in accordance with the updated set of diagnostic codes.

[1] https://lkml.org/lkml/2018/10/7/177

Eugene Syromiatnikov (3):
  uapi, net/smc: move protocol constant definitions to UAPI
  uapi, net/smc: provide fallback diagnostic codes in UAPI
  uapi, net/smc: provide socket state constants in UAPI

 include/uapi/linux/smc.h  | 32 +++-
 include/uapi/linux/smc_diag.h | 17 +
 net/smc/smc.h | 22 ++
 net/smc/smc_clc.h | 22 --
 4 files changed, 50 insertions(+), 43 deletions(-)

-- 
2.1.4



[PATCH v3 5/6] selftests/clone3: enable clone3 self-tests on all architectures

2019-09-11 Thread Eugene Syromiatnikov
clone3() is available on most architectures, so there's no reason to
restrict the respective self-tests to x86_64.

* tools/testing/selftests/clone3/Makefile (TEST_GEN_PROGS): Set always,
not only ifeq ($(ARCH),x86_64).

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/testing/selftests/clone3/Makefile 
b/tools/testing/selftests/clone3/Makefile
index 4efcf45..faa069c 100644
--- a/tools/testing/selftests/clone3/Makefile
+++ b/tools/testing/selftests/clone3/Makefile
@@ -4,8 +4,6 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
 
 CFLAGS += -I../../../../usr/include/
 
-ifeq ($(ARCH),x86_64)
-   TEST_GEN_PROGS := clone3 clone3_set_tid
-endif
+TEST_GEN_PROGS := clone3 clone3_set_tid
 
 include ../lib.mk
-- 
2.1.4



[PATCH v3 1/6] selftests/clone3: convert test modes into an enum

2019-09-11 Thread Eugene Syromiatnikov
* tools/testing/selftests/clone3/clone3.c (CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0, CLONE3_ARGS_ALL_1): Change into an enum.
(call_clone3): Change test_mode parameter type to enum test_mode;
use switch statement for actions that dependent on test_mode selection.
(test_clone3): Change test_mode parameter type to enum test_mode.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index a0f1989..7b65ee5 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -24,16 +24,18 @@
 /* V1 includes set_tid */
 #define CLONE3_ARGS_SIZE_V1 72
 
-#define CLONE3_ARGS_NO_TEST 0
-#define CLONE3_ARGS_ALL_0 1
-#define CLONE3_ARGS_ALL_1 2
+enum test_mode {
+   CLONE3_ARGS_NO_TEST,
+   CLONE3_ARGS_ALL_0,
+   CLONE3_ARGS_ALL_1,
+};
 
 static pid_t raw_clone(struct clone_args *args, size_t size)
 {
return syscall(__NR_clone3, args, size);
 }
 
-static int call_clone3(int flags, size_t size, int test_mode)
+static int call_clone3(int flags, size_t size, enum test_mode test_mode)
 {
struct clone_args args = {0};
pid_t ppid = -1;
@@ -46,7 +48,8 @@ static int call_clone3(int flags, size_t size, int test_mode)
if (size == 0)
size = sizeof(struct clone_args);
 
-   if (test_mode == CLONE3_ARGS_ALL_0) {
+   switch (test_mode) {
+   case CLONE3_ARGS_ALL_0:
args.flags = 0;
args.pidfd = 0;
args.child_tid = 0;
@@ -56,7 +59,9 @@ static int call_clone3(int flags, size_t size, int test_mode)
args. stack_size = 0;
args.tls = 0;
args.set_tid = 0;
-   } else if (test_mode == CLONE3_ARGS_ALL_1) {
+   break;
+
+   case CLONE3_ARGS_ALL_1:
args.flags = 1;
args.pidfd = 1;
args.child_tid = 1;
@@ -66,6 +71,7 @@ static int call_clone3(int flags, size_t size, int test_mode)
args. stack_size = 1;
args.tls = 1;
args.set_tid = 1;
+   break;
}
 
pid = raw_clone(, size);
@@ -91,7 +97,8 @@ static int call_clone3(int flags, size_t size, int test_mode)
return 0;
 }
 
-static int test_clone3(int flags, size_t size, int expected, int test_mode)
+static int test_clone3(int flags, size_t size, int expected,
+  enum test_mode test_mode)
 {
int ret;
 
-- 
2.1.4



[PATCH v3 2/6] selftests/clone3: add a check for invalid exit_signal

2019-09-11 Thread Eugene Syromiatnikov
Check that the kernel fails calls with exit_signal with non-zero highest
32 bits, negative 32-bit exit_signal, and invalid exit_signal withing
CSIGNAL mask, like legacy clone syscalls do.

* tools/testing/selftests/clone3/clone3.c (enum test_mode): Add
CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG.
(call_clone3): Add args.exit_signal initialisation in case test_mode
is equal to one of the added enum test_mode values.
(main): Add test_clone3 clone check with test_mode equal to the added
enum test_mode values.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 36 +
 1 file changed, 36 insertions(+)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 7b65ee5..4837865 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -28,6 +28,10 @@ enum test_mode {
CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0,
CLONE3_ARGS_ALL_1,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
 };
 
 static pid_t raw_clone(struct clone_args *args, size_t size)
@@ -72,6 +76,22 @@ static int call_clone3(int flags, size_t size, enum 
test_mode test_mode)
args.tls = 1;
args.set_tid = 1;
break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG:
+   args.exit_signal = 0xbadc0dedULL;
+   break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG:
+   args.exit_signal = 0x8000ULL;
+   break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG:
+   args.exit_signal = 0x0100ULL;
+   break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG:
+   args.exit_signal = 0x00f0ULL;
+   break;
}
 
pid = raw_clone(, size);
@@ -146,6 +166,22 @@ int main(int argc, char *argv[])
/* Do a clone3() with all members set to 1 */
if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL, CLONE3_ARGS_ALL_1))
goto on_error;
+   /* Do a clone3() with exit_signal having highest 32 bits non-zero */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG))
+   goto on_error;
+   /* Do a clone3() with negative 32-bit exit_signal */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG))
+   goto on_error;
+   /* Do a clone3() with exit_signal not fitting into CSIGNAL mask */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG))
+   goto on_error;
+   /* Do a clone3() with NSIG < exit_signal < CSIG */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG))
+   goto on_error;
/*
 * Do a clone3() with sizeof(struct clone_args) + 8
 * and all members set to 0.
-- 
2.1.4



[PATCH v3 6/6] selftests: add clone3 to TARGETS

2019-09-11 Thread Eugene Syromiatnikov
* tools/testing/selftests/Makefile (TARGETS): Add clone3.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 25b43a8c..05163e4 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
+TARGETS += clone3
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
-- 
2.1.4



[PATCH v3 4/6] selftests/clone3: fix up format strings

2019-09-11 Thread Eugene Syromiatnikov
* tools/testing/selftests/clone3/clone3.c (test_clone3): Change format
qualifier for printing size field from %d to %zu; place colon right
after the word "says".

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 264d03a..86f888b 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -124,11 +124,11 @@ static int test_clone3(uint64_t flags, size_t size, int 
expected,
 {
int ret;
 
-   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %d)"
+   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %zu)"
"\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
-   ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
+   ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
getpid(), ret, expected);
if (ret != expected)
ksft_exit_fail_msg(
-- 
2.1.4



[PATCH v3 3/6] selftests/clone3: use uint64_t for flags parameter

2019-09-11 Thread Eugene Syromiatnikov
Flags parameter in both userspace and kernel clone args is 64-bit wide,
there's little reason to have it signed and 32-bit in tests.

* tools/testing/selftests/clone3/clone3.c: Include  and
.
(call_clone3): Change flags parameter type from int to uint64_t.
(test_clone3): Change flags parameter type from int to uint64_t; change
the format string that prints it accordingly.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 4837865..264d03a 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -4,8 +4,10 @@
 
 #define _GNU_SOURCE
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -39,7 +41,7 @@ static pid_t raw_clone(struct clone_args *args, size_t size)
return syscall(__NR_clone3, args, size);
 }
 
-static int call_clone3(int flags, size_t size, enum test_mode test_mode)
+static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 {
struct clone_args args = {0};
pid_t ppid = -1;
@@ -117,12 +119,13 @@ static int call_clone3(int flags, size_t size, enum 
test_mode test_mode)
return 0;
 }
 
-static int test_clone3(int flags, size_t size, int expected,
+static int test_clone3(uint64_t flags, size_t size, int expected,
   enum test_mode test_mode)
 {
int ret;
 
-   ksft_print_msg("[%d] Trying clone3() with flags 0x%x (size %d)\n",
+   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %d)"
+   "\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
-- 
2.1.4



[PATCH v3 0/6] Update clone3 self-tests

2019-09-11 Thread Eugene Syromiatnikov
Hello.

This patch set updates clone3 selftest in several aspects:
 - adding checks for exit_signal invalid values handling;
 - adding clone3 to selftests targets;
 - enabling clone3 tests on all architectures;
 - minor cleanups of the clone3 test.

This respin alignes additional clone3 self-tests with v3 of the
exit_signal checking patch[1].

Applied on top of brauer/linux.git/for-next.

Changes since v2[2]:
 - CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG check is now expected to fail.

Changes since v1[3]:
 - exit_signal check extended to cover more cases of invalid
   exit_signal value.

[1] https://lkml.org/lkml/2019/9/11/677
[2] https://lkml.org/lkml/2019/9/10/768
[3] https://lkml.org/lkml/2019/9/10/416

Eugene Syromiatnikov (6):
  selftests/clone3: convert test modes into an enum
  selftests/clone3: add a check for invalid exit_signal
  selftests/clone3: use uint64_t for flags parameter
  selftests/clone3: fix up format strings
  selftests/clone3: enable clone3 self-tests on all architectures
  selftests: add clone3 to TARGETS

 tools/testing/selftests/Makefile|  1 +
 tools/testing/selftests/clone3/Makefile |  4 +--
 tools/testing/selftests/clone3/clone3.c | 64 -
 3 files changed, 57 insertions(+), 12 deletions(-)

-- 
2.1.4



[PATCH v3] fork: check exit_signal passed in clone3() call

2019-09-11 Thread Eugene Syromiatnikov
Previously, higher 32 bits of exit_signal fields were lost when
copied to the kernel args structure (that uses int as a type for the
respective field).  Moreover, as Oleg has noted[1], exit_signal is used
unchecked, so it has to be checked for sanity before use; for the legacy
syscalls, applying CSIGNAL mask guarantees that it is at least non-negative;
however, there's no such thing is done in clone3() code path, and that can
break at least thread_group_leader.

Adding checks that user-passed exit_signal fits into int and passes
valid_signal() check solves both of these problems.

[1] https://lkml.org/lkml/2019/9/10/467

* kernel/fork.c (copy_clone_args_from_user): Fail with -EINVAL if
args.exit_signal is greater than UINT_MAX or is not a valid signal.
(_do_fork): Note that exit_signal is expected to be checked for the
sanity by the caller.

Fixes: 7f192e3cd316 ("fork: add clone3")
Reported-by: Oleg Nesterov 
Co-authored-by: Oleg Nesterov 
Co-authored-by: Dmitry V. Levin 
Signed-off-by: Eugene Syromiatnikov 
---
 kernel/fork.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e..f98314b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2338,6 +2338,8 @@ struct mm_struct *copy_init_mm(void)
  *
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
  */
 long _do_fork(struct kernel_clone_args *args)
 {
@@ -2562,6 +2564,15 @@ noinline static int copy_clone_args_from_user(struct 
kernel_clone_args *kargs,
if (copy_from_user(, uargs, size))
return -EFAULT;
 
+   /*
+* Two separate checks are needed, as valid_signal() takes unsigned long
+* as an argument, and struct kernel_clone_args uses int type
+* for the exit_signal field.
+*/
+   if (unlikely((args.exit_signal > UINT_MAX) ||
+!valid_signal(args.exit_signal)))
+   return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags  = args.flags,
.pidfd  = u64_to_user_ptr(args.pidfd),
-- 
2.1.4



[PATCH v3] fork: check exit_signal passed in clone3() call

2019-09-11 Thread Eugene Syromiatnikov
Hello.

As was agreed[1][2], clone3 should fail if the provided exit_signal
value fails valid_signal() check, hence the new version.

Changees since v2[3][4]:
 - Rewrite the check to check exit_signal against valid_signal().

Changes since v1[5]:
 - Check changed to comparison against negated CSIGNAL to address
   the bug reported by Oleg[6].
 - Added a comment to _do_fork that exit_signal has to be checked
   by the caller.

[1] https://lkml.org/lkml/2019/9/11/503
[2] https://lkml.org/lkml/2019/9/11/518
[3] https://lkml.org/lkml/2019/9/10/764
[4] https://lkml.org/lkml/2019/9/10/765
[5] https://lkml.org/lkml/2019/9/10/411
[6] https://lkml.org/lkml/2019/9/10/467


Eugene Syromiatnikov (1):
  fork: check exit_signal passed in clone3() call

 kernel/fork.c | 11 +++
 1 file changed, 11 insertions(+)

-- 
2.1.4



Re: [PATCH v2] fork: check exit_signal passed in clone3() call

2019-09-11 Thread Eugene Syromiatnikov
On Wed, Sep 11, 2019 at 04:54:47PM +0200, Christian Brauner wrote:
> On Wed, Sep 11, 2019 at 03:32:13PM +0100, Eugene Syromiatnikov wrote:
> > On Wed, Sep 11, 2019 at 04:16:36PM +0200, Christian Brauner wrote:
> > > On Wed, Sep 11, 2019 at 03:52:36PM +0200, Christian Brauner wrote:
> > > > On Wed, Sep 11, 2019 at 06:48:52AM -0700, Andrew Morton wrote:
> > > > > What are the user-visible runtime effects of this bug?
> > 
> > The userspace can set -1 as an exit_signal, and that will break process
> > signalling and reaping.
> > 
> > > > > Relatedly, should this fix be backported into -stable kernels?  If 
> > > > > so, why?
> > > > 
> > > > No, as I said in my other mail clone3() is not in any released kernel
> > > > yet. clone3() is going to be released in v5.3.
> > > 
> > > Sigh, I spoke to soon... Hm, this is placed in _do_fork(). There's a
> > > chance that this might be visible in legacy clone if anyone passes in an
> > > invalid signal greater than NSIG right now somehow, they'd now get
> > > EINVAL if I'm seeing this right.
> > > 
> > > So an alternative might be to only fix this in clone3() only right now
> > > and get this patch into 5.3 to not release clone3() with this bug from
> > > legacy clone duplicated.
> > > And we defer the actual legacy clone fix until after next merge window
> > > having it stew in linux-next for a couple of rcs. Distros often pull in
> > > rcs so if anyone notices a regression for legacy clone we'll know about
> > > it... valid_signal() checks at process exit time when the parent is
> > > supposed to be notifed will catch faulty signals anyway so it's not that
> > > big of a deal.
> > 
> > As the patch is written, only copy_clone_args_from_user is touched (which
> > is used only by clone3 and not legacy clone), and the check added
> 
> Great!
> 
> > replicates legacy clone behaviour: userspace can set 0..CSIGNAL
> > as an exit_signal.   Having ability to set exit_signal in NSIG..CSIGNAL
> 
> Hm. The way I see it for clone3() it would make sense to only have <
> NSIG right away. valid_signal() won't let through anything else
> anyway... Since clone3() isn't out yet it doesn't make sense to
> replicate the (buggy) behavior of legacy clone, right?

I was thinking about that and in the end decided to go with CSIGNAL;
the only issue I see here is that it prevents for libc to easily
switch clone() library call implementation to clone3(), in case there
are some applications that rely on this kind of behaviour; if there's
no such userspace users, then switch to valid_signal() check for both
legacy and clone3 should be fine (along with possible switching to u64
for kernel_clone_args's exit_signal, so the check can done in one place),
but I'm agree with your hesitance to do it right now.

> Christian


Re: [PATCH v2] fork: check exit_signal passed in clone3() call

2019-09-11 Thread Eugene Syromiatnikov
On Wed, Sep 11, 2019 at 04:16:36PM +0200, Christian Brauner wrote:
> On Wed, Sep 11, 2019 at 03:52:36PM +0200, Christian Brauner wrote:
> > On Wed, Sep 11, 2019 at 06:48:52AM -0700, Andrew Morton wrote:
> > > What are the user-visible runtime effects of this bug?

The userspace can set -1 as an exit_signal, and that will break process
signalling and reaping.

> > > Relatedly, should this fix be backported into -stable kernels?  If so, 
> > > why?
> > 
> > No, as I said in my other mail clone3() is not in any released kernel
> > yet. clone3() is going to be released in v5.3.
> 
> Sigh, I spoke to soon... Hm, this is placed in _do_fork(). There's a
> chance that this might be visible in legacy clone if anyone passes in an
> invalid signal greater than NSIG right now somehow, they'd now get
> EINVAL if I'm seeing this right.
> 
> So an alternative might be to only fix this in clone3() only right now
> and get this patch into 5.3 to not release clone3() with this bug from
> legacy clone duplicated.
> And we defer the actual legacy clone fix until after next merge window
> having it stew in linux-next for a couple of rcs. Distros often pull in
> rcs so if anyone notices a regression for legacy clone we'll know about
> it... valid_signal() checks at process exit time when the parent is
> supposed to be notifed will catch faulty signals anyway so it's not that
> big of a deal.

As the patch is written, only copy_clone_args_from_user is touched (which
is used only by clone3 and not legacy clone), and the check added
replicates legacy clone behaviour: userspace can set 0..CSIGNAL
as an exit_signal.   Having ability to set exit_signal in NSIG..CSIGNAL
renge seems to be a bug, but at least it seems to be harmless one
and indeed may be addressed separately in the future.


[PATCH v2 1/6] selftests/clone3: convert test modes into an enum

2019-09-10 Thread Eugene Syromiatnikov
* tools/testing/selftests/clone3/clone3.c (CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0, CLONE3_ARGS_ALL_1): Change into an enum.
(call_clone3): Change test_mode parameter type to enum test_mode;
use switch statement for actions that dependent on test_mode selection.
(test_clone3): Change test_mode parameter type to enum test_mode.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index a0f1989..7b65ee5 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -24,16 +24,18 @@
 /* V1 includes set_tid */
 #define CLONE3_ARGS_SIZE_V1 72
 
-#define CLONE3_ARGS_NO_TEST 0
-#define CLONE3_ARGS_ALL_0 1
-#define CLONE3_ARGS_ALL_1 2
+enum test_mode {
+   CLONE3_ARGS_NO_TEST,
+   CLONE3_ARGS_ALL_0,
+   CLONE3_ARGS_ALL_1,
+};
 
 static pid_t raw_clone(struct clone_args *args, size_t size)
 {
return syscall(__NR_clone3, args, size);
 }
 
-static int call_clone3(int flags, size_t size, int test_mode)
+static int call_clone3(int flags, size_t size, enum test_mode test_mode)
 {
struct clone_args args = {0};
pid_t ppid = -1;
@@ -46,7 +48,8 @@ static int call_clone3(int flags, size_t size, int test_mode)
if (size == 0)
size = sizeof(struct clone_args);
 
-   if (test_mode == CLONE3_ARGS_ALL_0) {
+   switch (test_mode) {
+   case CLONE3_ARGS_ALL_0:
args.flags = 0;
args.pidfd = 0;
args.child_tid = 0;
@@ -56,7 +59,9 @@ static int call_clone3(int flags, size_t size, int test_mode)
args. stack_size = 0;
args.tls = 0;
args.set_tid = 0;
-   } else if (test_mode == CLONE3_ARGS_ALL_1) {
+   break;
+
+   case CLONE3_ARGS_ALL_1:
args.flags = 1;
args.pidfd = 1;
args.child_tid = 1;
@@ -66,6 +71,7 @@ static int call_clone3(int flags, size_t size, int test_mode)
args. stack_size = 1;
args.tls = 1;
args.set_tid = 1;
+   break;
}
 
pid = raw_clone(, size);
@@ -91,7 +97,8 @@ static int call_clone3(int flags, size_t size, int test_mode)
return 0;
 }
 
-static int test_clone3(int flags, size_t size, int expected, int test_mode)
+static int test_clone3(int flags, size_t size, int expected,
+  enum test_mode test_mode)
 {
int ret;
 
-- 
2.1.4



[PATCH v2 5/6] selftests/clone3: enable clone3 self-tests on all architectures

2019-09-10 Thread Eugene Syromiatnikov
clone3() is available on most architectures, so there's no reason to
restrict the respective self-tests to x86_64.

* tools/testing/selftests/clone3/Makefile (TEST_GEN_PROGS): Set always,
not only ifeq ($(ARCH),x86_64).

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/testing/selftests/clone3/Makefile 
b/tools/testing/selftests/clone3/Makefile
index 4efcf45..faa069c 100644
--- a/tools/testing/selftests/clone3/Makefile
+++ b/tools/testing/selftests/clone3/Makefile
@@ -4,8 +4,6 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
 
 CFLAGS += -I../../../../usr/include/
 
-ifeq ($(ARCH),x86_64)
-   TEST_GEN_PROGS := clone3 clone3_set_tid
-endif
+TEST_GEN_PROGS := clone3 clone3_set_tid
 
 include ../lib.mk
-- 
2.1.4



[PATCH v2 3/6] selftests/clone3: use uint64_t for flags parameter

2019-09-10 Thread Eugene Syromiatnikov
Flags parameter in both userspace and kernel clone args is 64-bit wide,
there's little reason to have it signed and 32-bit in tests.

* tools/testing/selftests/clone3/clone3.c: Include  and
.
(call_clone3): Change flags parameter type from int to uint64_t.
(test_clone3): Change flags parameter type from int to uint64_t; change
the format string that prints it accordingly.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 318189f9..613544b 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -4,8 +4,10 @@
 
 #define _GNU_SOURCE
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -39,7 +41,7 @@ static pid_t raw_clone(struct clone_args *args, size_t size)
return syscall(__NR_clone3, args, size);
 }
 
-static int call_clone3(int flags, size_t size, enum test_mode test_mode)
+static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 {
struct clone_args args = {0};
pid_t ppid = -1;
@@ -117,12 +119,13 @@ static int call_clone3(int flags, size_t size, enum 
test_mode test_mode)
return 0;
 }
 
-static int test_clone3(int flags, size_t size, int expected,
+static int test_clone3(uint64_t flags, size_t size, int expected,
   enum test_mode test_mode)
 {
int ret;
 
-   ksft_print_msg("[%d] Trying clone3() with flags 0x%x (size %d)\n",
+   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %d)"
+   "\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
-- 
2.1.4



[PATCH v2 4/6] selftests/clone3: fix up format strings

2019-09-10 Thread Eugene Syromiatnikov
* tools/testing/selftests/clone3/clone3.c (test_clone3): Change format
qualifier for printing size field from %d to %zu; place colon right
after the word "says".

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 613544b..eb705f6 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -124,11 +124,11 @@ static int test_clone3(uint64_t flags, size_t size, int 
expected,
 {
int ret;
 
-   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %d)"
+   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %zu)"
"\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
-   ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
+   ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
getpid(), ret, expected);
if (ret != expected)
ksft_exit_fail_msg(
-- 
2.1.4



[PATCH v2 6/6] selftests: add clone3 to TARGETS

2019-09-10 Thread Eugene Syromiatnikov
* tools/testing/selftests/Makefile (TARGETS): Add clone3.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 25b43a8c..05163e4 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
+TARGETS += clone3
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
-- 
2.1.4



[PATCH v2 2/6] selftests/clone3: add a check for invalid exit_signal

2019-09-10 Thread Eugene Syromiatnikov
Check that the kernel fails calls with exit_signal with non-zero highest
32 bits, negative 32-bit exit_signal, and not failing on passing
invalid exit_signal withing CSIGNAL mask, like legacy clone syscalls do.

* tools/testing/selftests/clone3/clone3.c (enum test_mode): Add
CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG.
(call_clone3): Add args.exit_signal initialisation in case test_mode
is equal to one of the added enum test_mode values.
(main): Add test_clone3 clone check with test_mode equal to the added
enum test_mode values.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 36 +
 1 file changed, 36 insertions(+)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 7b65ee5..318189f9 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -28,6 +28,10 @@ enum test_mode {
CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0,
CLONE3_ARGS_ALL_1,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
 };
 
 static pid_t raw_clone(struct clone_args *args, size_t size)
@@ -72,6 +76,22 @@ static int call_clone3(int flags, size_t size, enum 
test_mode test_mode)
args.tls = 1;
args.set_tid = 1;
break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG:
+   args.exit_signal = 0xbadc0dedULL;
+   break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG:
+   args.exit_signal = 0x8000ULL;
+   break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG:
+   args.exit_signal = 0x0100ULL;
+   break;
+
+   case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG:
+   args.exit_signal = 0x00f0ULL;
+   break;
}
 
pid = raw_clone(, size);
@@ -146,6 +166,22 @@ int main(int argc, char *argv[])
/* Do a clone3() with all members set to 1 */
if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL, CLONE3_ARGS_ALL_1))
goto on_error;
+   /* Do a clone3() with exit_signal having highest 32 bits non-zero */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG))
+   goto on_error;
+   /* Do a clone3() with negative 32-bit exit_signal */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG))
+   goto on_error;
+   /* Do a clone3() with exit_signal not fitting into CSIGNAL mask */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG))
+   goto on_error;
+   /* Do a clone3() with NSIG < exit_signal < CSIG */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, 0,
+   CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG))
+   goto on_error;
/*
 * Do a clone3() with sizeof(struct clone_args) + 8
 * and all members set to 0.
-- 
2.1.4



[PATCH v2 0/6] Update clone3 self-tests

2019-09-10 Thread Eugene Syromiatnikov
Hello.

This patch set updates clone3 selftest in several aspects:
 - adding checks for exit_signal invalid values handling;
 - adding clone3 to selftests targets;
 - enabling clone3 tests on all architectures;
 - minor cleanups of the clone3 test.

Applied on top of brauer/linux.git/for-next.

Changes since v1[1]:
 - exit_signal check extended to cover more cases of invalid
   exit_signal value.

[1] https://lkml.org/lkml/2019/9/10/416

Eugene Syromiatnikov (6):
  selftests/clone3: convert test modes into an enum
  selftests/clone3: add a check for invalid exit_signal
  selftests/clone3: use uint64_t for flags parameter
  selftests/clone3: fix up format strings
  selftests/clone3: enable clone3 self-tests on all architectures
  selftests: add clone3 to TARGETS

 tools/testing/selftests/Makefile|  1 +
 tools/testing/selftests/clone3/Makefile |  4 +--
 tools/testing/selftests/clone3/clone3.c | 64 -
 3 files changed, 57 insertions(+), 12 deletions(-)

-- 
2.1.4



[PATCH v2] fork: check exit_signal passed in clone3() call

2019-09-10 Thread Eugene Syromiatnikov
Hello.

After some consideration, I've decided to utilise Oleg's proposal[1]
"(args.exit_signal & ~((u64)CSIGNAL))" as a check. I still don't like
it, as it mixes argument copy check (I'm not sure if it's ever needed,
however, as I'm not sure if there's a reason for exit_signal field
of struct kernel_clone_args to have int type) with argument sanity
check; moreover, it covers only clone3 case, and the code in
copy_process is still error-prone in the long run.  Ideally, the check
should be somewhere in the one place, but as of now this one place
is likely _do_fork, but it's kinda weir to have argument check there
as of now.

Changes since v1[2]:
 - Check changed to comparison against negated CSIGNAL to address
   the bug reported by Oleg[3].
 - Added a comment to _do_fork that exit_signal has to be checked
   by the caller.

[1] https://lkml.org/lkml/2019/9/10/581
[2] https://lkml.org/lkml/2019/9/10/411
[3] https://lkml.org/lkml/2019/9/10/467

Eugene Syromiatnikov (1):
  fork: check exit_signal passed in clone3() call

 kernel/fork.c | 12 
 1 file changed, 12 insertions(+)

-- 
2.1.4



[PATCH v2] fork: check exit_signal passed in clone3() call

2019-09-10 Thread Eugene Syromiatnikov
Previously, higher 32 bits of exit_signal fields were lost when
copied to the kernel args structure (that uses int as a type for the
respective field).  Moreover, as Oleg has noted[1], exit_signal is used
unchecked, so it has to be checked for sanity before use; for the legacy
syscalls, applying CSIGNAL mask guarantees that it is at least non-negative;
however, there's no such thing is done in clone3() code path, and that can
break at least thread_group_leader.

Checking user-passed exit_signal against ~CSIGNAL mask solves both
of these problems.

[1] https://lkml.org/lkml/2019/9/10/467

* kernel/fork.c (copy_clone_args_from_user): Fail with -EINVAL if
args.exit_signal has bits set outside CSIGNAL mask.
(_do_fork): Note that exit_signal is expected to be checked for the
sanity by the caller.

Fixes: 7f192e3cd316 ("fork: add clone3")
Reported-by: Oleg Nesterov 
Signed-off-by: Eugene Syromiatnikov 
---
 kernel/fork.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e..9dee2ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2338,6 +2338,8 @@ struct mm_struct *copy_init_mm(void)
  *
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
  */
 long _do_fork(struct kernel_clone_args *args)
 {
@@ -2562,6 +2564,16 @@ noinline static int copy_clone_args_from_user(struct 
kernel_clone_args *kargs,
if (copy_from_user(, uargs, size))
return -EFAULT;
 
+   /*
+* exit_signal is confined to CSIGNAL mask in legacy syscalls,
+* so it is used unchecked deeper in syscall handling routines;
+* moreover, copying to struct kernel_clone_args.exit_signals
+* trims higher 32 bits, so it is has to be checked that they
+* are zero.
+*/
+   if (unlikely(args.exit_signal & ~((u64)CSIGNAL)))
+   return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags  = args.flags,
.pidfd  = u64_to_user_ptr(args.pidfd),
-- 
2.1.4



Re: [PATCH] fork: fail on non-zero higher 32 bits of args.exit_signal

2019-09-10 Thread Eugene Syromiatnikov
On Tue, Sep 10, 2019 at 04:46:15PM +0200, Christian Brauner wrote:
> On Tue, Sep 10, 2019 at 04:39:44PM +0200, Oleg Nesterov wrote:
> > On 09/10, Christian Brauner wrote:
> > > On Tue, Sep 10, 2019 at 03:10:48PM +0200, Christian Brauner wrote:
> > > > On Tue, Sep 10, 2019 at 03:09:35PM +0200, Christian Brauner wrote:
> > > > > On Tue, Sep 10, 2019 at 02:44:41PM +0200, Oleg Nesterov wrote:
> > > > > > On 09/10, Eugene Syromiatnikov wrote:
> > > > > > >
> > > > > > > --- a/kernel/fork.c
> > > > > > > +++ b/kernel/fork.c
> > > > > > > @@ -2562,6 +2562,9 @@ noinline static int 
> > > > > > > copy_clone_args_from_user(struct kernel_clone_args *kargs,
> > > > > > >   if (copy_from_user(, uargs, size))
> > > > > > >   return -EFAULT;
> > > > > > >  
> > > > > > > + if (unlikely(((unsigned int)args.exit_signal) != 
> > > > > > > args.exit_signal))
> > > > > > > + return -EINVAL;
> > > > > > 
> > > > > > Hmm. Unless I am totally confused you found a serious bug...
> > > > > > 
> > > > > > Without CLONE_THREAD/CLONE_PARENT copy_process() blindly does
> > > > > > 
> > > > > > p->exit_signal = args->exit_signal;
> > > > > > 
> > > > > > the valid_signal(sig) check in do_notify_parent() mostly saves us, 
> > > > > > but we
> > > > > > must not allow child->exit_signal < 0, if nothing else this breaks
> > > > > > thread_group_leader().
> > > > > > 
> > > > > > And afaics this patch doesn't fix this? I think we need the 
> > > > > > valid_signal()
> > > > > > check...
> > > > > 
> > > > > Thanks for sending this patch so quickly after our conversation
> > > > > yesterday, Eugene!
> > > > > We definitely want valid_signal() to verify the signal is ok.
> > > 
> > > So we could do your check in copy_clone_args_from_user(), and then we do
> > > another valid_signal() check in clone3_args_valid()? We could do the
> > > latter in copy_clone_args_from_user() too but it's nicer to do it along
> > > the other checks in clone3_args_valid().
> > 
> > I am fine either way. Sure, we can add valid_signal() into 
> > clone3_args_valid(),
> > but then I'd ask to simplify the "overflow" check above. Something like
> > 
> > if (args.exit_signal > UINT_MAX)
> > return -EINVAL;
> > 
> > looks much more readable to me.
> > 
> > 
> > Or we can simply do
> > 
> > if (args.exit_signal & ~((u64)CSIGNAL))
> > return -EINVAL;
> > 
> > in copy_clone_args_from_user() and forget about all problems.
> 
> Both are fine with me. The latter might have the advantage that we catch
> both legacy clone and clone3. I think Eugene prefers this as well.

Unfortunately, it doesn't.  I think, the best place for the check is
either in _do_fork or copy_process itself; however, it's quite messy as
that way it's detached from the other checks, but, at the same time,
there are a lot of code paths (like the one in arch/x86/ia32/sys_ia32.c),
and it's kinda obscure that the caller of _do_fork has to check that
exit_syscall is positive itself.

> Christian


Re: [PATCH] fork: fail on non-zero higher 32 bits of args.exit_signal

2019-09-10 Thread Eugene Syromiatnikov
On Tue, Sep 10, 2019 at 03:27:02PM +0200, Christian Brauner wrote:
> On Tue, Sep 10, 2019 at 03:10:48PM +0200, Christian Brauner wrote:
> > On Tue, Sep 10, 2019 at 03:09:35PM +0200, Christian Brauner wrote:
> > > On Tue, Sep 10, 2019 at 02:44:41PM +0200, Oleg Nesterov wrote:
> > > > On 09/10, Eugene Syromiatnikov wrote:
> > > > >
> > > > > --- a/kernel/fork.c
> > > > > +++ b/kernel/fork.c
> > > > > @@ -2562,6 +2562,9 @@ noinline static int 
> > > > > copy_clone_args_from_user(struct kernel_clone_args *kargs,
> > > > >   if (copy_from_user(, uargs, size))
> > > > >   return -EFAULT;
> > > > >  
> > > > > + if (unlikely(((unsigned int)args.exit_signal) != 
> > > > > args.exit_signal))
> > > > > + return -EINVAL;
> > > > 
> > > > Hmm. Unless I am totally confused you found a serious bug...
> > > > 
> > > > Without CLONE_THREAD/CLONE_PARENT copy_process() blindly does
> > > > 
> > > > p->exit_signal = args->exit_signal;
> > > > 
> > > > the valid_signal(sig) check in do_notify_parent() mostly saves us, but 
> > > > we
> > > > must not allow child->exit_signal < 0, if nothing else this breaks
> > > > thread_group_leader().
> > > > 
> > > > And afaics this patch doesn't fix this? I think we need the 
> > > > valid_signal()
> > > > check...
> > > 
> > > Thanks for sending this patch so quickly after our conversation
> > > yesterday, Eugene!
> > > We definitely want valid_signal() to verify the signal is ok.
> 
> So we could do your check in copy_clone_args_from_user(), and then we do
> another valid_signal() check in clone3_args_valid()? We could do the
> latter in copy_clone_args_from_user() too but it's nicer to do it along
> the other checks in clone3_args_valid().

There's also a discrepancy between CSIGNAL (0xff) and _NSIG, used
in valid_signal (which is between 32 and 128, depending on architecture),
it seems it doesn't break thread_group_leader, but definitely allows
passing some invalid signal numbers via legacy clone-like syscalls—I'm
not sure if that's important.

> Christian


[PATCH 5/6] selftests/clone3: enable clone3 self-tests on all architectures

2019-09-10 Thread Eugene Syromiatnikov
clone3() is available on most architectures, so there's no reason to
restrict the respective self-tests to x86_64.

* tools/testing/selftests/clone3/Makefile (TEST_GEN_PROGS): Set always,
not only ifeq ($(ARCH),x86_64).

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/testing/selftests/clone3/Makefile 
b/tools/testing/selftests/clone3/Makefile
index 4efcf45..faa069c 100644
--- a/tools/testing/selftests/clone3/Makefile
+++ b/tools/testing/selftests/clone3/Makefile
@@ -4,8 +4,6 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
 
 CFLAGS += -I../../../../usr/include/
 
-ifeq ($(ARCH),x86_64)
-   TEST_GEN_PROGS := clone3 clone3_set_tid
-endif
+TEST_GEN_PROGS := clone3 clone3_set_tid
 
 include ../lib.mk
-- 
2.1.4



[PATCH 6/6] selftests: add clone3 to TARGETS

2019-09-10 Thread Eugene Syromiatnikov
* tools/testing/selftests/Makefile (TARGETS): Add clone3.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 25b43a8c..05163e4 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -4,6 +4,7 @@ TARGETS += bpf
 TARGETS += breakpoints
 TARGETS += capabilities
 TARGETS += cgroup
+TARGETS += clone3
 TARGETS += cpufreq
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
-- 
2.1.4



[PATCH 2/6] selftests/clone3: add a check for invalid exit_signal

2019-09-10 Thread Eugene Syromiatnikov
Check that the kernel fails calls with exit_signal with non-zero highest
32 bits.

* tools/testing/selftests/clone3/clone3.c (enum test_mode): Add
CLONE3_ARGS_BIG_EXIT_SIGNAL.
(call_clone3): Add args.exit_signal initialisation in case
test_mode == CLONE3_ARGS_BIG_EXIT_SIGNAL.
(main): Add test_clone3 clone check with
test_mode == CLONE3_ARGS_BIG_EXIT_SIGNAL.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 7b65ee5..4f23a0c 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -28,6 +28,7 @@ enum test_mode {
CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0,
CLONE3_ARGS_ALL_1,
+   CLONE3_ARGS_BIG_EXIT_SIGNAL,
 };
 
 static pid_t raw_clone(struct clone_args *args, size_t size)
@@ -72,6 +73,10 @@ static int call_clone3(int flags, size_t size, enum 
test_mode test_mode)
args.tls = 1;
args.set_tid = 1;
break;
+
+   case CLONE3_ARGS_BIG_EXIT_SIGNAL:
+   args.exit_signal = 0xbadc0ded;
+   break;
}
 
pid = raw_clone(, size);
@@ -146,6 +151,10 @@ int main(int argc, char *argv[])
/* Do a clone3() with all members set to 1 */
if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL, CLONE3_ARGS_ALL_1))
goto on_error;
+   /* Do a clone3() with exit_signal having highest 32 bits non-zero */
+   if (test_clone3(0, CLONE3_ARGS_SIZE_V0, -EINVAL,
+   CLONE3_ARGS_BIG_EXIT_SIGNAL))
+   goto on_error;
/*
 * Do a clone3() with sizeof(struct clone_args) + 8
 * and all members set to 0.
-- 
2.1.4



[PATCH 3/6] selftests/clone3: use uint64_t for flags parameter

2019-09-10 Thread Eugene Syromiatnikov
Flags parameter in both userspace and kernel clone args is 64-bit wide,
there's little reason to have it signed and 32-bit in tests.

* tools/testing/selftests/clone3/clone3.c: Include  and
.
(call_clone3): Change flags parameter type from int to uint64_t.
(test_clone3): Change flags parameter type from int to uint64_t; change
the format string that prints it accordingly.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 4f23a0c..1746a9b 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -4,8 +4,10 @@
 
 #define _GNU_SOURCE
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -36,7 +38,7 @@ static pid_t raw_clone(struct clone_args *args, size_t size)
return syscall(__NR_clone3, args, size);
 }
 
-static int call_clone3(int flags, size_t size, enum test_mode test_mode)
+static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
 {
struct clone_args args = {0};
pid_t ppid = -1;
@@ -102,12 +104,13 @@ static int call_clone3(int flags, size_t size, enum 
test_mode test_mode)
return 0;
 }
 
-static int test_clone3(int flags, size_t size, int expected,
+static int test_clone3(uint64_t flags, size_t size, int expected,
   enum test_mode test_mode)
 {
int ret;
 
-   ksft_print_msg("[%d] Trying clone3() with flags 0x%x (size %d)\n",
+   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %d)"
+   "\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
-- 
2.1.4



[PATCH 1/6] selftests/clone3: convert test modes into an enum

2019-09-10 Thread Eugene Syromiatnikov
* tools/testing/selftests/clone3/clone3.c (CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0, CLONE3_ARGS_ALL_1): Change into an enum.
(call_clone3): Change test_mode parameter type to enum test_mode;
use switch statement for actions that dependent on test_mode selection.
(test_clone3): Change test_mode parameter type to enum test_mode.

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index a0f1989..7b65ee5 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -24,16 +24,18 @@
 /* V1 includes set_tid */
 #define CLONE3_ARGS_SIZE_V1 72
 
-#define CLONE3_ARGS_NO_TEST 0
-#define CLONE3_ARGS_ALL_0 1
-#define CLONE3_ARGS_ALL_1 2
+enum test_mode {
+   CLONE3_ARGS_NO_TEST,
+   CLONE3_ARGS_ALL_0,
+   CLONE3_ARGS_ALL_1,
+};
 
 static pid_t raw_clone(struct clone_args *args, size_t size)
 {
return syscall(__NR_clone3, args, size);
 }
 
-static int call_clone3(int flags, size_t size, int test_mode)
+static int call_clone3(int flags, size_t size, enum test_mode test_mode)
 {
struct clone_args args = {0};
pid_t ppid = -1;
@@ -46,7 +48,8 @@ static int call_clone3(int flags, size_t size, int test_mode)
if (size == 0)
size = sizeof(struct clone_args);
 
-   if (test_mode == CLONE3_ARGS_ALL_0) {
+   switch (test_mode) {
+   case CLONE3_ARGS_ALL_0:
args.flags = 0;
args.pidfd = 0;
args.child_tid = 0;
@@ -56,7 +59,9 @@ static int call_clone3(int flags, size_t size, int test_mode)
args. stack_size = 0;
args.tls = 0;
args.set_tid = 0;
-   } else if (test_mode == CLONE3_ARGS_ALL_1) {
+   break;
+
+   case CLONE3_ARGS_ALL_1:
args.flags = 1;
args.pidfd = 1;
args.child_tid = 1;
@@ -66,6 +71,7 @@ static int call_clone3(int flags, size_t size, int test_mode)
args. stack_size = 1;
args.tls = 1;
args.set_tid = 1;
+   break;
}
 
pid = raw_clone(, size);
@@ -91,7 +97,8 @@ static int call_clone3(int flags, size_t size, int test_mode)
return 0;
 }
 
-static int test_clone3(int flags, size_t size, int expected, int test_mode)
+static int test_clone3(int flags, size_t size, int expected,
+  enum test_mode test_mode)
 {
int ret;
 
-- 
2.1.4



[PATCH 4/6] selftests/clone3: fix up format strings

2019-09-10 Thread Eugene Syromiatnikov
* tools/testing/selftests/clone3/clone3.c (test_clone3): Change format
qualifier for printing size field from %d to %zu; place colon right
after the word "says".

Signed-off-by: Eugene Syromiatnikov 
---
 tools/testing/selftests/clone3/clone3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/clone3/clone3.c 
b/tools/testing/selftests/clone3/clone3.c
index 1746a9b..7b8d927 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -109,11 +109,11 @@ static int test_clone3(uint64_t flags, size_t size, int 
expected,
 {
int ret;
 
-   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %d)"
+   ksft_print_msg("[%d] Trying clone3() with flags %#" PRIx64 " (size %zu)"
"\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
-   ksft_print_msg("[%d] clone3() with flags says :%d expected %d\n",
+   ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
getpid(), ret, expected);
if (ret != expected)
ksft_exit_fail_msg(
-- 
2.1.4



[PATCH 0/6] Update clone3 self-tests

2019-09-10 Thread Eugene Syromiatnikov
Hello.

This patch set updates clone3 selftest in several aspects:
 - adding a check for kernel not ignoring highest 32 bits of exit_signal
   field of clone3 syscall arguments structure;
 - adding clone3 to selftests targets;
 - enabling clone3 tests on all architectures;
 - minor cleanups of the clone3 test.

Applied on top of brauer/linux.git/for-next.

Eugene Syromiatnikov (6):
  selftests/clone3: convert test modes into an enum
  selftests/clone3: add a check for invalid exit_signal
  selftests/clone3: use uint64_t for flags parameter
  selftests/clone3: fix up format strings
  selftests/clone3: enable clone3 self-tests on all architectures
  selftests: add clone3 to TARGETS

 tools/testing/selftests/Makefile|  1 +
 tools/testing/selftests/clone3/Makefile |  4 +---
 tools/testing/selftests/clone3/clone3.c | 37 +
 3 files changed, 30 insertions(+), 12 deletions(-)

-- 
2.1.4



[PATCH] fork: fail on non-zero higher 32 bits of args.exit_signal

2019-09-10 Thread Eugene Syromiatnikov
Previously, higher 32 bits of exit_signal fields were lost when
copied to the kernel args structure (that uses int as a type for the
respective field).  Fail with EINVAL if these are set as it looks like
there's no sane reason to accept them.

* kernel/fork.c (copy_clone_args_from_user): Fail with -EINVAL if
args.exit_signal converted to unsigned int is not equal to the original
value.

Signed-off-by: Eugene Syromiatnikov 
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e..fcbc4d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2562,6 +2562,9 @@ noinline static int copy_clone_args_from_user(struct 
kernel_clone_args *kargs,
if (copy_from_user(, uargs, size))
return -EFAULT;
 
+   if (unlikely(((unsigned int)args.exit_signal) != args.exit_signal))
+   return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags  = args.flags,
.pidfd  = u64_to_user_ptr(args.pidfd),
-- 
2.1.4



Re: [PATCH v2 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-12-17 Thread Eugene Syromiatnikov
On Mon, Oct 29, 2018 at 09:00:45PM +0100, Eugene Syromiatnikov wrote:
> This is a small fix of a typo (or, more specifically, some remnant of
> the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
> which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
> with "H" is used in man page and everywhere else, it's probably worth
> to make the name used in the fcntl.h UAPI header in line with it.
> Second follow-up patch updates (the sole) usage site of this constant
> in kernel to use the new spelling.
> 
> The old name is retained as it is part of UAPI now.

Hello.

Are there any issues with this version of the series?

> Changes since v1:
>  * Changed format of the commit ID in the commit message of the first patch.
>  * Removed bogus Signed-off-by that snuck into the resend of the series.
> 
> Eugene Syromiatnikov (2):
>   fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
>   drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET
> 
>  drivers/md/raid5.c   | 4 ++--
>  fs/fcntl.c   | 2 +-
>  include/uapi/linux/fcntl.h   | 9 -
>  tools/include/uapi/linux/fcntl.h | 9 -
>  4 files changed, 19 insertions(+), 5 deletions(-)
> 
> -- 
> 2.1.4
> 


Re: [PATCH] execve.2: document an effect of BINPRM_BUF_SIZE increase to 256

2018-11-22 Thread Eugene Syromiatnikov
On Thu, Nov 22, 2018 at 04:59:24PM +0100, Michael Kerrisk (man-pages) wrote:
> Hi Eugene,
> 
> On Wed, 21 Nov 2018 at 17:07, Eugene Syromiatnikov  wrote:
> >
> > Increase of BINPRM_BUF_SIZE to 256 increases the limit on the possible
> > interpreter line length for scripts to 255.
> 
> I don't see this in the kernel source tree? Could you point me at the
> right source file / git commit?

Sorry, it's a corresponding man page update for a proposed kernel
change[1][2], and I failed to Cc: all the related parties properly.

[1] https://lkml.org/lkml/2018/11/12/2330
[2] https://lkml.org/lkml/2018/11/12/2328

> Thanks,
> 
> Michael
> 
> > Signed-off-by: Eugene Syromiatnikov 
> > ---
> >  man2/execve.2 | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/man2/execve.2 b/man2/execve.2
> > index d2bb861..f4e37d6 100644
> > --- a/man2/execve.2
> > +++ b/man2/execve.2
> > @@ -633,8 +633,9 @@ prototype:
> >  .in
> >  .\"
> >  .SS Interpreter scripts
> > -A maximum line length of 127 characters is allowed for the first line in
> > -an interpreter script.
> > +Before Linux 4.20, a maximum line length of 127 characters was allowed
> > +for the first line in an interpreter script.
> > +In Linux 4.20 onwards, this limit was increased up to 255 characters.
> >  .PP
> >  The semantics of the
> >  .I optional-arg
> > --
> > 2.1.4
> >
> 
> 
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/


Re: [PATCH] execve.2: document an effect of BINPRM_BUF_SIZE increase to 256

2018-11-22 Thread Eugene Syromiatnikov
On Thu, Nov 22, 2018 at 04:59:24PM +0100, Michael Kerrisk (man-pages) wrote:
> Hi Eugene,
> 
> On Wed, 21 Nov 2018 at 17:07, Eugene Syromiatnikov  wrote:
> >
> > Increase of BINPRM_BUF_SIZE to 256 increases the limit on the possible
> > interpreter line length for scripts to 255.
> 
> I don't see this in the kernel source tree? Could you point me at the
> right source file / git commit?

Sorry, it's a corresponding man page update for a proposed kernel
change[1][2], and I failed to Cc: all the related parties properly.

[1] https://lkml.org/lkml/2018/11/12/2330
[2] https://lkml.org/lkml/2018/11/12/2328

> Thanks,
> 
> Michael
> 
> > Signed-off-by: Eugene Syromiatnikov 
> > ---
> >  man2/execve.2 | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/man2/execve.2 b/man2/execve.2
> > index d2bb861..f4e37d6 100644
> > --- a/man2/execve.2
> > +++ b/man2/execve.2
> > @@ -633,8 +633,9 @@ prototype:
> >  .in
> >  .\"
> >  .SS Interpreter scripts
> > -A maximum line length of 127 characters is allowed for the first line in
> > -an interpreter script.
> > +Before Linux 4.20, a maximum line length of 127 characters was allowed
> > +for the first line in an interpreter script.
> > +In Linux 4.20 onwards, this limit was increased up to 255 characters.
> >  .PP
> >  The semantics of the
> >  .I optional-arg
> > --
> > 2.1.4
> >
> 
> 
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-19 Thread Eugene Syromiatnikov
On Mon, Nov 19, 2018 at 11:32:39AM +0100, Christian Brauner wrote:
> +/**
> + *  sys_procfd_signal - send a signal to a process through a process file
> + *  descriptor
> + *  @fd: the file descriptor of the process
> + *  @sig: signal to be sent
> + *  @info: the signal info
> + *  @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> + int, flags)

It could be considered better to use an unsigned type for the "flags"
argument.


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-19 Thread Eugene Syromiatnikov
On Mon, Nov 19, 2018 at 11:32:39AM +0100, Christian Brauner wrote:
> +/**
> + *  sys_procfd_signal - send a signal to a process through a process file
> + *  descriptor
> + *  @fd: the file descriptor of the process
> + *  @sig: signal to be sent
> + *  @info: the signal info
> + *  @flags: future flags to be passed
> + */
> +SYSCALL_DEFINE4(procfd_signal, int, fd, int, sig, siginfo_t __user *, info,
> + int, flags)

It could be considered better to use an unsigned type for the "flags"
argument.


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-19 Thread Eugene Syromiatnikov
On Mon, Nov 19, 2018 at 11:32:39AM +0100, Christian Brauner wrote:
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
> b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..e637eab883e9 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384  i386arch_prctl  sys_arch_prctl  
> __ia32_compat_sys_arch_prctl
>  385  i386io_pgetevents   sys_io_pgetevents   
> __ia32_compat_sys_io_pgetevents
>  386  i386rseqsys_rseq
> __ia32_sys_rseq
> +387  i386procfd_signal   sys_procfd_signal   
> __ia32_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..e95f6741ab42 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332  common  statx   __x64_sys_statx
>  333  common  io_pgetevents   __x64_sys_io_pgetevents
>  334  common  rseq__x64_sys_rseq
> +335  common  procfd_signal   __x64_sys_procfd_signal

You have wired up the syscall on x86 but have not added the syscall number
to the generic syscall header (include/uapi/asm-generic/unistd.h), why?


Re: [PATCH v1 2/2] signal: add procfd_signal() syscall

2018-11-19 Thread Eugene Syromiatnikov
On Mon, Nov 19, 2018 at 11:32:39AM +0100, Christian Brauner wrote:
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
> b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3cf7b533b3d1..e637eab883e9 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -398,3 +398,4 @@
>  384  i386arch_prctl  sys_arch_prctl  
> __ia32_compat_sys_arch_prctl
>  385  i386io_pgetevents   sys_io_pgetevents   
> __ia32_compat_sys_io_pgetevents
>  386  i386rseqsys_rseq
> __ia32_sys_rseq
> +387  i386procfd_signal   sys_procfd_signal   
> __ia32_sys_procfd_signal
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..e95f6741ab42 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
>  332  common  statx   __x64_sys_statx
>  333  common  io_pgetevents   __x64_sys_io_pgetevents
>  334  common  rseq__x64_sys_rseq
> +335  common  procfd_signal   __x64_sys_procfd_signal

You have wired up the syscall on x86 but have not added the syscall number
to the generic syscall header (include/uapi/asm-generic/unistd.h), why?


[PATCH v2 2/2] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2018-10-29 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..0bcfbd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1192,7 +1192,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH v2 2/2] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2018-10-29 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..0bcfbd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1192,7 +1192,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH v2 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-29 Thread Eugene Syromiatnikov
According to commit message in the original commit c75b1d9421f8 ("fs:
add fcntl() interface for setting/getting write life time hints"),
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in early versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0831851..9f7f57e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



[PATCH v2 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-29 Thread Eugene Syromiatnikov
According to commit message in the original commit c75b1d9421f8 ("fs:
add fcntl() interface for setting/getting write life time hints"),
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in early versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0831851..9f7f57e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



[PATCH v2 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-29 Thread Eugene Syromiatnikov
This is a small fix of a typo (or, more specifically, some remnant of
the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
with "H" is used in man page and everywhere else, it's probably worth
to make the name used in the fcntl.h UAPI header in line with it.
Second follow-up patch updates (the sole) usage site of this constant
in kernel to use the new spelling.

The old name is retained as it is part of UAPI now.

Changes since v1:
 * Changed format of the commit ID in the commit message of the first patch.
 * Removed bogus Signed-off-by that snuck into the resend of the series.

Eugene Syromiatnikov (2):
  fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
  drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

 drivers/md/raid5.c   | 4 ++--
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 4 files changed, 19 insertions(+), 5 deletions(-)

-- 
2.1.4



[PATCH v2 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-29 Thread Eugene Syromiatnikov
This is a small fix of a typo (or, more specifically, some remnant of
the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
with "H" is used in man page and everywhere else, it's probably worth
to make the name used in the fcntl.h UAPI header in line with it.
Second follow-up patch updates (the sole) usage site of this constant
in kernel to use the new spelling.

The old name is retained as it is part of UAPI now.

Changes since v1:
 * Changed format of the commit ID in the commit message of the first patch.
 * Removed bogus Signed-off-by that snuck into the resend of the series.

Eugene Syromiatnikov (2):
  fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
  drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

 drivers/md/raid5.c   | 4 ++--
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 4 files changed, 19 insertions(+), 5 deletions(-)

-- 
2.1.4



Re: [PATCH RESEND 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-29 Thread Eugene Syromiatnikov
On Fri, Oct 26, 2018 at 08:44:35AM -0600, Jens Axboe wrote:
> On 10/25/18 3:54 PM, Eugene Syromiatnikov wrote:
> > According to commit message in the original commit v4.13-rc1~212^2~51,
> > as well as userspace library[1] and man page update[2], R/W hint constants
> > are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
> > "RWF_*" prefix used in earlyy versions of the proposed patch set[3].
> > Rename it and provide the old name as a synonym for the new one
> > for backward compatibility.
> > 
> > [1] https://github.com/axboe/fio/commit/bd553af6c849
> > [2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
> > [3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html
> 
> Looks good, thanks for catching this. One note:
> 
> > Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write 
> > life time hints")
> > Signed-off-by: Eugene Syromiatnikov 
> > Signed-off-by: Eugene Syromyatnikov 
> 
> Why do you have two separate sign-offs?

Uh, sorry, I've messed up format-patch flags again.

> -- 
> Jens Axboe
> 


Re: [PATCH RESEND 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-29 Thread Eugene Syromiatnikov
On Fri, Oct 26, 2018 at 08:44:35AM -0600, Jens Axboe wrote:
> On 10/25/18 3:54 PM, Eugene Syromiatnikov wrote:
> > According to commit message in the original commit v4.13-rc1~212^2~51,
> > as well as userspace library[1] and man page update[2], R/W hint constants
> > are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
> > "RWF_*" prefix used in earlyy versions of the proposed patch set[3].
> > Rename it and provide the old name as a synonym for the new one
> > for backward compatibility.
> > 
> > [1] https://github.com/axboe/fio/commit/bd553af6c849
> > [2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
> > [3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html
> 
> Looks good, thanks for catching this. One note:
> 
> > Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write 
> > life time hints")
> > Signed-off-by: Eugene Syromiatnikov 
> > Signed-off-by: Eugene Syromyatnikov 
> 
> Why do you have two separate sign-offs?

Uh, sorry, I've messed up format-patch flags again.

> -- 
> Jens Axboe
> 


[PATCH RESEND 2/2] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2018-10-25 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
Signed-off-by: Eugene Syromyatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..0bcfbd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1192,7 +1192,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH RESEND 2/2] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2018-10-25 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
Signed-off-by: Eugene Syromyatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..0bcfbd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1192,7 +1192,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH RESEND 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-25 Thread Eugene Syromiatnikov
According to commit message in the original commit v4.13-rc1~212^2~51,
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in earlyy versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
Signed-off-by: Eugene Syromyatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0831851..9f7f57e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



[PATCH RESEND 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-25 Thread Eugene Syromiatnikov
According to commit message in the original commit v4.13-rc1~212^2~51,
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in earlyy versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
Signed-off-by: Eugene Syromyatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0831851..9f7f57e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



Re: [PATCH 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-25 Thread Eugene Syromiatnikov
On Thu, Oct 25, 2018 at 08:25:44AM -0600, Jens Axboe wrote:
> On 10/25/18 8:19 AM, Eugene Syromiatnikov wrote:
> > On Sat, Oct 06, 2018 at 07:51:14PM +0200, Eugene Syromiatnikov wrote:
> >> Hello.
> >>
> >> This is a small fix of a typo (or, more specifically, some remnant of
> >> the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
> >> which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
> >> with "H" is used in man page and everywhere else, it's probably worth
> >> to make the name used in the fcntl.h UAPI header in line with it.
> >> Second follow-up patch updates (the sole) usage site of this constant
> >> in kernel to use the new spelling.
> >>
> >> The old name is retained as it is part of UAPI now.
> > 
> > Hello, are there issues with the patches?
> 
> I only got the 0/2 patch, none of the two others..

Uh, sorry, I see them on lkml[1][2], but likely have messed up git
format-patch options, as I usually do. Will resend shortly.

[1] https://lkml.org/lkml/2018/10/6/292
[2] https://lkml.org/lkml/2018/10/6/291

> -- 
> Jens Axboe
> 


Re: [PATCH 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-25 Thread Eugene Syromiatnikov
On Thu, Oct 25, 2018 at 08:25:44AM -0600, Jens Axboe wrote:
> On 10/25/18 8:19 AM, Eugene Syromiatnikov wrote:
> > On Sat, Oct 06, 2018 at 07:51:14PM +0200, Eugene Syromiatnikov wrote:
> >> Hello.
> >>
> >> This is a small fix of a typo (or, more specifically, some remnant of
> >> the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
> >> which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
> >> with "H" is used in man page and everywhere else, it's probably worth
> >> to make the name used in the fcntl.h UAPI header in line with it.
> >> Second follow-up patch updates (the sole) usage site of this constant
> >> in kernel to use the new spelling.
> >>
> >> The old name is retained as it is part of UAPI now.
> > 
> > Hello, are there issues with the patches?
> 
> I only got the 0/2 patch, none of the two others..

Uh, sorry, I see them on lkml[1][2], but likely have messed up git
format-patch options, as I usually do. Will resend shortly.

[1] https://lkml.org/lkml/2018/10/6/292
[2] https://lkml.org/lkml/2018/10/6/291

> -- 
> Jens Axboe
> 


Re: [PATCH 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-25 Thread Eugene Syromiatnikov
On Sat, Oct 06, 2018 at 07:51:14PM +0200, Eugene Syromiatnikov wrote:
> Hello.
> 
> This is a small fix of a typo (or, more specifically, some remnant of
> the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
> which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
> with "H" is used in man page and everywhere else, it's probably worth
> to make the name used in the fcntl.h UAPI header in line with it.
> Second follow-up patch updates (the sole) usage site of this constant
> in kernel to use the new spelling.
> 
> The old name is retained as it is part of UAPI now.

Hello, are there issues with the patches?

> Eugene Syromiatnikov (2):
>   fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
>   drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET
> 
>  drivers/md/raid5.c   | 4 ++--
>  fs/fcntl.c   | 2 +-
>  include/uapi/linux/fcntl.h   | 9 -
>  tools/include/uapi/linux/fcntl.h | 9 -
>  4 files changed, 19 insertions(+), 5 deletions(-)
> 
> -- 
> 2.1.4
> 


Re: [PATCH 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-25 Thread Eugene Syromiatnikov
On Sat, Oct 06, 2018 at 07:51:14PM +0200, Eugene Syromiatnikov wrote:
> Hello.
> 
> This is a small fix of a typo (or, more specifically, some remnant of
> the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
> which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
> with "H" is used in man page and everywhere else, it's probably worth
> to make the name used in the fcntl.h UAPI header in line with it.
> Second follow-up patch updates (the sole) usage site of this constant
> in kernel to use the new spelling.
> 
> The old name is retained as it is part of UAPI now.

Hello, are there issues with the patches?

> Eugene Syromiatnikov (2):
>   fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
>   drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET
> 
>  drivers/md/raid5.c   | 4 ++--
>  fs/fcntl.c   | 2 +-
>  include/uapi/linux/fcntl.h   | 9 -
>  tools/include/uapi/linux/fcntl.h | 9 -
>  4 files changed, 19 insertions(+), 5 deletions(-)
> 
> -- 
> 2.1.4
> 


Re: [PATCH v3 7/7] ia64: wire up system calls

2018-10-11 Thread Eugene Syromiatnikov
On Thu, Oct 11, 2018 at 09:24:43AM +0200, Arnd Bergmann wrote:
> On Thu, Oct 11, 2018 at 6:26 AM Firoz Khan  wrote:
> > +# perf_event_open requires an architecture specific implementation
> > +326common  perf_event_open sys_perf_event_open
[...]
> 
> I don't think that's correct for these two. perf_event_open() of
> course requires 'perf' support that ia64 does not have

That's actuallt quite funny, given the fact that IA-64 has perfmonctl()
which likely was the precursor of current perf infractructure.

> > +# pkey_mprotect requires an architecture specific implementation
> > +328common  pkey_mprotect   sys_pkey_mprotect
> > +# pkey_alloc requires an architecture specific implementation
> > +329common  pkey_alloc  sys_pkey_alloc
> > +# pkey_free requires an architecture specific implementation
> > +330common  pkey_free   sys_pkey_free
> 
> One comment for all pkey calls would be sufficient. More importantly
> it requires hardware support that ia64 does not have AFAICT.

Except it has[1].

[1] 
https://www.thailand.intel.com/content/dam/www/public/us/en/documents/manuals/itanium-architecture-software-developer-rev-2-3-vol-2-manual.pdf#page=78


Re: [PATCH v3 7/7] ia64: wire up system calls

2018-10-11 Thread Eugene Syromiatnikov
On Thu, Oct 11, 2018 at 09:24:43AM +0200, Arnd Bergmann wrote:
> On Thu, Oct 11, 2018 at 6:26 AM Firoz Khan  wrote:
> > +# perf_event_open requires an architecture specific implementation
> > +326common  perf_event_open sys_perf_event_open
[...]
> 
> I don't think that's correct for these two. perf_event_open() of
> course requires 'perf' support that ia64 does not have

That's actuallt quite funny, given the fact that IA-64 has perfmonctl()
which likely was the precursor of current perf infractructure.

> > +# pkey_mprotect requires an architecture specific implementation
> > +328common  pkey_mprotect   sys_pkey_mprotect
> > +# pkey_alloc requires an architecture specific implementation
> > +329common  pkey_alloc  sys_pkey_alloc
> > +# pkey_free requires an architecture specific implementation
> > +330common  pkey_free   sys_pkey_free
> 
> One comment for all pkey calls would be sufficient. More importantly
> it requires hardware support that ia64 does not have AFAICT.

Except it has[1].

[1] 
https://www.thailand.intel.com/content/dam/www/public/us/en/documents/manuals/itanium-architecture-software-developer-rev-2-3-vol-2-manual.pdf#page=78


[RFC PATCH] uapi, posix-timers: provide clockid-related macros and functions to UAPI

2018-10-07 Thread Eugene Syromiatnikov
As of now, there is no interface exposed for converting pid/fd into
clockid and vice versa; linuxptp, for example, has been carrying these
definitions in missing.h header for quite some time[1].

[1] https://sourceforge.net/p/linuxptp/code/ci/af380e86/tree/missing.h

Signed-off-by: Eugene Syromiatnikov 
---
 include/linux/posix-timers.h | 47 +---
 include/uapi/linux/time.h| 47 
 2 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ee7e987e..eef1be8 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -17,52 +18,6 @@ struct cpu_timer_list {
int firing;
 };
 
-/*
- * Bit fields within a clockid:
- *
- * The most significant 29 bits hold either a pid or a file descriptor.
- *
- * Bit 2 indicates whether a cpu clock refers to a thread or a process.
- *
- * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
- *
- * A clockid is invalid if bits 2, 1, and 0 are all set.
- */
-#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
-#define CPUCLOCK_PERTHREAD(clock) \
-   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-
-#define CPUCLOCK_PERTHREAD_MASK4
-#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
-#define CPUCLOCK_CLOCK_MASK3
-#define CPUCLOCK_PROF  0
-#define CPUCLOCK_VIRT  1
-#define CPUCLOCK_SCHED 2
-#define CPUCLOCK_MAX   3
-#define CLOCKFDCPUCLOCK_MAX
-#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
-
-static inline clockid_t make_process_cpuclock(const unsigned int pid,
-   const clockid_t clock)
-{
-   return ((~pid) << 3) | clock;
-}
-static inline clockid_t make_thread_cpuclock(const unsigned int tid,
-   const clockid_t clock)
-{
-   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
-}
-
-static inline clockid_t fd_to_clockid(const int fd)
-{
-   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
-}
-
-static inline int clockid_to_fd(const clockid_t clk)
-{
-   return ~(clk >> 3);
-}
-
 #define REQUEUE_PENDING 1
 
 /**
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 6b56a22..0043110 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -97,4 +97,51 @@ struct __kernel_old_timeval {
  */
 #define TIMER_ABSTIME  0x01
 
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
+#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+
+#define CPUCLOCK_PERTHREAD_MASK4
+#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK3
+#define CPUCLOCK_PROF  0
+#define CPUCLOCK_VIRT  1
+#define CPUCLOCK_SCHED 2
+#define CPUCLOCK_MAX   3
+#define CLOCKFDCPUCLOCK_MAX
+#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
+
+static inline clockid_t make_process_cpuclock(const unsigned int pid,
+   const clockid_t clock)
+{
+   return ((~pid) << 3) | clock;
+}
+static inline clockid_t make_thread_cpuclock(const unsigned int tid,
+   const clockid_t clock)
+{
+   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
+}
+
+static inline clockid_t fd_to_clockid(const int fd)
+{
+   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
+}
+
+static inline int clockid_to_fd(const clockid_t clk)
+{
+   return ~(clk >> 3);
+}
+
+
 #endif /* _UAPI_LINUX_TIME_H */
-- 
2.1.4



[RFC PATCH] uapi, posix-timers: provide clockid-related macros and functions to UAPI

2018-10-07 Thread Eugene Syromiatnikov
As of now, there is no interface exposed for converting pid/fd into
clockid and vice versa; linuxptp, for example, has been carrying these
definitions in missing.h header for quite some time[1].

[1] https://sourceforge.net/p/linuxptp/code/ci/af380e86/tree/missing.h

Signed-off-by: Eugene Syromiatnikov 
---
 include/linux/posix-timers.h | 47 +---
 include/uapi/linux/time.h| 47 
 2 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ee7e987e..eef1be8 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -17,52 +18,6 @@ struct cpu_timer_list {
int firing;
 };
 
-/*
- * Bit fields within a clockid:
- *
- * The most significant 29 bits hold either a pid or a file descriptor.
- *
- * Bit 2 indicates whether a cpu clock refers to a thread or a process.
- *
- * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
- *
- * A clockid is invalid if bits 2, 1, and 0 are all set.
- */
-#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
-#define CPUCLOCK_PERTHREAD(clock) \
-   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
-
-#define CPUCLOCK_PERTHREAD_MASK4
-#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
-#define CPUCLOCK_CLOCK_MASK3
-#define CPUCLOCK_PROF  0
-#define CPUCLOCK_VIRT  1
-#define CPUCLOCK_SCHED 2
-#define CPUCLOCK_MAX   3
-#define CLOCKFDCPUCLOCK_MAX
-#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
-
-static inline clockid_t make_process_cpuclock(const unsigned int pid,
-   const clockid_t clock)
-{
-   return ((~pid) << 3) | clock;
-}
-static inline clockid_t make_thread_cpuclock(const unsigned int tid,
-   const clockid_t clock)
-{
-   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
-}
-
-static inline clockid_t fd_to_clockid(const int fd)
-{
-   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
-}
-
-static inline int clockid_to_fd(const clockid_t clk)
-{
-   return ~(clk >> 3);
-}
-
 #define REQUEUE_PENDING 1
 
 /**
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 6b56a22..0043110 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -97,4 +97,51 @@ struct __kernel_old_timeval {
  */
 #define TIMER_ABSTIME  0x01
 
+/*
+ * Bit fields within a clockid:
+ *
+ * The most significant 29 bits hold either a pid or a file descriptor.
+ *
+ * Bit 2 indicates whether a cpu clock refers to a thread or a process.
+ *
+ * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
+ *
+ * A clockid is invalid if bits 2, 1, and 0 are all set.
+ */
+#define CPUCLOCK_PID(clock)((pid_t) ~((clock) >> 3))
+#define CPUCLOCK_PERTHREAD(clock) \
+   (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0)
+
+#define CPUCLOCK_PERTHREAD_MASK4
+#define CPUCLOCK_WHICH(clock)  ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)
+#define CPUCLOCK_CLOCK_MASK3
+#define CPUCLOCK_PROF  0
+#define CPUCLOCK_VIRT  1
+#define CPUCLOCK_SCHED 2
+#define CPUCLOCK_MAX   3
+#define CLOCKFDCPUCLOCK_MAX
+#define CLOCKFD_MASK   (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK)
+
+static inline clockid_t make_process_cpuclock(const unsigned int pid,
+   const clockid_t clock)
+{
+   return ((~pid) << 3) | clock;
+}
+static inline clockid_t make_thread_cpuclock(const unsigned int tid,
+   const clockid_t clock)
+{
+   return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK);
+}
+
+static inline clockid_t fd_to_clockid(const int fd)
+{
+   return make_process_cpuclock((unsigned int) fd, CLOCKFD);
+}
+
+static inline int clockid_to_fd(const clockid_t clk)
+{
+   return ~(clk >> 3);
+}
+
+
 #endif /* _UAPI_LINUX_TIME_H */
-- 
2.1.4



Re: [PATCH V7 00/20] C-SKY(csky) Linux Kernel Port

2018-10-06 Thread Eugene Syromiatnikov
On Sun, Oct 07, 2018 at 12:48:15PM +0800, Guo Ren wrote:
> On Sat, Oct 06, 2018 at 10:06:49PM +0200, Eugene Syromiatnikov wrote:
> > I'm sorry for my ignorance, but I'm struggling to find ISA reference/manual,
> > architecture programming manual, and System V ABI definition;
> > may I ask to give links to them?
> Here is the link: https://github.com/c-sky/csky-doc

Thank you!


Re: [PATCH V7 00/20] C-SKY(csky) Linux Kernel Port

2018-10-06 Thread Eugene Syromiatnikov
On Sun, Oct 07, 2018 at 12:48:15PM +0800, Guo Ren wrote:
> On Sat, Oct 06, 2018 at 10:06:49PM +0200, Eugene Syromiatnikov wrote:
> > I'm sorry for my ignorance, but I'm struggling to find ISA reference/manual,
> > architecture programming manual, and System V ABI definition;
> > may I ask to give links to them?
> Here is the link: https://github.com/c-sky/csky-doc

Thank you!


Re: [PATCH V7 00/20] C-SKY(csky) Linux Kernel Port

2018-10-06 Thread Eugene Syromiatnikov
On Fri, Oct 05, 2018 at 01:41:42PM +0800, Guo Ren wrote:
> This is the 7th version patchset to add the Linux kernel port for
> C-SKY(csky) based on linux-4.19-rc3.
> 
> In this patchset some fixup patches are folded into original patch in
> order to make review clearly and reduce the patches' number for upstream
> patchset. The changelog is added in the every patch's commit-msg.
> 
> Here is the LTP test report for this patchset:
> (and add "V10 C-SKY(csky) Linux Kernel Driver" patchset)
> ---
> Total Tests: 1298
> Total Skipped Tests: 280
> Total Failures: 11
> Kernel Version: 4.19.0-rc3+
> Machine Architecture: csky
> Hostname: buildroot
> ---
> 
> This patchset adds architecture support to Linux for C-SKY's 32-bit embedded
> 
> There are two ABI versions with several CPU cores in this patchset:
>   ABIv1: ck610 (16-bit instruction, 32-bit data path, VIPT Cache ...)
>   ABIv2: ck807 ck810 ck860 (16/32-bit variable length instruction, PIPT Cache,
>SMP ...)
> More information: http://en.c-sky.com
> The development repo: https://gitlab.com/c-sky/csky-linux

I'm sorry for my ignorance, but I'm struggling to find ISA reference/manual,
architecture programming manual, and System V ABI definition;
may I ask to give links to them?


Re: [PATCH V7 00/20] C-SKY(csky) Linux Kernel Port

2018-10-06 Thread Eugene Syromiatnikov
On Fri, Oct 05, 2018 at 01:41:42PM +0800, Guo Ren wrote:
> This is the 7th version patchset to add the Linux kernel port for
> C-SKY(csky) based on linux-4.19-rc3.
> 
> In this patchset some fixup patches are folded into original patch in
> order to make review clearly and reduce the patches' number for upstream
> patchset. The changelog is added in the every patch's commit-msg.
> 
> Here is the LTP test report for this patchset:
> (and add "V10 C-SKY(csky) Linux Kernel Driver" patchset)
> ---
> Total Tests: 1298
> Total Skipped Tests: 280
> Total Failures: 11
> Kernel Version: 4.19.0-rc3+
> Machine Architecture: csky
> Hostname: buildroot
> ---
> 
> This patchset adds architecture support to Linux for C-SKY's 32-bit embedded
> 
> There are two ABI versions with several CPU cores in this patchset:
>   ABIv1: ck610 (16-bit instruction, 32-bit data path, VIPT Cache ...)
>   ABIv2: ck807 ck810 ck860 (16/32-bit variable length instruction, PIPT Cache,
>SMP ...)
> More information: http://en.c-sky.com
> The development repo: https://gitlab.com/c-sky/csky-linux

I'm sorry for my ignorance, but I'm struggling to find ISA reference/manual,
architecture programming manual, and System V ABI definition;
may I ask to give links to them?


[PATCH 2/2] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2018-10-06 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..0bcfbd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1192,7 +1192,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH 2/2] drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

2018-10-06 Thread Eugene Syromiatnikov
As it is consistent with prefixes of other write life time hints.

Signed-off-by: Eugene Syromiatnikov 
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f4..0bcfbd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1139,7 +1139,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
@@ -1192,7 +1192,7 @@ static void ops_run_io(struct stripe_head *sh, struct 
stripe_head_state *s)
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_write_hint = sh->dev[i].write_hint;
-   sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
+   sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
 * If this is discard request, set bi_vcnt 0. We don't
 * want to confuse SCSI because SCSI will replace 
payload
-- 
2.1.4



[PATCH 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-06 Thread Eugene Syromiatnikov
According to commit message in the original commit v4.13-rc1~212^2~51,
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in earlyy versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 4137d96..8f312fc 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



[PATCH 1/2] fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint

2018-10-06 Thread Eugene Syromiatnikov
According to commit message in the original commit v4.13-rc1~212^2~51,
as well as userspace library[1] and man page update[2], R/W hint constants
are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained
"RWF_*" prefix used in earlyy versions of the proposed patch set[3].
Rename it and provide the old name as a synonym for the new one
for backward compatibility.

[1] https://github.com/axboe/fio/commit/bd553af6c849
[2] https://github.com/mkerrisk/man-pages/commit/580082a186fd
[3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html

Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life 
time hints")
Signed-off-by: Eugene Syromiatnikov 
---
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 4137d96..8f312fc 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long 
arg)
 static bool rw_hint_valid(enum rw_hint hint)
 {
switch (hint) {
-   case RWF_WRITE_LIFE_NOT_SET:
+   case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 6448cdd..e8f878a 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -57,7 +57,7 @@
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
  * used to clear any hints previously set.
  */
-#define RWF_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NOT_SET 0
 #define RWH_WRITE_LIFE_NONE1
 #define RWH_WRITE_LIFE_SHORT   2
 #define RWH_WRITE_LIFE_MEDIUM  3
@@ -65,6 +65,13 @@
 #define RWH_WRITE_LIFE_EXTREME 5
 
 /*
+ * The originally introduced spelling remained from the first
+ * versions of the patch set that introduced the feature,
+ * v4.13-rc1~212^2~51.
+ */
+#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS  0x0001  /* File accessed */
-- 
2.1.4



[PATCH 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-06 Thread Eugene Syromiatnikov
Hello.

This is a small fix of a typo (or, more specifically, some remnant of
the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
with "H" is used in man page and everywhere else, it's probably worth
to make the name used in the fcntl.h UAPI header in line with it.
Second follow-up patch updates (the sole) usage site of this constant
in kernel to use the new spelling.

The old name is retained as it is part of UAPI now.

Eugene Syromiatnikov (2):
  fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
  drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

 drivers/md/raid5.c   | 4 ++--
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 4 files changed, 19 insertions(+), 5 deletions(-)

-- 
2.1.4



[PATCH 0/2] Fix typo in RWH_WRITE_LIFE_NOT_SET constant name

2018-10-06 Thread Eugene Syromiatnikov
Hello.

This is a small fix of a typo (or, more specifically, some remnant of
the old patch version spelling) in RWH_WRITE_LIFE_NOT_SET constant,
which is named as RWF_WRITE_LIFE_NOT_SET currently.  Since the name
with "H" is used in man page and everywhere else, it's probably worth
to make the name used in the fcntl.h UAPI header in line with it.
Second follow-up patch updates (the sole) usage site of this constant
in kernel to use the new spelling.

The old name is retained as it is part of UAPI now.

Eugene Syromiatnikov (2):
  fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint
  drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET

 drivers/md/raid5.c   | 4 ++--
 fs/fcntl.c   | 2 +-
 include/uapi/linux/fcntl.h   | 9 -
 tools/include/uapi/linux/fcntl.h | 9 -
 4 files changed, 19 insertions(+), 5 deletions(-)

-- 
2.1.4



Re: [PATCH resend] uapi/linux/keyctl.h: don't use C++ reserved keyword as a struct member name

2018-09-09 Thread Eugene Syromiatnikov
On Tue, Aug 28, 2018 at 04:34:04PM -0700, Randy Dunlap wrote:
> From: Randy Dunlap 
> 
> Since this header is in "include/uapi/linux/", apparently people
> want to use it in userspace programs -- even in C++ ones.
> However, the header uses a C++ reserved keyword ("private"),
> so change that to "dh_private" instead to allow the header file
> to be used in C++ userspace.

This change breaks all existing C programs that rely on 
uapi header in order to get struct keyctl_dh_params definition, however.

> 
> Fixes https://bugzilla.kernel.org/show_bug.cgi?id=191051
> Fixes: ddbb41148724 ("KEYS: Add KEYCTL_DH_COMPUTE command")
> 
> Signed-off-by: Randy Dunlap 
> Cc: David Howells 
> Cc: James Morris 
> Cc: "Serge E. Hallyn" 
> Cc: keyri...@vger.kernel.org
> Cc: linux-security-mod...@vger.kernel.org
> Cc: Mat Martineau 
> Cc: sta...@vger.kernel.org
> ---
>  include/uapi/linux/keyctl.h |2 +-
>  security/keys/dh.c  |2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> --- lnx-416.orig/include/uapi/linux/keyctl.h
> +++ lnx-416/include/uapi/linux/keyctl.h
> @@ -65,7 +65,7 @@
>  
>  /* keyctl structures */
>  struct keyctl_dh_params {
> - __s32 private;
> + __s32 dh_private;

>   __s32 prime;
>   __s32 base;
>  };
> --- lnx-416.orig/security/keys/dh.c
> +++ lnx-416/security/keys/dh.c
> @@ -307,7 +307,7 @@ long __keyctl_dh_compute(struct keyctl_d
>   }
>   dh_inputs.g_size = dlen;
>  
> - dlen = dh_data_from_key(pcopy.private, _inputs.key);
> + dlen = dh_data_from_key(pcopy.dh_private, _inputs.key);
>   if (dlen < 0) {
>   ret = dlen;
>   goto out2;
> 
> 


Re: [PATCH resend] uapi/linux/keyctl.h: don't use C++ reserved keyword as a struct member name

2018-09-09 Thread Eugene Syromiatnikov
On Tue, Aug 28, 2018 at 04:34:04PM -0700, Randy Dunlap wrote:
> From: Randy Dunlap 
> 
> Since this header is in "include/uapi/linux/", apparently people
> want to use it in userspace programs -- even in C++ ones.
> However, the header uses a C++ reserved keyword ("private"),
> so change that to "dh_private" instead to allow the header file
> to be used in C++ userspace.

This change breaks all existing C programs that rely on 
uapi header in order to get struct keyctl_dh_params definition, however.

> 
> Fixes https://bugzilla.kernel.org/show_bug.cgi?id=191051
> Fixes: ddbb41148724 ("KEYS: Add KEYCTL_DH_COMPUTE command")
> 
> Signed-off-by: Randy Dunlap 
> Cc: David Howells 
> Cc: James Morris 
> Cc: "Serge E. Hallyn" 
> Cc: keyri...@vger.kernel.org
> Cc: linux-security-mod...@vger.kernel.org
> Cc: Mat Martineau 
> Cc: sta...@vger.kernel.org
> ---
>  include/uapi/linux/keyctl.h |2 +-
>  security/keys/dh.c  |2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> --- lnx-416.orig/include/uapi/linux/keyctl.h
> +++ lnx-416/include/uapi/linux/keyctl.h
> @@ -65,7 +65,7 @@
>  
>  /* keyctl structures */
>  struct keyctl_dh_params {
> - __s32 private;
> + __s32 dh_private;

>   __s32 prime;
>   __s32 base;
>  };
> --- lnx-416.orig/security/keys/dh.c
> +++ lnx-416/security/keys/dh.c
> @@ -307,7 +307,7 @@ long __keyctl_dh_compute(struct keyctl_d
>   }
>   dh_inputs.g_size = dlen;
>  
> - dlen = dh_data_from_key(pcopy.private, _inputs.key);
> + dlen = dh_data_from_key(pcopy.dh_private, _inputs.key);
>   if (dlen < 0) {
>   ret = dlen;
>   goto out2;
> 
> 


[PATCH bpf 0/2] Use __aligned_u64 in UAPI fields

2018-05-27 Thread Eugene Syromiatnikov
Hello.

It was discovered during strace development that struct bpf_map_info and
struct bpf_prog_info now have different layouts of i386/compat and x86_64.
Since it's already broken and bpf syscall has no separate compat (as
far as I can see), and the affecting change was introduced recently (in
Linux 4.16), it's proposed to change the layout of these structures
on 32-bit architectures by using __aligned_u64.

In order to somewhat future-proof from this problem in future, an
approach similar to the one implemented in RDMA subsystem recently
is proposed: use __aligned_u64 consistently throughout the UAPI header.

Eugene Syromiatnikov (2):
  bpf: fix alignment of netns_dev/netns_ino fields in
bpf_{map,prog}_info
  bpf: enforce usage of __aligned_u64 in the UAPI header

 include/uapi/linux/bpf.h   | 30 +++---
 tools/include/uapi/linux/bpf.h | 30 +++---
 2 files changed, 30 insertions(+), 30 deletions(-)

-- 
2.1.4



[PATCH bpf 2/2] bpf: enforce usage of __aligned_u64 in the UAPI header

2018-05-27 Thread Eugene Syromiatnikov
Use __aligned_u64 instead of __u64 everywhere in the UAPI header,
similarly to v4.17-rc1~94^2~58^2 "RDMA: Change all uapi headers to use
__aligned_u64 instead of __u64".

This commit doesn't change structure layouts, but imposes additional
alignment requirements on struct bpf_stack_build_id, struct
bpf_sock_ops, struct bpf_perf_event_value, and struct
bpf_raw_tracepoint_args; of them only struct bpf_sock_ops and struct
bpf_perf_event_value have 64-bit fields that were present in a released
kernel without this additional alignment requirement (bytes_received
and bytes_acked were added to struct bpf_sock_ops in commit
v4.16-rc1~123^2~33^2~5^2~4, struct bpf_perf_event_value was added
in commit v4.15-rc1~84^2~532^2~3).

Signed-off-by: Eugene Syromiatnikov <e...@redhat.com>
---
 include/uapi/linux/bpf.h   | 22 +++---
 tools/include/uapi/linux/bpf.h | 22 +++---
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 903010a..174e99a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -259,8 +259,8 @@ struct bpf_stack_build_id {
__s32   status;
unsigned char   build_id[BPF_BUILD_ID_SIZE];
union {
-   __u64   offset;
-   __u64   ip;
+   __aligned_u64   offset;
+   __aligned_u64   ip;
};
 };
 
@@ -288,7 +288,7 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
-   __u64   flags;
+   __aligned_u64   flags;
};
 
struct { /* anonymous struct used by BPF_PROG_LOAD command */
@@ -360,7 +360,7 @@ union bpf_attr {
} query;
 
struct {
-   __u64 name;
+   __aligned_u64 name;
__u32 prog_fd;
} raw_tracepoint;
 } __attribute__((aligned(8)));
@@ -1011,7 +1011,7 @@ struct bpf_prog_info {
__u32 xlated_prog_len;
__aligned_u64 jited_prog_insns;
__aligned_u64 xlated_prog_insns;
-   __u64 load_time;/* ns since boottime */
+   __aligned_u64 load_time;/* ns since boottime */
__u32 created_by_uid;
__u32 nr_map_ids;
__aligned_u64 map_ids;
@@ -1101,8 +1101,8 @@ struct bpf_sock_ops {
__u32 lost_out;
__u32 sacked_out;
__u32 sk_txhash;
-   __u64 bytes_received;
-   __u64 bytes_acked;
+   __aligned_u64 bytes_received;
+   __aligned_u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
@@ -1189,9 +1189,9 @@ enum {
 #define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
 
 struct bpf_perf_event_value {
-   __u64 counter;
-   __u64 enabled;
-   __u64 running;
+   __aligned_u64 counter;
+   __aligned_u64 enabled;
+   __aligned_u64 running;
 };
 
 #define BPF_DEVCG_ACC_MKNOD(1ULL << 0)
@@ -1209,7 +1209,7 @@ struct bpf_cgroup_dev_ctx {
 };
 
 struct bpf_raw_tracepoint_args {
-   __u64 args[0];
+   __aligned_u64 args[0];
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 903010a..174e99a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -259,8 +259,8 @@ struct bpf_stack_build_id {
__s32   status;
unsigned char   build_id[BPF_BUILD_ID_SIZE];
union {
-   __u64   offset;
-   __u64   ip;
+   __aligned_u64   offset;
+   __aligned_u64   ip;
};
 };
 
@@ -288,7 +288,7 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
-   __u64   flags;
+   __aligned_u64   flags;
};
 
struct { /* anonymous struct used by BPF_PROG_LOAD command */
@@ -360,7 +360,7 @@ union bpf_attr {
} query;
 
struct {
-   __u64 name;
+   __aligned_u64 name;
__u32 prog_fd;
} raw_tracepoint;
 } __attribute__((aligned(8)));
@@ -1011,7 +1011,7 @@ struct bpf_prog_info {
__u32 xlated_prog_len;
__aligned_u64 jited_prog_insns;
__aligned_u64 xlated_prog_insns;
-   __u64 load_time;/* ns since boottime */
+   __aligned_u64 load_time;/* ns since boottime */
__u32 created_by_uid;
__u32 nr_map_ids;
__aligned_u64 map_ids;
@@ -1101,8 +1101,8 @@ struct bpf_sock_ops {
__u32 lost_out;
__u32 sacked_out;
__u32 sk_txhash;
-   __u64 bytes_received;
-   __u64 bytes_acked;
+   __aligned_u64 bytes_received;
+   __aligned_u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
@@ -1189,9 +1189,9 @@ enum {
 #define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
 
 struct bpf_perf_event_value {
-   __u64 counter;

[PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info

2018-05-27 Thread Eugene Syromiatnikov
Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
has broken compat, as offsets of these fields are different in 32-bit
and 64-bit ABIs.  One fix (other than implementing compat support in
syscall in order to handle this discrepancy) is to use __aligned_u64
instead of __u64 for these fields.

Reported-by: Dmitry V. Levin <l...@altlinux.org>
Fixes: 52775b33bb507 ("bpf: offload: report device information about
offloaded maps")
Fixes: 675fc275a3a2d ("bpf: offload: report device information for
offloaded programs")

Signed-off-by: Eugene Syromiatnikov <e...@redhat.com>
---
 include/uapi/linux/bpf.h   | 8 
 tools/include/uapi/linux/bpf.h | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..903010a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1017,8 +1017,8 @@ struct bpf_prog_info {
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -1030,8 +1030,8 @@ struct bpf_map_info {
__u32 map_flags;
char  name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c5ec897..903010a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1017,8 +1017,8 @@ struct bpf_prog_info {
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -1030,8 +1030,8 @@ struct bpf_map_info {
__u32 map_flags;
char  name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
-- 
2.1.4



[PATCH bpf 0/2] Use __aligned_u64 in UAPI fields

2018-05-27 Thread Eugene Syromiatnikov
Hello.

It was discovered during strace development that struct bpf_map_info and
struct bpf_prog_info now have different layouts of i386/compat and x86_64.
Since it's already broken and bpf syscall has no separate compat (as
far as I can see), and the affecting change was introduced recently (in
Linux 4.16), it's proposed to change the layout of these structures
on 32-bit architectures by using __aligned_u64.

In order to somewhat future-proof from this problem in future, an
approach similar to the one implemented in RDMA subsystem recently
is proposed: use __aligned_u64 consistently throughout the UAPI header.

Eugene Syromiatnikov (2):
  bpf: fix alignment of netns_dev/netns_ino fields in
bpf_{map,prog}_info
  bpf: enforce usage of __aligned_u64 in the UAPI header

 include/uapi/linux/bpf.h   | 30 +++---
 tools/include/uapi/linux/bpf.h | 30 +++---
 2 files changed, 30 insertions(+), 30 deletions(-)

-- 
2.1.4



[PATCH bpf 2/2] bpf: enforce usage of __aligned_u64 in the UAPI header

2018-05-27 Thread Eugene Syromiatnikov
Use __aligned_u64 instead of __u64 everywhere in the UAPI header,
similarly to v4.17-rc1~94^2~58^2 "RDMA: Change all uapi headers to use
__aligned_u64 instead of __u64".

This commit doesn't change structure layouts, but imposes additional
alignment requirements on struct bpf_stack_build_id, struct
bpf_sock_ops, struct bpf_perf_event_value, and struct
bpf_raw_tracepoint_args; of them only struct bpf_sock_ops and struct
bpf_perf_event_value have 64-bit fields that were present in a released
kernel without this additional alignment requirement (bytes_received
and bytes_acked were added to struct bpf_sock_ops in commit
v4.16-rc1~123^2~33^2~5^2~4, struct bpf_perf_event_value was added
in commit v4.15-rc1~84^2~532^2~3).

Signed-off-by: Eugene Syromiatnikov 
---
 include/uapi/linux/bpf.h   | 22 +++---
 tools/include/uapi/linux/bpf.h | 22 +++---
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 903010a..174e99a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -259,8 +259,8 @@ struct bpf_stack_build_id {
__s32   status;
unsigned char   build_id[BPF_BUILD_ID_SIZE];
union {
-   __u64   offset;
-   __u64   ip;
+   __aligned_u64   offset;
+   __aligned_u64   ip;
};
 };
 
@@ -288,7 +288,7 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
-   __u64   flags;
+   __aligned_u64   flags;
};
 
struct { /* anonymous struct used by BPF_PROG_LOAD command */
@@ -360,7 +360,7 @@ union bpf_attr {
} query;
 
struct {
-   __u64 name;
+   __aligned_u64 name;
__u32 prog_fd;
} raw_tracepoint;
 } __attribute__((aligned(8)));
@@ -1011,7 +1011,7 @@ struct bpf_prog_info {
__u32 xlated_prog_len;
__aligned_u64 jited_prog_insns;
__aligned_u64 xlated_prog_insns;
-   __u64 load_time;/* ns since boottime */
+   __aligned_u64 load_time;/* ns since boottime */
__u32 created_by_uid;
__u32 nr_map_ids;
__aligned_u64 map_ids;
@@ -1101,8 +1101,8 @@ struct bpf_sock_ops {
__u32 lost_out;
__u32 sacked_out;
__u32 sk_txhash;
-   __u64 bytes_received;
-   __u64 bytes_acked;
+   __aligned_u64 bytes_received;
+   __aligned_u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
@@ -1189,9 +1189,9 @@ enum {
 #define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
 
 struct bpf_perf_event_value {
-   __u64 counter;
-   __u64 enabled;
-   __u64 running;
+   __aligned_u64 counter;
+   __aligned_u64 enabled;
+   __aligned_u64 running;
 };
 
 #define BPF_DEVCG_ACC_MKNOD(1ULL << 0)
@@ -1209,7 +1209,7 @@ struct bpf_cgroup_dev_ctx {
 };
 
 struct bpf_raw_tracepoint_args {
-   __u64 args[0];
+   __aligned_u64 args[0];
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 903010a..174e99a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -259,8 +259,8 @@ struct bpf_stack_build_id {
__s32   status;
unsigned char   build_id[BPF_BUILD_ID_SIZE];
union {
-   __u64   offset;
-   __u64   ip;
+   __aligned_u64   offset;
+   __aligned_u64   ip;
};
 };
 
@@ -288,7 +288,7 @@ union bpf_attr {
__aligned_u64 value;
__aligned_u64 next_key;
};
-   __u64   flags;
+   __aligned_u64   flags;
};
 
struct { /* anonymous struct used by BPF_PROG_LOAD command */
@@ -360,7 +360,7 @@ union bpf_attr {
} query;
 
struct {
-   __u64 name;
+   __aligned_u64 name;
__u32 prog_fd;
} raw_tracepoint;
 } __attribute__((aligned(8)));
@@ -1011,7 +1011,7 @@ struct bpf_prog_info {
__u32 xlated_prog_len;
__aligned_u64 jited_prog_insns;
__aligned_u64 xlated_prog_insns;
-   __u64 load_time;/* ns since boottime */
+   __aligned_u64 load_time;/* ns since boottime */
__u32 created_by_uid;
__u32 nr_map_ids;
__aligned_u64 map_ids;
@@ -1101,8 +1101,8 @@ struct bpf_sock_ops {
__u32 lost_out;
__u32 sacked_out;
__u32 sk_txhash;
-   __u64 bytes_received;
-   __u64 bytes_acked;
+   __aligned_u64 bytes_received;
+   __aligned_u64 bytes_acked;
 };
 
 /* Definitions for bpf_sock_ops_cb_flags */
@@ -1189,9 +1189,9 @@ enum {
 #define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
 
 struct bpf_perf_event_value {
-   __u64 counter;
-   __u64 enabled;
-

[PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info

2018-05-27 Thread Eugene Syromiatnikov
Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
has broken compat, as offsets of these fields are different in 32-bit
and 64-bit ABIs.  One fix (other than implementing compat support in
syscall in order to handle this discrepancy) is to use __aligned_u64
instead of __u64 for these fields.

Reported-by: Dmitry V. Levin 
Fixes: 52775b33bb507 ("bpf: offload: report device information about
offloaded maps")
Fixes: 675fc275a3a2d ("bpf: offload: report device information for
offloaded programs")

Signed-off-by: Eugene Syromiatnikov 
---
 include/uapi/linux/bpf.h   | 8 
 tools/include/uapi/linux/bpf.h | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..903010a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1017,8 +1017,8 @@ struct bpf_prog_info {
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -1030,8 +1030,8 @@ struct bpf_map_info {
__u32 map_flags;
char  name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c5ec897..903010a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1017,8 +1017,8 @@ struct bpf_prog_info {
__aligned_u64 map_ids;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -1030,8 +1030,8 @@ struct bpf_map_info {
__u32 map_flags;
char  name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
-   __u64 netns_dev;
-   __u64 netns_ino;
+   __aligned_u64 netns_dev;
+   __aligned_u64 netns_ino;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
-- 
2.1.4



Re: [PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs

2018-05-25 Thread Eugene Syromiatnikov
On Thu, May 24, 2018 at 04:34:51PM -0700, Alexei Starovoitov wrote:
> On Thu, May 24, 2018 at 09:41:08AM +0200, Jesper Dangaard Brouer wrote:
> > On Wed, 23 May 2018 15:02:45 -0700
> > Alexei Starovoitov <alexei.starovoi...@gmail.com> wrote:
> > 
> > > On Wed, May 23, 2018 at 02:18:19PM +0200, Eugene Syromiatnikov wrote:
> > > > Some BPF sysctl knobs affect the loading of BPF programs, and during
> > > > system boot/init stages these sysctls are not yet configured.
> > > > A concrete example is systemd, that has implemented loading of BPF
> > > > programs.
> > > > 
> > > > Thus, to allow controlling these setting at early boot, this patch set
> > > > adds the ability to change the default setting of these sysctl knobs
> > > > as well as option to override them via a boot-time kernel parameter
> > > > (in order to avoid rebuilding kernel each time a need of changing these
> > > > defaults arises).
> > > > 
> > > > The sysctl knobs in question are kernel.unprivileged_bpf_disable,
> > > > net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.  
> > > 
> > > - systemd is root. today it only uses cgroup-bpf progs which require root,
> > >   so disabling unpriv during boot time makes no difference to systemd.
> > >   what is the actual reason to present time?
systemd also runs a lot of code, some of which is unprivileged.

> > > - say in the future systemd wants to use so_reuseport+bpf for faster
> > >   networking. With unpriv disable during boot, it will force systemd
> > >   to do such networking from root, which will lower its security barrier.
No, it will force systemd not to use SO_REUSEPORT BPF.

> > > - bpf_jit_kallsyms sysctl has immediate effect on loaded programs.
> > >   Flipping it during the boot or right after or any time after
> > >   is the same thing. Why add such boot flag then?
Well, that one was for completeness.

> > > - jit_harden can be turned on by systemd. so turning it during the boot
> > >   will make systemd progs to be constant blinded.
> > >   Constant blinding protects kernel from unprivileged JIT spraying.
> > >   Are you worried that systemd will attack the kernel with JIT spraying?
I'm worried that systemd can be exploited for a JIT spraying attack.

Another thing I'm concerned with is that the generated code is different,
which introduces additional complication during debugging.

> > I think you are missing that, we want the ability to change these
> > defaults in-order to avoid depending on /etc/sysctl.conf settings, and
> > that the these sysctl.conf setting happen too late.
> 
> What does it mean 'happens too late' ?
> Too late for what?
> sysctl.conf has plenty of system critical knobs like
> kernel.perf_event_paranoid, kernel.core_pattern, etc
> The behavior of the host is drastically different after sysctl config
> is applied.
> 
> > For example with jit_harden, there will be a difference between the
> > loaded BPF program that got loaded at boot-time with systemd (no
> > constant blinding) and when someone reloads that systemd service after
> > /etc/sysctl.conf have been evaluated and setting bpf_jit_harden (now
> > slower due to constant blinding).   This is inconsistent behavior.
> 
> net.core.bpf_jit_harden can be flipped back and forth at run-time,
> so bpf progs before and after will be either blinded or not.
> I don't see any inconsistency.

That can't be the reason to maintain that inconsistency.


Re: [PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs

2018-05-25 Thread Eugene Syromiatnikov
On Thu, May 24, 2018 at 04:34:51PM -0700, Alexei Starovoitov wrote:
> On Thu, May 24, 2018 at 09:41:08AM +0200, Jesper Dangaard Brouer wrote:
> > On Wed, 23 May 2018 15:02:45 -0700
> > Alexei Starovoitov  wrote:
> > 
> > > On Wed, May 23, 2018 at 02:18:19PM +0200, Eugene Syromiatnikov wrote:
> > > > Some BPF sysctl knobs affect the loading of BPF programs, and during
> > > > system boot/init stages these sysctls are not yet configured.
> > > > A concrete example is systemd, that has implemented loading of BPF
> > > > programs.
> > > > 
> > > > Thus, to allow controlling these setting at early boot, this patch set
> > > > adds the ability to change the default setting of these sysctl knobs
> > > > as well as option to override them via a boot-time kernel parameter
> > > > (in order to avoid rebuilding kernel each time a need of changing these
> > > > defaults arises).
> > > > 
> > > > The sysctl knobs in question are kernel.unprivileged_bpf_disable,
> > > > net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.  
> > > 
> > > - systemd is root. today it only uses cgroup-bpf progs which require root,
> > >   so disabling unpriv during boot time makes no difference to systemd.
> > >   what is the actual reason to present time?
systemd also runs a lot of code, some of which is unprivileged.

> > > - say in the future systemd wants to use so_reuseport+bpf for faster
> > >   networking. With unpriv disable during boot, it will force systemd
> > >   to do such networking from root, which will lower its security barrier.
No, it will force systemd not to use SO_REUSEPORT BPF.

> > > - bpf_jit_kallsyms sysctl has immediate effect on loaded programs.
> > >   Flipping it during the boot or right after or any time after
> > >   is the same thing. Why add such boot flag then?
Well, that one was for completeness.

> > > - jit_harden can be turned on by systemd. so turning it during the boot
> > >   will make systemd progs to be constant blinded.
> > >   Constant blinding protects kernel from unprivileged JIT spraying.
> > >   Are you worried that systemd will attack the kernel with JIT spraying?
I'm worried that systemd can be exploited for a JIT spraying attack.

Another thing I'm concerned with is that the generated code is different,
which introduces additional complication during debugging.

> > I think you are missing that, we want the ability to change these
> > defaults in-order to avoid depending on /etc/sysctl.conf settings, and
> > that the these sysctl.conf setting happen too late.
> 
> What does it mean 'happens too late' ?
> Too late for what?
> sysctl.conf has plenty of system critical knobs like
> kernel.perf_event_paranoid, kernel.core_pattern, etc
> The behavior of the host is drastically different after sysctl config
> is applied.
> 
> > For example with jit_harden, there will be a difference between the
> > loaded BPF program that got loaded at boot-time with systemd (no
> > constant blinding) and when someone reloads that systemd service after
> > /etc/sysctl.conf have been evaluated and setting bpf_jit_harden (now
> > slower due to constant blinding).   This is inconsistent behavior.
> 
> net.core.bpf_jit_harden can be flipped back and forth at run-time,
> so bpf progs before and after will be either blinded or not.
> I don't see any inconsistency.

That can't be the reason to maintain that inconsistency.


[tip:perf/core] perf/core: Wire up compat PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_MODIFY_ATTRIBUTES

2018-05-25 Thread tip-bot for Eugene Syromiatnikov
Commit-ID:  82489c5fe5f99ca95f708fecae9f2c8aa99398bb
Gitweb: https://git.kernel.org/tip/82489c5fe5f99ca95f708fecae9f2c8aa99398bb
Author: Eugene Syromiatnikov <e...@redhat.com>
AuthorDate: Mon, 21 May 2018 14:34:20 +0200
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Fri, 25 May 2018 08:11:11 +0200

perf/core: Wire up compat PERF_EVENT_IOC_QUERY_BPF, 
PERF_EVENT_IOC_MODIFY_ATTRIBUTES

Since pointer size is different in compat, and switching in _perf_ioctl
is done using exact ioctl numbers, all new ioctl numbers that use pointer
should be added to perf_compat_ioctl for _IOC_SIZE fixup before passing
to perf_ioctl routine (this shouldn't be needed if semantics of the size
argument of _IO* macros was honored).

Signed-off-by: Eugene Syromiatnikov <e...@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <a...@kernel.org>
Cc: Arnaldo Carvalho de Melo <a...@redhat.com>
Cc: Jiri Olsa <jo...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Stephane Eranian <eran...@google.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Vince Weaver <vincent.wea...@maine.edu>
Link: http://lkml.kernel.org/r/20180521123420.ga24...@asgard.redhat.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/events/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 24dea13a27ed..08f5e1b42b43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5120,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned 
int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+   case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
+   case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;


[tip:perf/core] perf/core: Wire up compat PERF_EVENT_IOC_QUERY_BPF, PERF_EVENT_IOC_MODIFY_ATTRIBUTES

2018-05-25 Thread tip-bot for Eugene Syromiatnikov
Commit-ID:  82489c5fe5f99ca95f708fecae9f2c8aa99398bb
Gitweb: https://git.kernel.org/tip/82489c5fe5f99ca95f708fecae9f2c8aa99398bb
Author: Eugene Syromiatnikov 
AuthorDate: Mon, 21 May 2018 14:34:20 +0200
Committer:  Ingo Molnar 
CommitDate: Fri, 25 May 2018 08:11:11 +0200

perf/core: Wire up compat PERF_EVENT_IOC_QUERY_BPF, 
PERF_EVENT_IOC_MODIFY_ATTRIBUTES

Since pointer size is different in compat, and switching in _perf_ioctl
is done using exact ioctl numbers, all new ioctl numbers that use pointer
should be added to perf_compat_ioctl for _IOC_SIZE fixup before passing
to perf_ioctl routine (this shouldn't be needed if semantics of the size
argument of _IO* macros was honored).

Signed-off-by: Eugene Syromiatnikov 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Namhyung Kim 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Link: http://lkml.kernel.org/r/20180521123420.ga24...@asgard.redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/events/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 24dea13a27ed..08f5e1b42b43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5120,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned 
int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+   case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
+   case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;


[PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs

2018-05-23 Thread Eugene Syromiatnikov
Some BPF sysctl knobs affect the loading of BPF programs, and during
system boot/init stages these sysctls are not yet configured.
A concrete example is systemd, that has implemented loading of BPF
programs.

Thus, to allow controlling these setting at early boot, this patch set
adds the ability to change the default setting of these sysctl knobs
as well as option to override them via a boot-time kernel parameter
(in order to avoid rebuilding kernel each time a need of changing these
defaults arises).

The sysctl knobs in question are kernel.unprivileged_bpf_disable,
net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.

Eugene Syromiatnikov (3):
  bpf: add ability to configure unprivileged BPF via boot-time parameter
  bpf: add ability to configure BPF JIT hardening via boot-time
parameter
  bpf: add ability to configure BPF JIT kallsyms export at the boot time

 Documentation/admin-guide/kernel-parameters.txt | 28 
 init/Kconfig| 90 +
 kernel/bpf/core.c   | 31 +
 kernel/bpf/syscall.c| 16 +
 4 files changed, 165 insertions(+)

-- 
2.1.4



[PATCH bpf-next v2 0/3] bpf: add boot parameters for sysctl knobs

2018-05-23 Thread Eugene Syromiatnikov
Some BPF sysctl knobs affect the loading of BPF programs, and during
system boot/init stages these sysctls are not yet configured.
A concrete example is systemd, that has implemented loading of BPF
programs.

Thus, to allow controlling these setting at early boot, this patch set
adds the ability to change the default setting of these sysctl knobs
as well as option to override them via a boot-time kernel parameter
(in order to avoid rebuilding kernel each time a need of changing these
defaults arises).

The sysctl knobs in question are kernel.unprivileged_bpf_disable,
net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.

Eugene Syromiatnikov (3):
  bpf: add ability to configure unprivileged BPF via boot-time parameter
  bpf: add ability to configure BPF JIT hardening via boot-time
parameter
  bpf: add ability to configure BPF JIT kallsyms export at the boot time

 Documentation/admin-guide/kernel-parameters.txt | 28 
 init/Kconfig| 90 +
 kernel/bpf/core.c   | 31 +
 kernel/bpf/syscall.c| 16 +
 4 files changed, 165 insertions(+)

-- 
2.1.4



[PATCH bpf-next v2 1/3] bpf: add ability to configure unprivileged BPF via boot-time parameter

2018-05-23 Thread Eugene Syromiatnikov
This patch introduces two configuration options,
UNPRIVILEGED_BPF_BOOTPARAM and UNPRIVILEGED_BPF_BOOTPARAM_VALUE, that
allow configuring the initial value of kernel.unprivileged_bpf_disabled
sysctl knob, which is useful for the cases when disabling unprivileged
bpf() access during the early boot is desirable.

Signed-off-by: Eugene Syromiatnikov <e...@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  8 +++
 init/Kconfig| 31 +
 kernel/bpf/syscall.c| 16 +
 3 files changed, 55 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 11fc28e..aa8e831 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4355,6 +4355,14 @@
unknown_nmi_panic
[X86] Cause panic on unknown NMI.
 
+   unprivileged_bpf_disabled=
+   Format: { "0" | "1" }
+   Sets initial value of kernel.unprivileged_bpf_disabled
+   sysctl knob.
+   0 - unprivileged bpf() syscall access enabled.
+   1 - unprivileged bpf() syscall access disabled.
+   Default value is set via kernel config option.
+
usbcore.authorized_default=
[USB] Default USB device authorization:
(default -1 = authorized except for wireless USB,
diff --git a/init/Kconfig b/init/Kconfig
index 480a4f2..1403a3e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1404,6 +1404,37 @@ config BPF_JIT_ALWAYS_ON
  Enables BPF JIT and removes BPF interpreter to avoid
  speculative execution of BPF instructions by the interpreter
 
+config UNPRIVILEGED_BPF_BOOTPARAM
+   bool "Unprivileged bpf() boot parameter"
+   depends on BPF_SYSCALL
+   default n
+   help
+ This option adds a kernel parameter 'unprivileged_bpf_disabled'
+ that allows configuring default state of the
+ kernel.unprivileged_bpf_disabled sysctl knob.
+ If this option is selected, unprivileged access to the bpf() syscall
+ can be disabled with unprivileged_bpf_disabled=1 on the kernel command
+ line.  The purpose of this option is to allow disabling unprivileged
+ bpf() syscall access during the early boot.
+
+ If you are unsure how to answer this question, answer N.
+
+config UNPRIVILEGED_BPF_BOOTPARAM_VALUE
+   int "Unprivileged bpf() boot parameter default value"
+   depends on UNPRIVILEGED_BPF_BOOTPARAM
+   range 0 1
+   default 0
+   help
+ This option sets the default value for the kernel parameter
+ 'unprivileged_bpf_disabled', which allows disabling unprivileged bpf()
+ syscall access at boot.  If this option is set to 0 (zero), the
+ unprivileged bpf() boot kernel parameter will default to 0, allowing
+ unprivileged bpf() syscall access at bootup.  If this option is
+ set to 1 (one), the unprivileged bpf() kernel parameter will default
+ to 1, disabling unprivileged bpf() syscall access at bootup.
+
+ If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde94..fdc5fd9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -45,7 +46,22 @@ static DEFINE_SPINLOCK(prog_idr_lock);
 static DEFINE_IDR(map_idr);
 static DEFINE_SPINLOCK(map_idr_lock);
 
+#ifdef CONFIG_UNPRIVILEGED_BPF_BOOTPARAM
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+   CONFIG_UNPRIVILEGED_BPF_BOOTPARAM_VALUE;
+
+static int __init unprivileged_bpf_setup(char *str)
+{
+   unsigned long disabled;
+
+   if (!kstrtoul(str, 0, ))
+   sysctl_unprivileged_bpf_disabled = !!disabled;
+   return 1;
+}
+__setup("unprivileged_bpf_disabled=", unprivileged_bpf_setup);
+#else /* !CONFIG_UNPRIVILEGED_BPF_BOOTPARAM */
 int sysctl_unprivileged_bpf_disabled __read_mostly;
+#endif /* CONFIG_UNPRIVILEGED_BPF_BOOTPARAM */
 
 static const struct bpf_map_ops * const bpf_map_types[] = {
 #define BPF_PROG_TYPE(_id, _ops)
-- 
2.1.4



[PATCH bpf-next v2 1/3] bpf: add ability to configure unprivileged BPF via boot-time parameter

2018-05-23 Thread Eugene Syromiatnikov
This patch introduces two configuration options,
UNPRIVILEGED_BPF_BOOTPARAM and UNPRIVILEGED_BPF_BOOTPARAM_VALUE, that
allow configuring the initial value of kernel.unprivileged_bpf_disabled
sysctl knob, which is useful for the cases when disabling unprivileged
bpf() access during the early boot is desirable.

Signed-off-by: Eugene Syromiatnikov 
---
 Documentation/admin-guide/kernel-parameters.txt |  8 +++
 init/Kconfig| 31 +
 kernel/bpf/syscall.c| 16 +
 3 files changed, 55 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 11fc28e..aa8e831 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4355,6 +4355,14 @@
unknown_nmi_panic
[X86] Cause panic on unknown NMI.
 
+   unprivileged_bpf_disabled=
+   Format: { "0" | "1" }
+   Sets initial value of kernel.unprivileged_bpf_disabled
+   sysctl knob.
+   0 - unprivileged bpf() syscall access enabled.
+   1 - unprivileged bpf() syscall access disabled.
+   Default value is set via kernel config option.
+
usbcore.authorized_default=
[USB] Default USB device authorization:
(default -1 = authorized except for wireless USB,
diff --git a/init/Kconfig b/init/Kconfig
index 480a4f2..1403a3e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1404,6 +1404,37 @@ config BPF_JIT_ALWAYS_ON
  Enables BPF JIT and removes BPF interpreter to avoid
  speculative execution of BPF instructions by the interpreter
 
+config UNPRIVILEGED_BPF_BOOTPARAM
+   bool "Unprivileged bpf() boot parameter"
+   depends on BPF_SYSCALL
+   default n
+   help
+ This option adds a kernel parameter 'unprivileged_bpf_disabled'
+ that allows configuring default state of the
+ kernel.unprivileged_bpf_disabled sysctl knob.
+ If this option is selected, unprivileged access to the bpf() syscall
+ can be disabled with unprivileged_bpf_disabled=1 on the kernel command
+ line.  The purpose of this option is to allow disabling unprivileged
+ bpf() syscall access during the early boot.
+
+ If you are unsure how to answer this question, answer N.
+
+config UNPRIVILEGED_BPF_BOOTPARAM_VALUE
+   int "Unprivileged bpf() boot parameter default value"
+   depends on UNPRIVILEGED_BPF_BOOTPARAM
+   range 0 1
+   default 0
+   help
+ This option sets the default value for the kernel parameter
+ 'unprivileged_bpf_disabled', which allows disabling unprivileged bpf()
+ syscall access at boot.  If this option is set to 0 (zero), the
+ unprivileged bpf() boot kernel parameter will default to 0, allowing
+ unprivileged bpf() syscall access at bootup.  If this option is
+ set to 1 (one), the unprivileged bpf() kernel parameter will default
+ to 1, disabling unprivileged bpf() syscall access at bootup.
+
+ If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde94..fdc5fd9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -45,7 +46,22 @@ static DEFINE_SPINLOCK(prog_idr_lock);
 static DEFINE_IDR(map_idr);
 static DEFINE_SPINLOCK(map_idr_lock);
 
+#ifdef CONFIG_UNPRIVILEGED_BPF_BOOTPARAM
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+   CONFIG_UNPRIVILEGED_BPF_BOOTPARAM_VALUE;
+
+static int __init unprivileged_bpf_setup(char *str)
+{
+   unsigned long disabled;
+
+   if (!kstrtoul(str, 0, ))
+   sysctl_unprivileged_bpf_disabled = !!disabled;
+   return 1;
+}
+__setup("unprivileged_bpf_disabled=", unprivileged_bpf_setup);
+#else /* !CONFIG_UNPRIVILEGED_BPF_BOOTPARAM */
 int sysctl_unprivileged_bpf_disabled __read_mostly;
+#endif /* CONFIG_UNPRIVILEGED_BPF_BOOTPARAM */
 
 static const struct bpf_map_ops * const bpf_map_types[] = {
 #define BPF_PROG_TYPE(_id, _ops)
-- 
2.1.4



[PATCH bpf-next v2 3/3] bpf: add ability to configure BPF JIT kallsyms export at the boot time

2018-05-23 Thread Eugene Syromiatnikov
This patch introduces two configuration options,
BPF_JIT_KALLSYMS_BOOTPARAM and BPF_JIT_KALLSYMS_BOOTPARAM_VALUE, that
allow configuring the initial value of net.core.bpf_jit_kallsyms sysctl
knob. This enables export of addresses of JIT'ed BPF programs that
created during the early boot.

Signed-off-by: Eugene Syromiatnikov <e...@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +
 init/Kconfig| 30 +
 kernel/bpf/core.c   | 14 
 3 files changed, 54 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 5adc6d0..10e7502 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -452,6 +452,16 @@
2 - JIT hardening is enabled for all users.
Default value is set via kernel config option.
 
+   bpf_jit_kallsyms=
+   Format: { "0" | "1" }
+   Sets initial value of net.core.bpf_jit_kallsyms
+   sysctl knob.
+   0 - Addresses of JIT'ed BPF programs are not exported
+   to kallsyms.
+   1 - Export of addresses of JIT'ed BPF programs is
+   enabled for privileged users.
+   Default value is set via kernel config option.
+
bttv.card=  [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
kernel args too.
diff --git a/init/Kconfig b/init/Kconfig
index b661497..b5405ca 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1464,6 +1464,36 @@ config BPF_JIT_HARDEN_BOOTPARAM_VALUE
 
  If you are unsure how to answer this question, answer 0.
 
+config BPF_JIT_KALLSYMS_BOOTPARAM
+   bool "BPF JIT kallsyms export boot parameter"
+   default n
+   help
+ This option adds a kernel parameter 'bpf_jit_kallsyms' that allows
+ configuring default state of the net.core.bpf_jit_kallsyms sysctl
+ knob.  If this option is selected, the default value of the
+ net.core.bpf_jit_kallsyms sysctl knob can be set on the kernel command
+ line.  The purpose of this option is to allow enabling BPF JIT
+ kallsyms export for the BPF programs created during the early boot,
+ so they can be traced later.
+
+ If you are unsure how to answer this question, answer N.
+
+config BPF_JIT_KALLSYMS_BOOTPARAM_VALUE
+   int "BPF JIT kallsyms export boot parameter default value"
+   depends on BPF_JIT_HARDEN_BOOTPARAM
+   range 0 1
+   default 0
+   help
+ This option sets the default value for the kernel parameter
+ 'bpf_jit_kallsyms' that configures default value of the
+ net.core.bpf_jit_kallsyms sysctl knob at boot.  If this option is set
+ to 0 (zero), the net.core.bpf_jit_kallsyms will default to 0, which
+ will lead to disabling of exporting of addresses of JIT'ed BPF
+ programs.  If this option is set to 1 (one), addresses of privileged
+ BPF programs are exported to kallsyms.
+
+ If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9edb7a8..003d708 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -321,7 +321,21 @@ __setup("bpf_jit_harden=", bpf_jit_harden_setup);
 int bpf_jit_harden   __read_mostly;
 #endif /* CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
 
+#ifdef CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM
+int bpf_jit_kallsyms __read_mostly = CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM_VALUE;
+
+static int __init bpf_jit_kallsyms_setup(char *str)
+{
+   unsigned long enabled;
+
+   if (!kstrtoul(str, 0, ))
+   bpf_jit_kallsyms = !!enabled;
+   return 1;
+}
+__setup("bpf_jit_kallsyms=", bpf_jit_kallsyms_setup);
+#else /* !CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM */
 int bpf_jit_kallsyms __read_mostly;
+#endif /* CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM */
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
-- 
2.1.4



[PATCH bpf-next v2 3/3] bpf: add ability to configure BPF JIT kallsyms export at the boot time

2018-05-23 Thread Eugene Syromiatnikov
This patch introduces two configuration options,
BPF_JIT_KALLSYMS_BOOTPARAM and BPF_JIT_KALLSYMS_BOOTPARAM_VALUE, that
allow configuring the initial value of net.core.bpf_jit_kallsyms sysctl
knob. This enables export of addresses of JIT'ed BPF programs that
created during the early boot.

Signed-off-by: Eugene Syromiatnikov 
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +
 init/Kconfig| 30 +
 kernel/bpf/core.c   | 14 
 3 files changed, 54 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 5adc6d0..10e7502 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -452,6 +452,16 @@
2 - JIT hardening is enabled for all users.
Default value is set via kernel config option.
 
+   bpf_jit_kallsyms=
+   Format: { "0" | "1" }
+   Sets initial value of net.core.bpf_jit_kallsyms
+   sysctl knob.
+   0 - Addresses of JIT'ed BPF programs are not exported
+   to kallsyms.
+   1 - Export of addresses of JIT'ed BPF programs is
+   enabled for privileged users.
+   Default value is set via kernel config option.
+
bttv.card=  [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
kernel args too.
diff --git a/init/Kconfig b/init/Kconfig
index b661497..b5405ca 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1464,6 +1464,36 @@ config BPF_JIT_HARDEN_BOOTPARAM_VALUE
 
  If you are unsure how to answer this question, answer 0.
 
+config BPF_JIT_KALLSYMS_BOOTPARAM
+   bool "BPF JIT kallsyms export boot parameter"
+   default n
+   help
+ This option adds a kernel parameter 'bpf_jit_kallsyms' that allows
+ configuring default state of the net.core.bpf_jit_kallsyms sysctl
+ knob.  If this option is selected, the default value of the
+ net.core.bpf_jit_kallsyms sysctl knob can be set on the kernel command
+ line.  The purpose of this option is to allow enabling BPF JIT
+ kallsyms export for the BPF programs created during the early boot,
+ so they can be traced later.
+
+ If you are unsure how to answer this question, answer N.
+
+config BPF_JIT_KALLSYMS_BOOTPARAM_VALUE
+   int "BPF JIT kallsyms export boot parameter default value"
+   depends on BPF_JIT_HARDEN_BOOTPARAM
+   range 0 1
+   default 0
+   help
+ This option sets the default value for the kernel parameter
+ 'bpf_jit_kallsyms' that configures default value of the
+ net.core.bpf_jit_kallsyms sysctl knob at boot.  If this option is set
+ to 0 (zero), the net.core.bpf_jit_kallsyms will default to 0, which
+ will lead to disabling of exporting of addresses of JIT'ed BPF
+ programs.  If this option is set to 1 (one), addresses of privileged
+ BPF programs are exported to kallsyms.
+
+ If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9edb7a8..003d708 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -321,7 +321,21 @@ __setup("bpf_jit_harden=", bpf_jit_harden_setup);
 int bpf_jit_harden   __read_mostly;
 #endif /* CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
 
+#ifdef CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM
+int bpf_jit_kallsyms __read_mostly = CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM_VALUE;
+
+static int __init bpf_jit_kallsyms_setup(char *str)
+{
+   unsigned long enabled;
+
+   if (!kstrtoul(str, 0, ))
+   bpf_jit_kallsyms = !!enabled;
+   return 1;
+}
+__setup("bpf_jit_kallsyms=", bpf_jit_kallsyms_setup);
+#else /* !CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM */
 int bpf_jit_kallsyms __read_mostly;
+#endif /* CONFIG_BPF_JIT_KALLSYMS_BOOTPARAM */
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
-- 
2.1.4



[PATCH bpf-next v2 2/3] bpf: add ability to configure BPF JIT hardening via boot-time parameter

2018-05-23 Thread Eugene Syromiatnikov
This patch introduces two configuration options,
BPF_JIT_HARDEN_BOOTPARAM and BPF_JIT_HARDEN_BOOTPARAM_VALUE, that allow
configuring the initial value of net.core.bpf_jit_harden sysctl knob,
which is useful for enforcing JIT hardening during the early boot.

Signed-off-by: Eugene Syromiatnikov <e...@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +
 init/Kconfig| 29 +
 kernel/bpf/core.c   | 17 +++
 3 files changed, 56 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index aa8e831..5adc6d0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -442,6 +442,16 @@
bert_disable[ACPI]
Disable BERT OS support on buggy BIOSes.
 
+   bpf_jit_harden=
+   Format: { "0" | "1" | "2" }
+   Sets initial value of net.core.bpf_jit_harden
+   sysctl knob.
+   0 - JIT hardening is disabled.
+   1 - JIT hardening is enabled for unprivileged users
+   only.
+   2 - JIT hardening is enabled for all users.
+   Default value is set via kernel config option.
+
bttv.card=  [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
kernel args too.
diff --git a/init/Kconfig b/init/Kconfig
index 1403a3e..b661497 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1435,6 +1435,35 @@ config UNPRIVILEGED_BPF_BOOTPARAM_VALUE
 
  If you are unsure how to answer this question, answer 0.
 
+config BPF_JIT_HARDEN_BOOTPARAM
+   bool "BPF JIT harden boot parameter"
+   default n
+   help
+ This option adds a kernel parameter 'bpf_jit_harden' that allows
+ configuring default state of the net.core.bpf_jit_harden sysctl knob.
+ If this option is selected, the default value of the
+ net.core.bpf_jit_harden sysctl knob can be set on the kernel command
+ line.  The purpose of this option is to allow enabling BPF JIT
+ hardening for the BPF programs created during the early boot.
+
+ If you are unsure how to answer this question, answer N.
+
+config BPF_JIT_HARDEN_BOOTPARAM_VALUE
+   int "BPF JIT harden boot parameter default value"
+   depends on BPF_JIT_HARDEN_BOOTPARAM
+   range 0 2
+   default 0
+   help
+ This option sets the default value for the kernel parameter
+ 'bpf_jit_enabled' that configures default value of the
+ net.core.bpf_jit_harden sysctl knob at boot.  If this option is set to
+ 0 (zero), the net.core.bpf_jit_harden will default to 0, which will
+ lead to no hardening at bootup.  If this option is set to 1 (one),
+ hardening will be applied only to unprivileged users only.  If this
+ option is set to 2 (two), JIT hardening will be enabled for all users.
+
+ If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2194c6a..9edb7a8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -303,7 +304,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog 
*prog, u32 off,
 #ifdef CONFIG_BPF_JIT
 /* All BPF JIT sysctl knobs here. */
 int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+
+#ifdef CONFIG_BPF_JIT_HARDEN_BOOTPARAM
+int bpf_jit_harden   __read_mostly = CONFIG_BPF_JIT_HARDEN_BOOTPARAM_VALUE;
+
+static int __init bpf_jit_harden_setup(char *str)
+{
+   unsigned long value;
+
+   if (!kstrtoul(str, 0, ))
+   bpf_jit_harden = min(value, 2UL);
+   return 1;
+}
+__setup("bpf_jit_harden=", bpf_jit_harden_setup);
+#else /* !CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
 int bpf_jit_harden   __read_mostly;
+#endif /* CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
+
 int bpf_jit_kallsyms __read_mostly;
 
 static __always_inline void
-- 
2.1.4



[PATCH bpf-next v2 2/3] bpf: add ability to configure BPF JIT hardening via boot-time parameter

2018-05-23 Thread Eugene Syromiatnikov
This patch introduces two configuration options,
BPF_JIT_HARDEN_BOOTPARAM and BPF_JIT_HARDEN_BOOTPARAM_VALUE, that allow
configuring the initial value of net.core.bpf_jit_harden sysctl knob,
which is useful for enforcing JIT hardening during the early boot.

Signed-off-by: Eugene Syromiatnikov 
---
 Documentation/admin-guide/kernel-parameters.txt | 10 +
 init/Kconfig| 29 +
 kernel/bpf/core.c   | 17 +++
 3 files changed, 56 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index aa8e831..5adc6d0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -442,6 +442,16 @@
bert_disable[ACPI]
Disable BERT OS support on buggy BIOSes.
 
+   bpf_jit_harden=
+   Format: { "0" | "1" | "2" }
+   Sets initial value of net.core.bpf_jit_harden
+   sysctl knob.
+   0 - JIT hardening is disabled.
+   1 - JIT hardening is enabled for unprivileged users
+   only.
+   2 - JIT hardening is enabled for all users.
+   Default value is set via kernel config option.
+
bttv.card=  [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
kernel args too.
diff --git a/init/Kconfig b/init/Kconfig
index 1403a3e..b661497 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1435,6 +1435,35 @@ config UNPRIVILEGED_BPF_BOOTPARAM_VALUE
 
  If you are unsure how to answer this question, answer 0.
 
+config BPF_JIT_HARDEN_BOOTPARAM
+   bool "BPF JIT harden boot parameter"
+   default n
+   help
+ This option adds a kernel parameter 'bpf_jit_harden' that allows
+ configuring default state of the net.core.bpf_jit_harden sysctl knob.
+ If this option is selected, the default value of the
+ net.core.bpf_jit_harden sysctl knob can be set on the kernel command
+ line.  The purpose of this option is to allow enabling BPF JIT
+ hardening for the BPF programs created during the early boot.
+
+ If you are unsure how to answer this question, answer N.
+
+config BPF_JIT_HARDEN_BOOTPARAM_VALUE
+   int "BPF JIT harden boot parameter default value"
+   depends on BPF_JIT_HARDEN_BOOTPARAM
+   range 0 2
+   default 0
+   help
+ This option sets the default value for the kernel parameter
+ 'bpf_jit_enabled' that configures default value of the
+ net.core.bpf_jit_harden sysctl knob at boot.  If this option is set to
+ 0 (zero), the net.core.bpf_jit_harden will default to 0, which will
+ lead to no hardening at bootup.  If this option is set to 1 (one),
+ hardening will be applied only to unprivileged users only.  If this
+ option is set to 2 (two), JIT hardening will be enabled for all users.
+
+ If you are unsure how to answer this question, answer 0.
+
 config USERFAULTFD
bool "Enable userfaultfd() system call"
select ANON_INODES
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2194c6a..9edb7a8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -303,7 +304,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog 
*prog, u32 off,
 #ifdef CONFIG_BPF_JIT
 /* All BPF JIT sysctl knobs here. */
 int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+
+#ifdef CONFIG_BPF_JIT_HARDEN_BOOTPARAM
+int bpf_jit_harden   __read_mostly = CONFIG_BPF_JIT_HARDEN_BOOTPARAM_VALUE;
+
+static int __init bpf_jit_harden_setup(char *str)
+{
+   unsigned long value;
+
+   if (!kstrtoul(str, 0, ))
+   bpf_jit_harden = min(value, 2UL);
+   return 1;
+}
+__setup("bpf_jit_harden=", bpf_jit_harden_setup);
+#else /* !CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
 int bpf_jit_harden   __read_mostly;
+#endif /* CONFIG_BPF_JIT_HARDEN_BOOTPARAM */
+
 int bpf_jit_kallsyms __read_mostly;
 
 static __always_inline void
-- 
2.1.4



Re: [PATCH 0/3] bpf: add boot parameters for sysctl knobs

2018-05-23 Thread Eugene Syromiatnikov
On Mon, May 21, 2018 at 11:58:13AM -0700, Alexei Starovoitov wrote:
> On Mon, May 21, 2018 at 02:29:30PM +0200, Eugene Syromiatnikov wrote:
> > Hello.
> > 
> > This patch set adds ability to set default values for
> > kernel.unprivileged_bpf_disable, net.core.bpf_jit_harden,
> > net.core.bpf_jit_kallsyms sysctl knobs as well as option to override
> > them via a boot-time kernel parameter.
> 
> Commits log not only should explain 'what' is being done by the patch,
> but 'why' as well.

Some BPF sysctl knobs affect the loading of BPF programs, and during
system boot/init stages these sysctls are not yet configured. A
concrete example is systemd, that has implemented loading of BPF
programs.

Thus, to allow controlling these setting at early boot, this patch set
adds the ability to change the default setting of these sysctl knobs
as well as option to override them via a boot-time kernel parameter
(in order to avoid rebuilding kernel each time a need of changing these
defaults arises).

The sysctl knobs in question are kernel.unprivileged_bpf_disable,
net.core.bpf_jit_harden, and net.core.bpf_jit_kallsyms.


  1   2   >