[PATCH v2 2/2] iio: trigger: Fix strange (ladder-type) indentation

2021-04-02 Thread Andy Shevchenko
In some cases indentation looks a bit weird with starting from = sign
and being in a ladder-type style. Unify it across the module.

While at it, add blank line after definition block where it needed,

Signed-off-by: Andy Shevchenko 
---
v2: fixed typo in the commit message (tupe->type)
 drivers/iio/industrialio-trigger.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/iio/industrialio-trigger.c 
b/drivers/iio/industrialio-trigger.c
index 78e30f0f915c..ec72ff04b38d 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -211,6 +211,7 @@ EXPORT_SYMBOL(iio_trigger_notify_done);
 static int iio_trigger_get_irq(struct iio_trigger *trig)
 {
int ret;
+
mutex_lock(>pool_lock);
ret = bitmap_find_free_region(trig->pool,
  CONFIG_IIO_CONSUMERS_PER_TRIGGER,
@@ -239,9 +240,9 @@ static void iio_trigger_put_irq(struct iio_trigger *trig, 
int irq)
 int iio_trigger_attach_poll_func(struct iio_trigger *trig,
 struct iio_poll_func *pf)
 {
+   bool notinuse =
+   bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER);
int ret = 0;
-   bool notinuse
-   = bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER);
 
/* Prevent the module from being removed whilst attached to a trigger */
__module_get(pf->indio_dev->driver_module);
@@ -290,11 +291,10 @@ int iio_trigger_attach_poll_func(struct iio_trigger *trig,
 int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 struct iio_poll_func *pf)
 {
+   bool no_other_users =
+   bitmap_weight(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER) == 
1;
int ret = 0;
-   bool no_other_users
-   = (bitmap_weight(trig->pool,
-CONFIG_IIO_CONSUMERS_PER_TRIGGER)
-  == 1);
+
if (trig->ops && trig->ops->set_trigger_state && no_other_users) {
ret = trig->ops->set_trigger_state(trig, false);
if (ret)
@@ -312,6 +312,7 @@ int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 irqreturn_t iio_pollfunc_store_time(int irq, void *p)
 {
struct iio_poll_func *pf = p;
+
pf->timestamp = iio_get_time_ns(pf->indio_dev);
return IRQ_WAKE_THREAD;
 }
@@ -498,18 +499,16 @@ static const struct device_type iio_trig_type = {
 static void iio_trig_subirqmask(struct irq_data *d)
 {
struct irq_chip *chip = irq_data_get_irq_chip(d);
-   struct iio_trigger *trig
-   = container_of(chip,
-  struct iio_trigger, subirq_chip);
+   struct iio_trigger *trig = container_of(chip, struct iio_trigger, 
subirq_chip);
+
trig->subirqs[d->irq - trig->subirq_base].enabled = false;
 }
 
 static void iio_trig_subirqunmask(struct irq_data *d)
 {
struct irq_chip *chip = irq_data_get_irq_chip(d);
-   struct iio_trigger *trig
-   = container_of(chip,
-  struct iio_trigger, subirq_chip);
+   struct iio_trigger *trig = container_of(chip, struct iio_trigger, 
subirq_chip);
+
trig->subirqs[d->irq - trig->subirq_base].enabled = true;
 }
 
@@ -695,7 +694,7 @@ EXPORT_SYMBOL(iio_trigger_using_own);
  * device, -EINVAL otherwise.
  */
 int iio_trigger_validate_own_device(struct iio_trigger *trig,
-   struct iio_dev *indio_dev)
+   struct iio_dev *indio_dev)
 {
if (indio_dev->dev.parent != trig->dev.parent)
return -EINVAL;
-- 
2.30.2



[PATCH v1 2/2] iio: trigger: Fix strange (ladder-tupe) indentation

2021-04-01 Thread Andy Shevchenko
In some cases indentation looks a bit weird with starting from = sign
and being in a ladder-type style. Unify it across the module.

While at it, add blank line after definition block where it needed,

Signed-off-by: Andy Shevchenko 
---
 drivers/iio/industrialio-trigger.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/iio/industrialio-trigger.c 
b/drivers/iio/industrialio-trigger.c
index 77fca24147b2..f998900a34f5 100644
--- a/drivers/iio/industrialio-trigger.c
+++ b/drivers/iio/industrialio-trigger.c
@@ -211,6 +211,7 @@ EXPORT_SYMBOL(iio_trigger_notify_done);
 static int iio_trigger_get_irq(struct iio_trigger *trig)
 {
int ret;
+
mutex_lock(>pool_lock);
ret = bitmap_find_free_region(trig->pool,
  CONFIG_IIO_CONSUMERS_PER_TRIGGER,
@@ -239,9 +240,9 @@ static void iio_trigger_put_irq(struct iio_trigger *trig, 
int irq)
 int iio_trigger_attach_poll_func(struct iio_trigger *trig,
 struct iio_poll_func *pf)
 {
+   bool notinuse =
+   bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER);
int ret = 0;
-   bool notinuse
-   = bitmap_empty(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER);
 
/* Prevent the module from being removed whilst attached to a trigger */
__module_get(pf->indio_dev->driver_module);
@@ -290,11 +291,10 @@ int iio_trigger_attach_poll_func(struct iio_trigger *trig,
 int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 struct iio_poll_func *pf)
 {
+   bool no_other_users =
+   bitmap_weight(trig->pool, CONFIG_IIO_CONSUMERS_PER_TRIGGER) == 
1;
int ret = 0;
-   bool no_other_users
-   = (bitmap_weight(trig->pool,
-CONFIG_IIO_CONSUMERS_PER_TRIGGER)
-  == 1);
+
if (trig->ops && trig->ops->set_trigger_state && no_other_users) {
ret = trig->ops->set_trigger_state(trig, false);
if (ret)
@@ -312,6 +312,7 @@ int iio_trigger_detach_poll_func(struct iio_trigger *trig,
 irqreturn_t iio_pollfunc_store_time(int irq, void *p)
 {
struct iio_poll_func *pf = p;
+
pf->timestamp = iio_get_time_ns(pf->indio_dev);
return IRQ_WAKE_THREAD;
 }
@@ -498,18 +499,16 @@ static const struct device_type iio_trig_type = {
 static void iio_trig_subirqmask(struct irq_data *d)
 {
struct irq_chip *chip = irq_data_get_irq_chip(d);
-   struct iio_trigger *trig
-   = container_of(chip,
-  struct iio_trigger, subirq_chip);
+   struct iio_trigger *trig = container_of(chip, struct iio_trigger, 
subirq_chip);
+
trig->subirqs[d->irq - trig->subirq_base].enabled = false;
 }
 
 static void iio_trig_subirqunmask(struct irq_data *d)
 {
struct irq_chip *chip = irq_data_get_irq_chip(d);
-   struct iio_trigger *trig
-   = container_of(chip,
-  struct iio_trigger, subirq_chip);
+   struct iio_trigger *trig = container_of(chip, struct iio_trigger, 
subirq_chip);
+
trig->subirqs[d->irq - trig->subirq_base].enabled = true;
 }
 
@@ -695,7 +694,7 @@ EXPORT_SYMBOL(iio_trigger_using_own);
  * device, -EINVAL otherwise.
  */
 int iio_trigger_validate_own_device(struct iio_trigger *trig,
-   struct iio_dev *indio_dev)
+   struct iio_dev *indio_dev)
 {
if (indio_dev->dev.parent != trig->dev.parent)
return -EINVAL;
-- 
2.30.2



BUG: broken overlay causes very strange kernel crash

2021-02-12 Thread Enrico Weigelt, metux IT consult

Hi folks,


while playing around with overlays, I've encountered a funny crash,
that even seems to affect the filesystem. No idea what really happens,
as oftree code detected the broken phandle.

What I did:

* i've written a driver that loads a builtin oftree overlay and tries
  to apply it.
* as its running on x86 (acpi-based), I'm also creating a of_root node
  and add some properties to it. (yes, calling of_node_init())
* using of_overlay_fdt_apply(), which seemed to work, but still trying
  to find out how to make it add new top-level nodes ...
* if the call fails, the driver does nothing (except printing the err)
* when adding a fragment with target <0> the crash happens

The crash *much* later than loading the overlay, NULL pointer deref in
ext2_error(). Since I can't see any relation between oftree and ext2,
this smells that oftree code is overwriting some unrelated memory.

Maybe something's creating broken pointers and then writing there ?

Obviously my driver code shit, but those strange things happending
smells like some weird is going on deep inside the oftree code, that
maybe *could* provide an attack surface.


Does anyone have an idea what's going here ?


thx
--mtx


[0.629870] OF: overlay: find target, node: /fragment@0, phandle 0x0 
not found

[0.631603] OF: overlay: init_overlay_changeset() failed, ret = -22
[0.633131] ofboard: ret=-22 ovcs_id=0
[0.634039] ofboard: dumping all nodes ...
[0.634932] ofboard: ==> of node:
[0.635579] ofboard:  --> property: model
[0.636333] ofboard:  --> property: compatible
[0.637202] ofboard: ret=-22 ovcs_id=0
[0.637919] ofboard: ofdrv done
[0.638529] IPI shorthand broadcast: enabled
[0.640553] VFS: Mounted root (ext2 filesystem) readonly on device 254:0.
[0.642639] Freeing unused kernel image (initmem) memory: 700K
[0.649080] Write protecting the kernel read-only data: 10240k
[0.651415] Freeing unused kernel image (text/rodata gap) memory: 2044K
[0.653478] Freeing unused kernel image (rodata/data gap) memory: 1124K
[0.655178] Run /sbin/init as init process
[0.665905] BUG: kernel NULL pointer dereference, address: 
003a

[0.667634] #PF: supervisor write access in kernel mode
[0.668919] #PF: error_code(0x0002) - not-present page
[0.669011] PGD 0 P4D 0
[0.669011] Oops: 0002 [#1] PREEMPT SMP PTI
[0.669011] CPU: 0 PID: 25 Comm: rcS Not tainted 
5.11.0-rc7-00105-g4fb1c4f664da-dirty #247
[0.669011] Hardware name: PC engines Standard PC (i440FX + PIIX, 
1996)/APU3, BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 044

[0.669011] RIP: 0010:ext2_error+0x6d/0x90
[0.669011] Code: 30 31 c0 f6 47 50 01 0f 85 04 32 15 00 4d 8d bc 24 
80 01 00 00 4c 89 ff e8 f0 a4 16 00 4c 89 ff 66 41 83 8c 24 9f

[0.669011] RSP: 0018:c90d7aa8 EFLAGS: 00010206
[0.669011] RAX:  RBX: 888000256000 RCX: 
0077
[0.669011] RDX: 0001 RSI: 81895e52 RDI: 
88800025e380
[0.669011] RBP: c90d7b38 R08: 88800048da78 R09: 
8880019f8ff4
[0.669011] R10:  R11: 8f9a8d98 R12: 
88800025e200
[0.669011] R13:  R14: 81895e52 R15: 
88800025e380
[0.669011] FS:  7f500a373740() GS:888007a0() 
knlGS:

[0.669011] CS:  0010 DS:  ES:  CR0: 80050033
[0.669011] CR2: 003a CR3: 009cc000 CR4: 
06b0

[0.669011] Call Trace:
[0.669011]  ? kmem_cache_alloc+0x1a/0x150
[0.669011]  ext2_get_inode+0x5e/0x130
[0.669011]  ? iget_locked+0x1e3/0x1f0
[0.669011]  ext2_iget+0x81/0x420
[0.669011]  ext2_lookup+0x79/0xb0
[0.669011]  __lookup_slow+0x79/0x130
[0.669011]  walk_component+0x139/0x1b0
[0.669011]  ? path_init+0x2bd/0x3e0
[0.669011]  path_lookupat+0x6d/0x1b0
[0.669011]  filename_lookup+0xa5/0x170
[0.669011]  ? strncpy_from_user+0x53/0x140
[0.669011]  user_path_at_empty+0x35/0x40
[0.669011]  vfs_statx+0x6e/0x110
[0.669011]  ? handle_mm_fault+0x11ee/0x1280
[0.669011]  __do_sys_newstat+0x3e/0x70
[0.669011]  ? irqentry_exit+0x3c/0x60
[0.669011]  ? exc_page_fault+0x22c/0x380
[0.669011]  ? __do_sys_rt_sigreturn+0xc5/0xe0
[0.669011]  __x64_sys_newstat+0x11/0x20
[0.669011]  do_syscall_64+0x32/0x50
[0.669011]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[0.669011] RIP: 0033:0x7f500a462ee6
[0.669011] Code: 00 00 75 05 48 83 c4 18 c3 e8 e6 ef 01 00 66 0f 1f 
44 00 00 41 89 f8 48 89 f7 48 89 d6 41 83 f8 01 77 29 b8 04 02
[0.669011] RSP: 002b:7ffd1fb01848 EFLAGS: 0246 ORIG_RAX: 
0004
[0.669011] RAX: ffda RBX: 7ffd1fb019d0 RCX: 
7f500a462ee6
[0.669011] RDX: 7ffd1fb01890 RSI: 7ffd1fb01890 RDI: 
561c13db2498
[0.669011] RBP: 561c13db1778 R08: 0001 R09: 
ff736cff6f647166
[0.669011] R10: 7

Re: Strange problem with SCTP+IPv6

2020-06-26 Thread Michael Tuexen
> On 26. Jun 2020, at 18:13, David Laight  wrote:
> 
> From: Xin Long
>> Sent: 23 June 2020 11:14
 It looks like a bug to me. Testing with this test app here, I can see
 the INIT_ACK being sent with a bunch of ipv4 addresses in it and
 that's unexpected for a v6only socket. As is, it's the server saying
 "I'm available at these other addresses too, but not."
>>> I agree.
>> Then we need a fix in sctp_bind_addrs_to_raw():
>> 
>> @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const
>> struct sctp_bind_addr *bp,
>>addrparms = retval;
>> 
>>list_for_each_entry(addr, >address_list, list) {
>> +   if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) &&
>> +   (AF_INET == addr->a.sa.sa_family))
>> +   continue;
>>af = sctp_get_af_specific(addr->a.v4.sin_family);
>>len = af->to_addr_param(>a, );
>>memcpy(addrparms.v, , len);
> 
> Thought.
> 
> Does it make any sense to offer addresses in the INIT_ACK that don't
> have routes to those proposed in the received INIT?
> 
> 'routes' probably isn't exactly the right word.
> You probably only want the local address that will be used
> as the source address for the probes.
> Or, at least, sources addresses that could be used for the probes.
> 
> So if the INIT only contains IPv6 addresses should the INIT_ACK
> ever contain IPv4 ones.
The client (if it not using an IPv6 socket having IPv6 only enabled) could
add an IPv4 address during the lifetime of the association by using the
address reconfiguration extension.

What could be done is to not send IPv4 addresses if the INIT contains
a Supported Address Types parameter indicating IPv6, but not IPv4 support.
As a client you might want to send this parameter, when the IPv6 socket has
enabled the IPV6_ONLY socket option.
Also if the client uses an IPv4 socket, it can indicate in the Supported
Address Parameter that it only support IPv4, and the server does not need
to list IPv6 addresses.

Best regards
Michael
> 
>   David.
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
> 1PT, UK
> Registration No: 1397386 (Wales)



RE: Strange problem with SCTP+IPv6

2020-06-26 Thread David Laight
From: Xin Long
> Sent: 23 June 2020 11:14
> > > It looks like a bug to me. Testing with this test app here, I can see
> > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > that's unexpected for a v6only socket. As is, it's the server saying
> > > "I'm available at these other addresses too, but not."
> > I agree.
> Then we need a fix in sctp_bind_addrs_to_raw():
> 
> @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const
> struct sctp_bind_addr *bp,
> addrparms = retval;
> 
> list_for_each_entry(addr, >address_list, list) {
> +   if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) &&
> +   (AF_INET == addr->a.sa.sa_family))
> +   continue;
> af = sctp_get_af_specific(addr->a.v4.sin_family);
> len = af->to_addr_param(>a, );
> memcpy(addrparms.v, , len);

Thought.

Does it make any sense to offer addresses in the INIT_ACK that don't
have routes to those proposed in the received INIT?

'routes' probably isn't exactly the right word.
You probably only want the local address that will be used
as the source address for the probes.
Or, at least, sources addresses that could be used for the probes.

So if the INIT only contains IPv6 addresses should the INIT_ACK
ever contain IPv4 ones.

David.

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)


Re: Strange problem with SCTP+IPv6

2020-06-24 Thread Michael Tuexen
> On 24. Jun 2020, at 09:25, Xin Long  wrote:
> 
> On Wed, Jun 24, 2020 at 5:48 AM Michael Tuexen
>  wrote:
>> 
>>> On 23. Jun 2020, at 23:31, Marcelo Ricardo Leitner 
>>>  wrote:
>>> 
>>> On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote:
>>>>> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner 
>>>>>  wrote:
>>>>> 
>>>>> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
>>>>>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
>>>>>>> From: Marcelo Ricardo Leitner
>>>>>>>> Sent: 22 June 2020 19:33
>>>>>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
>>>>>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
>>>>>>>>>> 
>>>>>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
>>>>>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
>>>>>>>>>>> wrote:
>>>>>>>>>>>> 
>>>>>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
>>>>>>>>>>>> create an
>>>>>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option 
>>>>>>>>>>>> on it,
>>>>>>>>>>>> then I make a connection to it using ::1, the connection will drop 
>>>>>>>>>>>> after
>>>>>>>>>>>> 2.5 seconds with an ECONNRESET error.
>>>>>>>>>>>> 
>>>>>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect 
>>>>>>>>>>>> to a
>>>>>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you 
>>>>>>>>>>>> don't
>>>>>>>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>>>>>>>>>>>> I tried on an ARM system and x86_64.
>>>>>>>>>>>> 
>>>>>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, 
>>>>>>>>>>>> but I
>>>>>>>>>>>> thought I would go ahead and report it.  I am attaching a 
>>>>>>>>>>>> reproducer.
>>>>>>>>>>>> Basically, compile the following code:
>>>>>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side 
>>>>>>>>>>> will
>>>>>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() 
>>>>>>>>>>> to
>>>>>>>>>>> bind any specific addresses ). Then after the connection is created,
>>>>>>>>>>> the client will send HB on the v4 paths to the server. The server
>>>>>>>>>>> will abort the connection, as it can't support v4.
>>>>>>>>>>> 
>>>>>>>>>>> So you can work around it by either:
>>>>>>>>>>> 
>>>>>>>>>>> - set IPV6_V6ONLY on client side.
>>>>>>>>>>> 
>>>>>>>>>>> or
>>>>>>>>>>> 
>>>>>>>>>>> - bind to the specific v6 addresses on the client side.
>>>>>>>>>>> 
>>>>>>>>>>> I don't see RFC said something about this.
>>>>>>>>>>> So it may not be a good idea to change the current behaviour
>>>>>>>>>>> to not establish the connection in this case, which may cause 
>>>>>>>>>>> regression.
>>>>>>>>>> 
>>>>>>>>>> Ok, I understand this.  It's a little strange, but I see why it works
>>>>>>>>>> this way.
>>>>>>>>> I don't. I would expect it to work as I described in my email.
>>>>>>>>> Could someone explain me how and why it is behaving different from
>>>>>>>>> my expectation?
>>>>>>>> 
>>>>>>>> It looks

Re: Strange problem with SCTP+IPv6

2020-06-24 Thread Xin Long
On Wed, Jun 24, 2020 at 5:48 AM Michael Tuexen
 wrote:
>
> > On 23. Jun 2020, at 23:31, Marcelo Ricardo Leitner 
> >  wrote:
> >
> > On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote:
> >>> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner 
> >>>  wrote:
> >>>
> >>> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
> >>>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
> >>>>> From: Marcelo Ricardo Leitner
> >>>>>> Sent: 22 June 2020 19:33
> >>>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> >>>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> >>>>>>>>
> >>>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> >>>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> >>>>>>>>> wrote:
> >>>>>>>>>>
> >>>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> >>>>>>>>>> create an
> >>>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option 
> >>>>>>>>>> on it,
> >>>>>>>>>> then I make a connection to it using ::1, the connection will drop 
> >>>>>>>>>> after
> >>>>>>>>>> 2.5 seconds with an ECONNRESET error.
> >>>>>>>>>>
> >>>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect 
> >>>>>>>>>> to a
> >>>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you 
> >>>>>>>>>> don't
> >>>>>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> >>>>>>>>>> I tried on an ARM system and x86_64.
> >>>>>>>>>>
> >>>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, 
> >>>>>>>>>> but I
> >>>>>>>>>> thought I would go ahead and report it.  I am attaching a 
> >>>>>>>>>> reproducer.
> >>>>>>>>>> Basically, compile the following code:
> >>>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side 
> >>>>>>>>> will
> >>>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() 
> >>>>>>>>> to
> >>>>>>>>> bind any specific addresses ). Then after the connection is created,
> >>>>>>>>> the client will send HB on the v4 paths to the server. The server
> >>>>>>>>> will abort the connection, as it can't support v4.
> >>>>>>>>>
> >>>>>>>>> So you can work around it by either:
> >>>>>>>>>
> >>>>>>>>> - set IPV6_V6ONLY on client side.
> >>>>>>>>>
> >>>>>>>>> or
> >>>>>>>>>
> >>>>>>>>> - bind to the specific v6 addresses on the client side.
> >>>>>>>>>
> >>>>>>>>> I don't see RFC said something about this.
> >>>>>>>>> So it may not be a good idea to change the current behaviour
> >>>>>>>>> to not establish the connection in this case, which may cause 
> >>>>>>>>> regression.
> >>>>>>>>
> >>>>>>>> Ok, I understand this.  It's a little strange, but I see why it works
> >>>>>>>> this way.
> >>>>>>> I don't. I would expect it to work as I described in my email.
> >>>>>>> Could someone explain me how and why it is behaving different from
> >>>>>>> my expectation?
> >>>>>>
> >>>>>> It looks like a bug to me. Testing with this test app here, I can see
> >>>>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> >>>>>> that's unexpected for a v6only socket. As is, it's the server saying
> >>>>>>

Re: Strange problem with SCTP+IPv6

2020-06-24 Thread Xin Long
On Wed, Jun 24, 2020 at 12:00 AM Corey Minyard  wrote:
>
> On Tue, Jun 23, 2020 at 11:40:21PM +0800, Xin Long wrote:
> > On Tue, Jun 23, 2020 at 9:29 PM Corey Minyard  wrote:
> > >
> > > On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote:
> > > > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen
> > > >  wrote:
> > > > >
> > > > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner 
> > > > > >  wrote:
> > > > > >
> > > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > > > >>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > > > > >>>
> > > > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > > > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> > > > > >>>> wrote:
> > > > > >>>>>
> > > > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> > > > > >>>>> create an
> > > > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket 
> > > > > >>>>> option on it,
> > > > > >>>>> then I make a connection to it using ::1, the connection will 
> > > > > >>>>> drop after
> > > > > >>>>> 2.5 seconds with an ECONNRESET error.
> > > > > >>>>>
> > > > > >>>>> It only happens on SCTP, it doesn't have the issue if you 
> > > > > >>>>> connect to a
> > > > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you 
> > > > > >>>>> don't
> > > > > >>>>> set IPV6_V6ONLY.  I have verified current end of tree 
> > > > > >>>>> kernel.org.
> > > > > >>>>> I tried on an ARM system and x86_64.
> > > > > >>>>>
> > > > > >>>>> I haven't dug into the kernel to see if I could find anything 
> > > > > >>>>> yet, but I
> > > > > >>>>> thought I would go ahead and report it.  I am attaching a 
> > > > > >>>>> reproducer.
> > > > > >>>>> Basically, compile the following code:
> > > > > >>>> The code only set IPV6_V6ONLY on server side, so the client side 
> > > > > >>>> will
> > > > > >>>> still bind all the local ipv4 addresses (as you didn't call 
> > > > > >>>> bind() to
> > > > > >>>> bind any specific addresses ). Then after the connection is 
> > > > > >>>> created,
> > > > > >>>> the client will send HB on the v4 paths to the server. The server
> > > > > >>>> will abort the connection, as it can't support v4.
> > > > > >>>>
> > > > > >>>> So you can work around it by either:
> > > > > >>>>
> > > > > >>>> - set IPV6_V6ONLY on client side.
> > > > > >>>>
> > > > > >>>> or
> > > > > >>>>
> > > > > >>>> - bind to the specific v6 addresses on the client side.
> > > > > >>>>
> > > > > >>>> I don't see RFC said something about this.
> > > > > >>>> So it may not be a good idea to change the current behaviour
> > > > > >>>> to not establish the connection in this case, which may cause 
> > > > > >>>> regression.
> > > > > >>>
> > > > > >>> Ok, I understand this.  It's a little strange, but I see why it 
> > > > > >>> works
> > > > > >>> this way.
> > > > > >> I don't. I would expect it to work as I described in my email.
> > > > > >> Could someone explain me how and why it is behaving different from
> > > > > >> my expectation?
> > > > > >
> > > > > > It looks like a bug to me. Testing with this test app here, I can 
> > > > > > see
> > > > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > &

Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Michael Tuexen
> On 23. Jun 2020, at 23:31, Marcelo Ricardo Leitner 
>  wrote:
> 
> On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote:
>>> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner 
>>>  wrote:
>>> 
>>> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
>>>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
>>>>> From: Marcelo Ricardo Leitner
>>>>>> Sent: 22 June 2020 19:33
>>>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
>>>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
>>>>>>>> 
>>>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
>>>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
>>>>>>>>> wrote:
>>>>>>>>>> 
>>>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
>>>>>>>>>> create an
>>>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on 
>>>>>>>>>> it,
>>>>>>>>>> then I make a connection to it using ::1, the connection will drop 
>>>>>>>>>> after
>>>>>>>>>> 2.5 seconds with an ECONNRESET error.
>>>>>>>>>> 
>>>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to 
>>>>>>>>>> a
>>>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
>>>>>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>>>>>>>>>> I tried on an ARM system and x86_64.
>>>>>>>>>> 
>>>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, 
>>>>>>>>>> but I
>>>>>>>>>> thought I would go ahead and report it.  I am attaching a reproducer.
>>>>>>>>>> Basically, compile the following code:
>>>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will
>>>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to
>>>>>>>>> bind any specific addresses ). Then after the connection is created,
>>>>>>>>> the client will send HB on the v4 paths to the server. The server
>>>>>>>>> will abort the connection, as it can't support v4.
>>>>>>>>> 
>>>>>>>>> So you can work around it by either:
>>>>>>>>> 
>>>>>>>>> - set IPV6_V6ONLY on client side.
>>>>>>>>> 
>>>>>>>>> or
>>>>>>>>> 
>>>>>>>>> - bind to the specific v6 addresses on the client side.
>>>>>>>>> 
>>>>>>>>> I don't see RFC said something about this.
>>>>>>>>> So it may not be a good idea to change the current behaviour
>>>>>>>>> to not establish the connection in this case, which may cause 
>>>>>>>>> regression.
>>>>>>>> 
>>>>>>>> Ok, I understand this.  It's a little strange, but I see why it works
>>>>>>>> this way.
>>>>>>> I don't. I would expect it to work as I described in my email.
>>>>>>> Could someone explain me how and why it is behaving different from
>>>>>>> my expectation?
>>>>>> 
>>>>>> It looks like a bug to me. Testing with this test app here, I can see
>>>>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
>>>>>> that's unexpected for a v6only socket. As is, it's the server saying
>>>>>> "I'm available at these other addresses too, but not."
>>>>> 
>>>>> Does it even make sense to mix IPv4 and IPv6 addresses on the same
>>>>> connection?
>>>>> I don't remember ever seeing both types of address in a message,
>>>>> but may not have looked.
>>>> 
>>>> That's an interesting question.  Do the RFCs say anything?  I would
>>>> assume it was ok unless ipv6only was set.
>>>> 
>>>>> 
>>&

Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Marcelo Ricardo Leitner
On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote:
> > On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner 
> >  wrote:
> > 
> > On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
> >> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
> >>> From: Marcelo Ricardo Leitner
> >>>> Sent: 22 June 2020 19:33
> >>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> >>>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> >>>>>> 
> >>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> >>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> >>>>>>> wrote:
> >>>>>>>> 
> >>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> >>>>>>>> create an
> >>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on 
> >>>>>>>> it,
> >>>>>>>> then I make a connection to it using ::1, the connection will drop 
> >>>>>>>> after
> >>>>>>>> 2.5 seconds with an ECONNRESET error.
> >>>>>>>> 
> >>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to 
> >>>>>>>> a
> >>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> >>>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> >>>>>>>> I tried on an ARM system and x86_64.
> >>>>>>>> 
> >>>>>>>> I haven't dug into the kernel to see if I could find anything yet, 
> >>>>>>>> but I
> >>>>>>>> thought I would go ahead and report it.  I am attaching a reproducer.
> >>>>>>>> Basically, compile the following code:
> >>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will
> >>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to
> >>>>>>> bind any specific addresses ). Then after the connection is created,
> >>>>>>> the client will send HB on the v4 paths to the server. The server
> >>>>>>> will abort the connection, as it can't support v4.
> >>>>>>> 
> >>>>>>> So you can work around it by either:
> >>>>>>> 
> >>>>>>> - set IPV6_V6ONLY on client side.
> >>>>>>> 
> >>>>>>> or
> >>>>>>> 
> >>>>>>> - bind to the specific v6 addresses on the client side.
> >>>>>>> 
> >>>>>>> I don't see RFC said something about this.
> >>>>>>> So it may not be a good idea to change the current behaviour
> >>>>>>> to not establish the connection in this case, which may cause 
> >>>>>>> regression.
> >>>>>> 
> >>>>>> Ok, I understand this.  It's a little strange, but I see why it works
> >>>>>> this way.
> >>>>> I don't. I would expect it to work as I described in my email.
> >>>>> Could someone explain me how and why it is behaving different from
> >>>>> my expectation?
> >>>> 
> >>>> It looks like a bug to me. Testing with this test app here, I can see
> >>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> >>>> that's unexpected for a v6only socket. As is, it's the server saying
> >>>> "I'm available at these other addresses too, but not."
> >>> 
> >>> Does it even make sense to mix IPv4 and IPv6 addresses on the same
> >>> connection?
> >>> I don't remember ever seeing both types of address in a message,
> >>> but may not have looked.
> >> 
> >> That's an interesting question.  Do the RFCs say anything?  I would
> >> assume it was ok unless ipv6only was set.
> >> 
> >>> 
> >>> I also wonder whether the connection should be dropped for an error
> >>> response on a path that has never been validated.
> >> 
> >> That actually bothered me a bit more.  Shouldn't it stay up if any path
> >> is up?  That's kind of the whole p

Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Michael Tuexen
> On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner 
>  wrote:
> 
> On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
>> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
>>> From: Marcelo Ricardo Leitner
>>>> Sent: 22 June 2020 19:33
>>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
>>>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
>>>>>> 
>>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
>>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
>>>>>>>> 
>>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create 
>>>>>>>> an
>>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on 
>>>>>>>> it,
>>>>>>>> then I make a connection to it using ::1, the connection will drop 
>>>>>>>> after
>>>>>>>> 2.5 seconds with an ECONNRESET error.
>>>>>>>> 
>>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to a
>>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
>>>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>>>>>>>> I tried on an ARM system and x86_64.
>>>>>>>> 
>>>>>>>> I haven't dug into the kernel to see if I could find anything yet, but 
>>>>>>>> I
>>>>>>>> thought I would go ahead and report it.  I am attaching a reproducer.
>>>>>>>> Basically, compile the following code:
>>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will
>>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to
>>>>>>> bind any specific addresses ). Then after the connection is created,
>>>>>>> the client will send HB on the v4 paths to the server. The server
>>>>>>> will abort the connection, as it can't support v4.
>>>>>>> 
>>>>>>> So you can work around it by either:
>>>>>>> 
>>>>>>> - set IPV6_V6ONLY on client side.
>>>>>>> 
>>>>>>> or
>>>>>>> 
>>>>>>> - bind to the specific v6 addresses on the client side.
>>>>>>> 
>>>>>>> I don't see RFC said something about this.
>>>>>>> So it may not be a good idea to change the current behaviour
>>>>>>> to not establish the connection in this case, which may cause 
>>>>>>> regression.
>>>>>> 
>>>>>> Ok, I understand this.  It's a little strange, but I see why it works
>>>>>> this way.
>>>>> I don't. I would expect it to work as I described in my email.
>>>>> Could someone explain me how and why it is behaving different from
>>>>> my expectation?
>>>> 
>>>> It looks like a bug to me. Testing with this test app here, I can see
>>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
>>>> that's unexpected for a v6only socket. As is, it's the server saying
>>>> "I'm available at these other addresses too, but not."
>>> 
>>> Does it even make sense to mix IPv4 and IPv6 addresses on the same
>>> connection?
>>> I don't remember ever seeing both types of address in a message,
>>> but may not have looked.
>> 
>> That's an interesting question.  Do the RFCs say anything?  I would
>> assume it was ok unless ipv6only was set.
>> 
>>> 
>>> I also wonder whether the connection should be dropped for an error
>>> response on a path that has never been validated.
>> 
>> That actually bothered me a bit more.  Shouldn't it stay up if any path
>> is up?  That's kind of the whole point of multihoming.
> 
> Michael explained it on the other email. What he described is what I
> observed in my tests.
> 
>> 
>>> 
>>> OTOH the whole 'multi-homing' part of SCTP sucks.
>> 
>> I don't think so.
>> 
>>> The IP addresses a server needs to bind to depend on where the
>>> incoming connection will come from.
>>> A local connection may be able to use a 192.168.x.x address
>>> but a remote connection must not - as it may be defined locally
>>> at the remote system.
>>> But both connections can come into the public (routable) address.
>>> We have to tell customers to explicitly configure the local IP
>>> addresses - which means the application has to know what they are.
>>> Fortunately these apps are pretty static - usually M3UA.
>> 
>> Umm, no,  If you have a private address, it better be behind a firewall,
>> and the firewall should handle rewriting the packet to fix the addresses.
>> 
>> It doesn't appear that Linux netfilter does this.  There is a TODO in
>> the code for this.  But that's how it *should* work.
> 
> Right, we don't support SCTP aware NAT [1].
> 
> 1.https://tools.ietf.org/html/draft-stewart-behave-sctpnat-04
The current version is: https://tools.ietf.org/html/draft-ietf-tsvwg-natsupp-16

Another possibility for NAT traversal is UDP encapsulation...

Best regards
Michael
> 
>  Marcelo
> 
>> 
>> -corey
>> 
>>> 
>>> David
>>> 
>>> -
>>> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
>>> 1PT, UK
>>> Registration No: 1397386 (Wales)
>>> 



Re: Strange problem with SCTP+IPv6

2020-06-23 Thread 'Marcelo Ricardo Leitner'
On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
> > From: Marcelo Ricardo Leitner
> > > Sent: 22 June 2020 19:33
> > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > > > On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > > > >
> > > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > > > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> > > > >> wrote:
> > > > >>>
> > > > >>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> > > > >>> create an
> > > > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option 
> > > > >>> on it,
> > > > >>> then I make a connection to it using ::1, the connection will drop 
> > > > >>> after
> > > > >>> 2.5 seconds with an ECONNRESET error.
> > > > >>>
> > > > >>> It only happens on SCTP, it doesn't have the issue if you connect 
> > > > >>> to a
> > > > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> > > > >>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > > > >>> I tried on an ARM system and x86_64.
> > > > >>>
> > > > >>> I haven't dug into the kernel to see if I could find anything yet, 
> > > > >>> but I
> > > > >>> thought I would go ahead and report it.  I am attaching a 
> > > > >>> reproducer.
> > > > >>> Basically, compile the following code:
> > > > >> The code only set IPV6_V6ONLY on server side, so the client side will
> > > > >> still bind all the local ipv4 addresses (as you didn't call bind() to
> > > > >> bind any specific addresses ). Then after the connection is created,
> > > > >> the client will send HB on the v4 paths to the server. The server
> > > > >> will abort the connection, as it can't support v4.
> > > > >>
> > > > >> So you can work around it by either:
> > > > >>
> > > > >>  - set IPV6_V6ONLY on client side.
> > > > >>
> > > > >> or
> > > > >>
> > > > >>  - bind to the specific v6 addresses on the client side.
> > > > >>
> > > > >> I don't see RFC said something about this.
> > > > >> So it may not be a good idea to change the current behaviour
> > > > >> to not establish the connection in this case, which may cause 
> > > > >> regression.
> > > > >
> > > > > Ok, I understand this.  It's a little strange, but I see why it works
> > > > > this way.
> > > > I don't. I would expect it to work as I described in my email.
> > > > Could someone explain me how and why it is behaving different from
> > > > my expectation?
> > > 
> > > It looks like a bug to me. Testing with this test app here, I can see
> > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > that's unexpected for a v6only socket. As is, it's the server saying
> > > "I'm available at these other addresses too, but not."
> > 
> > Does it even make sense to mix IPv4 and IPv6 addresses on the same
> > connection?
> > I don't remember ever seeing both types of address in a message,
> > but may not have looked.
> 
> That's an interesting question.  Do the RFCs say anything?  I would
> assume it was ok unless ipv6only was set.
> 
> > 
> > I also wonder whether the connection should be dropped for an error
> > response on a path that has never been validated.
> 
> That actually bothered me a bit more.  Shouldn't it stay up if any path
> is up?  That's kind of the whole point of multihoming.

Michael explained it on the other email. What he described is what I
observed in my tests.

> 
> > 
> > OTOH the whole 'multi-homing' part of SCTP sucks.
> 
> I don't think so.
> 
> > The IP addresses a server needs to bind to depend on where the
> > incoming connection will come from.
> > A local connection may be able to use a 192.168.x.x address
> > but a remote connection must not - as it may be defined locally
> > at the remote system.
> > But both connections can come into the public (routable) address.
> > We have to tell customers to explicitly configure the local IP
> > addresses - which means the application has to know what they are.
> > Fortunately these apps are pretty static - usually M3UA.
> 
> Umm, no,  If you have a private address, it better be behind a firewall,
> and the firewall should handle rewriting the packet to fix the addresses.
> 
> It doesn't appear that Linux netfilter does this.  There is a TODO in
> the code for this.  But that's how it *should* work.

Right, we don't support SCTP aware NAT [1].

1.https://tools.ietf.org/html/draft-stewart-behave-sctpnat-04

  Marcelo

> 
> -corey
> 
> > 
> > David
> > 
> > -
> > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
> > 1PT, UK
> > Registration No: 1397386 (Wales)
> > 


Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Michael Tuexen
> On 23. Jun 2020, at 15:17, David Laight  wrote:
> 
> From: Marcelo Ricardo Leitner
>> Sent: 22 June 2020 19:33
>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
>>>> 
>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
>>>>>> 
>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
>>>>>> then I make a connection to it using ::1, the connection will drop after
>>>>>> 2.5 seconds with an ECONNRESET error.
>>>>>> 
>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to a
>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>>>>>> I tried on an ARM system and x86_64.
>>>>>> 
>>>>>> I haven't dug into the kernel to see if I could find anything yet, but I
>>>>>> thought I would go ahead and report it.  I am attaching a reproducer.
>>>>>> Basically, compile the following code:
>>>>> The code only set IPV6_V6ONLY on server side, so the client side will
>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to
>>>>> bind any specific addresses ). Then after the connection is created,
>>>>> the client will send HB on the v4 paths to the server. The server
>>>>> will abort the connection, as it can't support v4.
>>>>> 
>>>>> So you can work around it by either:
>>>>> 
>>>>> - set IPV6_V6ONLY on client side.
>>>>> 
>>>>> or
>>>>> 
>>>>> - bind to the specific v6 addresses on the client side.
>>>>> 
>>>>> I don't see RFC said something about this.
>>>>> So it may not be a good idea to change the current behaviour
>>>>> to not establish the connection in this case, which may cause regression.
>>>> 
>>>> Ok, I understand this.  It's a little strange, but I see why it works
>>>> this way.
>>> I don't. I would expect it to work as I described in my email.
>>> Could someone explain me how and why it is behaving different from
>>> my expectation?
>> 
>> It looks like a bug to me. Testing with this test app here, I can see
>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
>> that's unexpected for a v6only socket. As is, it's the server saying
>> "I'm available at these other addresses too, but not."
> 
> Does it even make sense to mix IPv4 and IPv6 addresses on the same
> connection?
Sure, if you have an IPv6 socket, which has not enabled the IPV6ONLY
socket option.
> I don't remember ever seeing both types of address in a message,
> but may not have looked.
> 
> I also wonder whether the connection should be dropped for an error
> response on a path that has never been validated.
Assuming that it is not an ERROR chunk which comes back, but an ABORT,
this should happen as long as the verification tag is OK.
> 
> OTOH the whole 'multi-homing' part of SCTP sucks.
> The IP addresses a server needs to bind to depend on where the
> incoming connection will come from.
Not sure what this means. The application can bind a wildcard
address or a specific subset. However, when an INIT comes in,
the INIT-ACK might contain only a subset of there due to
scoping.
> A local connection may be able to use a 192.168.x.x address
> but a remote connection must not - as it may be defined locally
> at the remote system.
Yepp. Not sure what you can do about it.
> But both connections can come into the public (routable) address.
> We have to tell customers to explicitly configure the local IP
> addresses - which means the application has to know what they are.
> Fortunately these apps are pretty static - usually M3UA.
Please note that in SIGRTRAN scenarios you normally not have NATs
involved as you have usually in setups used at home.

Best regards
Michael
> 
>   David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
> 1PT, UK
> Registration No: 1397386 (Wales)
> 



Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Corey Minyard
On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
> From: Marcelo Ricardo Leitner
> > Sent: 22 June 2020 19:33
> > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > > On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > > >
> > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> > > >>>
> > > >>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create 
> > > >>> an
> > > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on 
> > > >>> it,
> > > >>> then I make a connection to it using ::1, the connection will drop 
> > > >>> after
> > > >>> 2.5 seconds with an ECONNRESET error.
> > > >>>
> > > >>> It only happens on SCTP, it doesn't have the issue if you connect to a
> > > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> > > >>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > > >>> I tried on an ARM system and x86_64.
> > > >>>
> > > >>> I haven't dug into the kernel to see if I could find anything yet, 
> > > >>> but I
> > > >>> thought I would go ahead and report it.  I am attaching a reproducer.
> > > >>> Basically, compile the following code:
> > > >> The code only set IPV6_V6ONLY on server side, so the client side will
> > > >> still bind all the local ipv4 addresses (as you didn't call bind() to
> > > >> bind any specific addresses ). Then after the connection is created,
> > > >> the client will send HB on the v4 paths to the server. The server
> > > >> will abort the connection, as it can't support v4.
> > > >>
> > > >> So you can work around it by either:
> > > >>
> > > >>  - set IPV6_V6ONLY on client side.
> > > >>
> > > >> or
> > > >>
> > > >>  - bind to the specific v6 addresses on the client side.
> > > >>
> > > >> I don't see RFC said something about this.
> > > >> So it may not be a good idea to change the current behaviour
> > > >> to not establish the connection in this case, which may cause 
> > > >> regression.
> > > >
> > > > Ok, I understand this.  It's a little strange, but I see why it works
> > > > this way.
> > > I don't. I would expect it to work as I described in my email.
> > > Could someone explain me how and why it is behaving different from
> > > my expectation?
> > 
> > It looks like a bug to me. Testing with this test app here, I can see
> > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > that's unexpected for a v6only socket. As is, it's the server saying
> > "I'm available at these other addresses too, but not."
> 
> Does it even make sense to mix IPv4 and IPv6 addresses on the same
> connection?
> I don't remember ever seeing both types of address in a message,
> but may not have looked.

That's an interesting question.  Do the RFCs say anything?  I would
assume it was ok unless ipv6only was set.

> 
> I also wonder whether the connection should be dropped for an error
> response on a path that has never been validated.

That actually bothered me a bit more.  Shouldn't it stay up if any path
is up?  That's kind of the whole point of multihoming.

> 
> OTOH the whole 'multi-homing' part of SCTP sucks.

I don't think so.

> The IP addresses a server needs to bind to depend on where the
> incoming connection will come from.
> A local connection may be able to use a 192.168.x.x address
> but a remote connection must not - as it may be defined locally
> at the remote system.
> But both connections can come into the public (routable) address.
> We have to tell customers to explicitly configure the local IP
> addresses - which means the application has to know what they are.
> Fortunately these apps are pretty static - usually M3UA.

Umm, no,  If you have a private address, it better be behind a firewall,
and the firewall should handle rewriting the packet to fix the addresses.

It doesn't appear that Linux netfilter does this.  There is a TODO in
the code for this.  But that's how it *should* work.

-corey

> 
>   David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 
> 1PT, UK
> Registration No: 1397386 (Wales)
> 


Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Corey Minyard
On Tue, Jun 23, 2020 at 11:40:21PM +0800, Xin Long wrote:
> On Tue, Jun 23, 2020 at 9:29 PM Corey Minyard  wrote:
> >
> > On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote:
> > > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen
> > >  wrote:
> > > >
> > > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner 
> > > > >  wrote:
> > > > >
> > > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > > >>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > > > >>>
> > > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> > > > >>>> wrote:
> > > > >>>>>
> > > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> > > > >>>>> create an
> > > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option 
> > > > >>>>> on it,
> > > > >>>>> then I make a connection to it using ::1, the connection will 
> > > > >>>>> drop after
> > > > >>>>> 2.5 seconds with an ECONNRESET error.
> > > > >>>>>
> > > > >>>>> It only happens on SCTP, it doesn't have the issue if you connect 
> > > > >>>>> to a
> > > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you 
> > > > >>>>> don't
> > > > >>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > > > >>>>> I tried on an ARM system and x86_64.
> > > > >>>>>
> > > > >>>>> I haven't dug into the kernel to see if I could find anything 
> > > > >>>>> yet, but I
> > > > >>>>> thought I would go ahead and report it.  I am attaching a 
> > > > >>>>> reproducer.
> > > > >>>>> Basically, compile the following code:
> > > > >>>> The code only set IPV6_V6ONLY on server side, so the client side 
> > > > >>>> will
> > > > >>>> still bind all the local ipv4 addresses (as you didn't call bind() 
> > > > >>>> to
> > > > >>>> bind any specific addresses ). Then after the connection is 
> > > > >>>> created,
> > > > >>>> the client will send HB on the v4 paths to the server. The server
> > > > >>>> will abort the connection, as it can't support v4.
> > > > >>>>
> > > > >>>> So you can work around it by either:
> > > > >>>>
> > > > >>>> - set IPV6_V6ONLY on client side.
> > > > >>>>
> > > > >>>> or
> > > > >>>>
> > > > >>>> - bind to the specific v6 addresses on the client side.
> > > > >>>>
> > > > >>>> I don't see RFC said something about this.
> > > > >>>> So it may not be a good idea to change the current behaviour
> > > > >>>> to not establish the connection in this case, which may cause 
> > > > >>>> regression.
> > > > >>>
> > > > >>> Ok, I understand this.  It's a little strange, but I see why it 
> > > > >>> works
> > > > >>> this way.
> > > > >> I don't. I would expect it to work as I described in my email.
> > > > >> Could someone explain me how and why it is behaving different from
> > > > >> my expectation?
> > > > >
> > > > > It looks like a bug to me. Testing with this test app here, I can see
> > > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > > > that's unexpected for a v6only socket. As is, it's the server saying
> > > > > "I'm available at these other addresses too, but not."
> > > > I agree.
> > > Then we need a fix in sctp_bind_addrs_to_raw():
> > >
> > > @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const
> > > struct sctp_bind_addr *bp,
> > > addrparms = retval;
> > >
> > > list_for_each_entry(

Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Xin Long
On Tue, Jun 23, 2020 at 9:29 PM Corey Minyard  wrote:
>
> On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote:
> > On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen
> >  wrote:
> > >
> > > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner 
> > > >  wrote:
> > > >
> > > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > >>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > > >>>
> > > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> > > >>>> wrote:
> > > >>>>>
> > > >>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> > > >>>>> create an
> > > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option 
> > > >>>>> on it,
> > > >>>>> then I make a connection to it using ::1, the connection will drop 
> > > >>>>> after
> > > >>>>> 2.5 seconds with an ECONNRESET error.
> > > >>>>>
> > > >>>>> It only happens on SCTP, it doesn't have the issue if you connect 
> > > >>>>> to a
> > > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> > > >>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > > >>>>> I tried on an ARM system and x86_64.
> > > >>>>>
> > > >>>>> I haven't dug into the kernel to see if I could find anything yet, 
> > > >>>>> but I
> > > >>>>> thought I would go ahead and report it.  I am attaching a 
> > > >>>>> reproducer.
> > > >>>>> Basically, compile the following code:
> > > >>>> The code only set IPV6_V6ONLY on server side, so the client side will
> > > >>>> still bind all the local ipv4 addresses (as you didn't call bind() to
> > > >>>> bind any specific addresses ). Then after the connection is created,
> > > >>>> the client will send HB on the v4 paths to the server. The server
> > > >>>> will abort the connection, as it can't support v4.
> > > >>>>
> > > >>>> So you can work around it by either:
> > > >>>>
> > > >>>> - set IPV6_V6ONLY on client side.
> > > >>>>
> > > >>>> or
> > > >>>>
> > > >>>> - bind to the specific v6 addresses on the client side.
> > > >>>>
> > > >>>> I don't see RFC said something about this.
> > > >>>> So it may not be a good idea to change the current behaviour
> > > >>>> to not establish the connection in this case, which may cause 
> > > >>>> regression.
> > > >>>
> > > >>> Ok, I understand this.  It's a little strange, but I see why it works
> > > >>> this way.
> > > >> I don't. I would expect it to work as I described in my email.
> > > >> Could someone explain me how and why it is behaving different from
> > > >> my expectation?
> > > >
> > > > It looks like a bug to me. Testing with this test app here, I can see
> > > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > > that's unexpected for a v6only socket. As is, it's the server saying
> > > > "I'm available at these other addresses too, but not."
> > > I agree.
> > Then we need a fix in sctp_bind_addrs_to_raw():
> >
> > @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const
> > struct sctp_bind_addr *bp,
> > addrparms = retval;
> >
> > list_for_each_entry(addr, >address_list, list) {
> > +   if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) &&
> > +   (AF_INET == addr->a.sa.sa_family))
> > +   continue;
>
> This does not compile in the latest mainline.  sk is not defined.
> Also, if you could send a normal git patch, that would be easier to
> manage.
sorry, that was just the code to show the idea.

For the compilable one, pls see:
https://paste.centos.org/view/49f5ff5a

Thanks.
>
> Thanks,
>
> -corey
>
> > a

Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Corey Minyard
On Tue, Jun 23, 2020 at 06:13:30PM +0800, Xin Long wrote:
> On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen
>  wrote:
> >
> > > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner 
> > >  wrote:
> > >
> > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > >>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > >>>
> > >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> > >>>>>
> > >>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create 
> > >>>>> an
> > >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on 
> > >>>>> it,
> > >>>>> then I make a connection to it using ::1, the connection will drop 
> > >>>>> after
> > >>>>> 2.5 seconds with an ECONNRESET error.
> > >>>>>
> > >>>>> It only happens on SCTP, it doesn't have the issue if you connect to a
> > >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> > >>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > >>>>> I tried on an ARM system and x86_64.
> > >>>>>
> > >>>>> I haven't dug into the kernel to see if I could find anything yet, 
> > >>>>> but I
> > >>>>> thought I would go ahead and report it.  I am attaching a reproducer.
> > >>>>> Basically, compile the following code:
> > >>>> The code only set IPV6_V6ONLY on server side, so the client side will
> > >>>> still bind all the local ipv4 addresses (as you didn't call bind() to
> > >>>> bind any specific addresses ). Then after the connection is created,
> > >>>> the client will send HB on the v4 paths to the server. The server
> > >>>> will abort the connection, as it can't support v4.
> > >>>>
> > >>>> So you can work around it by either:
> > >>>>
> > >>>> - set IPV6_V6ONLY on client side.
> > >>>>
> > >>>> or
> > >>>>
> > >>>> - bind to the specific v6 addresses on the client side.
> > >>>>
> > >>>> I don't see RFC said something about this.
> > >>>> So it may not be a good idea to change the current behaviour
> > >>>> to not establish the connection in this case, which may cause 
> > >>>> regression.
> > >>>
> > >>> Ok, I understand this.  It's a little strange, but I see why it works
> > >>> this way.
> > >> I don't. I would expect it to work as I described in my email.
> > >> Could someone explain me how and why it is behaving different from
> > >> my expectation?
> > >
> > > It looks like a bug to me. Testing with this test app here, I can see
> > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > that's unexpected for a v6only socket. As is, it's the server saying
> > > "I'm available at these other addresses too, but not."
> > I agree.
> Then we need a fix in sctp_bind_addrs_to_raw():
> 
> @@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const
> struct sctp_bind_addr *bp,
> addrparms = retval;
> 
> list_for_each_entry(addr, >address_list, list) {
> +   if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) &&
> +   (AF_INET == addr->a.sa.sa_family))
> +   continue;

This does not compile in the latest mainline.  sk is not defined.
Also, if you could send a normal git patch, that would be easier to 
manage.

Thanks,

-corey

> af = sctp_get_af_specific(addr->a.v4.sin_family);
> len = af->to_addr_param(>a, );
> memcpy(addrparms.v, , len);
> 
> >
> > Best regards
> > Michael
> > >
> > > Thanks,
> > > Marcelo
> > >
> > >>
> > >> Best regards
> > >> Michael
> > >>>
> > >>> Thanks,
> > >>>
> > >>> -corey
> > >>>
> > >>>>
> > >>>>>
> > >>>>> gcc -g -o sctptest -Wall sctptest.c
> > >>>>>
> > >>&

RE: Strange problem with SCTP+IPv6

2020-06-23 Thread David Laight
From: Marcelo Ricardo Leitner
> Sent: 22 June 2020 19:33
> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > >
> > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> > >>>
> > >>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
> > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
> > >>> then I make a connection to it using ::1, the connection will drop after
> > >>> 2.5 seconds with an ECONNRESET error.
> > >>>
> > >>> It only happens on SCTP, it doesn't have the issue if you connect to a
> > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> > >>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > >>> I tried on an ARM system and x86_64.
> > >>>
> > >>> I haven't dug into the kernel to see if I could find anything yet, but I
> > >>> thought I would go ahead and report it.  I am attaching a reproducer.
> > >>> Basically, compile the following code:
> > >> The code only set IPV6_V6ONLY on server side, so the client side will
> > >> still bind all the local ipv4 addresses (as you didn't call bind() to
> > >> bind any specific addresses ). Then after the connection is created,
> > >> the client will send HB on the v4 paths to the server. The server
> > >> will abort the connection, as it can't support v4.
> > >>
> > >> So you can work around it by either:
> > >>
> > >>  - set IPV6_V6ONLY on client side.
> > >>
> > >> or
> > >>
> > >>  - bind to the specific v6 addresses on the client side.
> > >>
> > >> I don't see RFC said something about this.
> > >> So it may not be a good idea to change the current behaviour
> > >> to not establish the connection in this case, which may cause regression.
> > >
> > > Ok, I understand this.  It's a little strange, but I see why it works
> > > this way.
> > I don't. I would expect it to work as I described in my email.
> > Could someone explain me how and why it is behaving different from
> > my expectation?
> 
> It looks like a bug to me. Testing with this test app here, I can see
> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> that's unexpected for a v6only socket. As is, it's the server saying
> "I'm available at these other addresses too, but not."

Does it even make sense to mix IPv4 and IPv6 addresses on the same
connection?
I don't remember ever seeing both types of address in a message,
but may not have looked.

I also wonder whether the connection should be dropped for an error
response on a path that has never been validated.

OTOH the whole 'multi-homing' part of SCTP sucks.
The IP addresses a server needs to bind to depend on where the
incoming connection will come from.
A local connection may be able to use a 192.168.x.x address
but a remote connection must not - as it may be defined locally
at the remote system.
But both connections can come into the public (routable) address.
We have to tell customers to explicitly configure the local IP
addresses - which means the application has to know what they are.
Fortunately these apps are pretty static - usually M3UA.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)



Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Xin Long
On Tue, Jun 23, 2020 at 2:34 AM Michael Tuexen
 wrote:
>
> > On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner 
> >  wrote:
> >
> > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> >>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> >>>
> >>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> >>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> >>>>>
> >>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
> >>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
> >>>>> then I make a connection to it using ::1, the connection will drop after
> >>>>> 2.5 seconds with an ECONNRESET error.
> >>>>>
> >>>>> It only happens on SCTP, it doesn't have the issue if you connect to a
> >>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> >>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> >>>>> I tried on an ARM system and x86_64.
> >>>>>
> >>>>> I haven't dug into the kernel to see if I could find anything yet, but I
> >>>>> thought I would go ahead and report it.  I am attaching a reproducer.
> >>>>> Basically, compile the following code:
> >>>> The code only set IPV6_V6ONLY on server side, so the client side will
> >>>> still bind all the local ipv4 addresses (as you didn't call bind() to
> >>>> bind any specific addresses ). Then after the connection is created,
> >>>> the client will send HB on the v4 paths to the server. The server
> >>>> will abort the connection, as it can't support v4.
> >>>>
> >>>> So you can work around it by either:
> >>>>
> >>>> - set IPV6_V6ONLY on client side.
> >>>>
> >>>> or
> >>>>
> >>>> - bind to the specific v6 addresses on the client side.
> >>>>
> >>>> I don't see RFC said something about this.
> >>>> So it may not be a good idea to change the current behaviour
> >>>> to not establish the connection in this case, which may cause regression.
> >>>
> >>> Ok, I understand this.  It's a little strange, but I see why it works
> >>> this way.
> >> I don't. I would expect it to work as I described in my email.
> >> Could someone explain me how and why it is behaving different from
> >> my expectation?
> >
> > It looks like a bug to me. Testing with this test app here, I can see
> > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > that's unexpected for a v6only socket. As is, it's the server saying
> > "I'm available at these other addresses too, but not."
> I agree.
Then we need a fix in sctp_bind_addrs_to_raw():

@@ -238,6 +240,9 @@ union sctp_params sctp_bind_addrs_to_raw(const
struct sctp_bind_addr *bp,
addrparms = retval;

list_for_each_entry(addr, >address_list, list) {
+   if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) &&
+   (AF_INET == addr->a.sa.sa_family))
+   continue;
af = sctp_get_af_specific(addr->a.v4.sin_family);
len = af->to_addr_param(>a, );
memcpy(addrparms.v, , len);

>
> Best regards
> Michael
> >
> > Thanks,
> > Marcelo
> >
> >>
> >> Best regards
> >> Michael
> >>>
> >>> Thanks,
> >>>
> >>> -corey
> >>>
> >>>>
> >>>>>
> >>>>> gcc -g -o sctptest -Wall sctptest.c
> >>>>>
> >>>>> and run it in one window as a server:
> >>>>>
> >>>>> ./sctptest a
> >>>>>
> >>>>> (Pass in any option to be the server) and run the following in another
> >>>>> window as the client:
> >>>>>
> >>>>> ./sctptest
> >>>>>
> >>>>> It disconnects after about 2.5 seconds.  If it works, it should just sit
> >>>>> there forever.
> >>>>>
> >>>>> -corey
> >>>>>
> >>>>>
> >>>>> #include 
> >>>>> #include 
> >>>>> #include 
> >>>>> #include 
> >>>>> #include 
> >>

Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Michael Tuexen
> On 22. Jun 2020, at 20:32, Marcelo Ricardo Leitner 
>  wrote:
> 
> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
>>> 
>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
>>>>> 
>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
>>>>> then I make a connection to it using ::1, the connection will drop after
>>>>> 2.5 seconds with an ECONNRESET error.
>>>>> 
>>>>> It only happens on SCTP, it doesn't have the issue if you connect to a
>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>>>>> I tried on an ARM system and x86_64.
>>>>> 
>>>>> I haven't dug into the kernel to see if I could find anything yet, but I
>>>>> thought I would go ahead and report it.  I am attaching a reproducer.
>>>>> Basically, compile the following code:
>>>> The code only set IPV6_V6ONLY on server side, so the client side will
>>>> still bind all the local ipv4 addresses (as you didn't call bind() to
>>>> bind any specific addresses ). Then after the connection is created,
>>>> the client will send HB on the v4 paths to the server. The server
>>>> will abort the connection, as it can't support v4.
>>>> 
>>>> So you can work around it by either:
>>>> 
>>>> - set IPV6_V6ONLY on client side.
>>>> 
>>>> or
>>>> 
>>>> - bind to the specific v6 addresses on the client side.
>>>> 
>>>> I don't see RFC said something about this.
>>>> So it may not be a good idea to change the current behaviour
>>>> to not establish the connection in this case, which may cause regression.
>>> 
>>> Ok, I understand this.  It's a little strange, but I see why it works
>>> this way.
>> I don't. I would expect it to work as I described in my email.
>> Could someone explain me how and why it is behaving different from
>> my expectation?
> 
> It looks like a bug to me. Testing with this test app here, I can see
> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> that's unexpected for a v6only socket. As is, it's the server saying
> "I'm available at these other addresses too, but not."
I agree.

Best regards
Michael
> 
> Thanks,
> Marcelo
> 
>> 
>> Best regards
>> Michael
>>> 
>>> Thanks,
>>> 
>>> -corey
>>> 
>>>> 
>>>>> 
>>>>> gcc -g -o sctptest -Wall sctptest.c
>>>>> 
>>>>> and run it in one window as a server:
>>>>> 
>>>>> ./sctptest a
>>>>> 
>>>>> (Pass in any option to be the server) and run the following in another
>>>>> window as the client:
>>>>> 
>>>>> ./sctptest
>>>>> 
>>>>> It disconnects after about 2.5 seconds.  If it works, it should just sit
>>>>> there forever.
>>>>> 
>>>>> -corey
>>>>> 
>>>>> 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> #include 
>>>>> 
>>>>> static int
>>>>> getaddr(const char *addr, const char *port, bool listen,
>>>>>   struct addrinfo **rai)
>>>>> {
>>>>>   struct addrinfo *ai, hints;
>>>>> 
>>>>>   memset(, 0, sizeof(hints));
>>>>>   hints.ai_flags = AI_ADDRCONFIG;
>>>>>   if (listen)
>>>>>   hints.ai_flags |= AI_PASSIVE;
>>>>>   hints.ai_family = AF_UNSPEC;
>>>>>   hints.ai_socktype = SOCK_STREAM;
>>>>>   hints.ai_protocol = IPPROTO_SCTP;
>>>>>   if (getaddrinfo(addr, port, , )) {
>>>>>   perror("getaddrinfo");
>>>>>   return -1;
>>>>>   }
>>>>> 
>>>>>   *rai = ai;
>>>>>  

Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Marcelo Ricardo Leitner
On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > 
> > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> >>> 
> >>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
> >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
> >>> then I make a connection to it using ::1, the connection will drop after
> >>> 2.5 seconds with an ECONNRESET error.
> >>> 
> >>> It only happens on SCTP, it doesn't have the issue if you connect to a
> >>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> >>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> >>> I tried on an ARM system and x86_64.
> >>> 
> >>> I haven't dug into the kernel to see if I could find anything yet, but I
> >>> thought I would go ahead and report it.  I am attaching a reproducer.
> >>> Basically, compile the following code:
> >> The code only set IPV6_V6ONLY on server side, so the client side will
> >> still bind all the local ipv4 addresses (as you didn't call bind() to
> >> bind any specific addresses ). Then after the connection is created,
> >> the client will send HB on the v4 paths to the server. The server
> >> will abort the connection, as it can't support v4.
> >> 
> >> So you can work around it by either:
> >> 
> >>  - set IPV6_V6ONLY on client side.
> >> 
> >> or
> >> 
> >>  - bind to the specific v6 addresses on the client side.
> >> 
> >> I don't see RFC said something about this.
> >> So it may not be a good idea to change the current behaviour
> >> to not establish the connection in this case, which may cause regression.
> > 
> > Ok, I understand this.  It's a little strange, but I see why it works
> > this way.
> I don't. I would expect it to work as I described in my email.
> Could someone explain me how and why it is behaving different from
> my expectation?

It looks like a bug to me. Testing with this test app here, I can see
the INIT_ACK being sent with a bunch of ipv4 addresses in it and
that's unexpected for a v6only socket. As is, it's the server saying
"I'm available at these other addresses too, but not."

Thanks,
Marcelo

> 
> Best regards
> Michael
> > 
> > Thanks,
> > 
> > -corey
> > 
> >> 
> >>> 
> >>>  gcc -g -o sctptest -Wall sctptest.c
> >>> 
> >>> and run it in one window as a server:
> >>> 
> >>>  ./sctptest a
> >>> 
> >>> (Pass in any option to be the server) and run the following in another
> >>> window as the client:
> >>> 
> >>>  ./sctptest
> >>> 
> >>> It disconnects after about 2.5 seconds.  If it works, it should just sit
> >>> there forever.
> >>> 
> >>> -corey
> >>> 
> >>> 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> 
> >>> static int
> >>> getaddr(const char *addr, const char *port, bool listen,
> >>>struct addrinfo **rai)
> >>> {
> >>>struct addrinfo *ai, hints;
> >>> 
> >>>memset(, 0, sizeof(hints));
> >>>hints.ai_flags = AI_ADDRCONFIG;
> >>>if (listen)
> >>>hints.ai_flags |= AI_PASSIVE;
> >>>hints.ai_family = AF_UNSPEC;
> >>>hints.ai_socktype = SOCK_STREAM;
> >>>hints.ai_protocol = IPPROTO_SCTP;
> >>>if (getaddrinfo(addr, port, , )) {
> >>>perror("getaddrinfo");
> >>>return -1;
> >>>}
> >>> 
> >>>*rai = ai;
> >>>return 0;
> >>> }
> >>> 
> >>> static int
> >>> waitread(int s)
> >>> {
> >>>char data[1];
> >>>ssize_t rv;
> >>> 
> >>>rv = read(s, data, sizeof(data));
> >>>if (rv == -1) {
> >>>perror("read");
> >>>return -1;
> >>>}
> >>

Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Michael Tuexen
> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> 
> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
>>> 
>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
>>> then I make a connection to it using ::1, the connection will drop after
>>> 2.5 seconds with an ECONNRESET error.
>>> 
>>> It only happens on SCTP, it doesn't have the issue if you connect to a
>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>>> I tried on an ARM system and x86_64.
>>> 
>>> I haven't dug into the kernel to see if I could find anything yet, but I
>>> thought I would go ahead and report it.  I am attaching a reproducer.
>>> Basically, compile the following code:
>> The code only set IPV6_V6ONLY on server side, so the client side will
>> still bind all the local ipv4 addresses (as you didn't call bind() to
>> bind any specific addresses ). Then after the connection is created,
>> the client will send HB on the v4 paths to the server. The server
>> will abort the connection, as it can't support v4.
>> 
>> So you can work around it by either:
>> 
>>  - set IPV6_V6ONLY on client side.
>> 
>> or
>> 
>>  - bind to the specific v6 addresses on the client side.
>> 
>> I don't see RFC said something about this.
>> So it may not be a good idea to change the current behaviour
>> to not establish the connection in this case, which may cause regression.
> 
> Ok, I understand this.  It's a little strange, but I see why it works
> this way.
I don't. I would expect it to work as I described in my email.
Could someone explain me how and why it is behaving different from
my expectation?

Best regards
Michael
> 
> Thanks,
> 
> -corey
> 
>> 
>>> 
>>>  gcc -g -o sctptest -Wall sctptest.c
>>> 
>>> and run it in one window as a server:
>>> 
>>>  ./sctptest a
>>> 
>>> (Pass in any option to be the server) and run the following in another
>>> window as the client:
>>> 
>>>  ./sctptest
>>> 
>>> It disconnects after about 2.5 seconds.  If it works, it should just sit
>>> there forever.
>>> 
>>> -corey
>>> 
>>> 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> #include 
>>> 
>>> static int
>>> getaddr(const char *addr, const char *port, bool listen,
>>>struct addrinfo **rai)
>>> {
>>>struct addrinfo *ai, hints;
>>> 
>>>memset(, 0, sizeof(hints));
>>>hints.ai_flags = AI_ADDRCONFIG;
>>>if (listen)
>>>hints.ai_flags |= AI_PASSIVE;
>>>hints.ai_family = AF_UNSPEC;
>>>hints.ai_socktype = SOCK_STREAM;
>>>hints.ai_protocol = IPPROTO_SCTP;
>>>if (getaddrinfo(addr, port, , )) {
>>>perror("getaddrinfo");
>>>return -1;
>>>}
>>> 
>>>*rai = ai;
>>>return 0;
>>> }
>>> 
>>> static int
>>> waitread(int s)
>>> {
>>>char data[1];
>>>ssize_t rv;
>>> 
>>>rv = read(s, data, sizeof(data));
>>>if (rv == -1) {
>>>perror("read");
>>>return -1;
>>>}
>>>printf("Read %d bytes\n", (int) rv);
>>>return 0;
>>> }
>>> 
>>> static int
>>> do_server(void)
>>> {
>>>int err, ls, s, optval;
>>>struct addrinfo *ai;
>>> 
>>>printf("Server\n");
>>> 
>>>err = getaddr("::", "3023", true, );
>>>if (err)
>>>return err;
>>> 
>>>ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
>>>if (ls == -1) {
>>>perror("socket");
>>>return -1;
>>>}
>>> 
>>>optval = 1;
>>>if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR,
>>>   (void *), sizeof(optval)) == -1) {
>>>perror("setsockopt reuseaddr");
>>>  

Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Corey Minyard
On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> >
> > I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
> > sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
> > then I make a connection to it using ::1, the connection will drop after
> > 2.5 seconds with an ECONNRESET error.
> >
> > It only happens on SCTP, it doesn't have the issue if you connect to a
> > full IPv6 address instead of ::1, and it doesn't happen if you don't
> > set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > I tried on an ARM system and x86_64.
> >
> > I haven't dug into the kernel to see if I could find anything yet, but I
> > thought I would go ahead and report it.  I am attaching a reproducer.
> > Basically, compile the following code:
> The code only set IPV6_V6ONLY on server side, so the client side will
> still bind all the local ipv4 addresses (as you didn't call bind() to
> bind any specific addresses ). Then after the connection is created,
> the client will send HB on the v4 paths to the server. The server
> will abort the connection, as it can't support v4.
> 
> So you can work around it by either:
> 
>   - set IPV6_V6ONLY on client side.
> 
> or
> 
>   - bind to the specific v6 addresses on the client side.
> 
> I don't see RFC said something about this.
> So it may not be a good idea to change the current behaviour
> to not establish the connection in this case, which may cause regression.

Ok, I understand this.  It's a little strange, but I see why it works
this way.

Thanks,

-corey

> 
> >
> >   gcc -g -o sctptest -Wall sctptest.c
> >
> > and run it in one window as a server:
> >
> >   ./sctptest a
> >
> > (Pass in any option to be the server) and run the following in another
> > window as the client:
> >
> >   ./sctptest
> >
> > It disconnects after about 2.5 seconds.  If it works, it should just sit
> > there forever.
> >
> > -corey
> >
> >
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> >
> > static int
> > getaddr(const char *addr, const char *port, bool listen,
> > struct addrinfo **rai)
> > {
> > struct addrinfo *ai, hints;
> >
> > memset(, 0, sizeof(hints));
> > hints.ai_flags = AI_ADDRCONFIG;
> > if (listen)
> > hints.ai_flags |= AI_PASSIVE;
> > hints.ai_family = AF_UNSPEC;
> > hints.ai_socktype = SOCK_STREAM;
> > hints.ai_protocol = IPPROTO_SCTP;
> > if (getaddrinfo(addr, port, , )) {
> > perror("getaddrinfo");
> > return -1;
> > }
> >
> > *rai = ai;
> > return 0;
> > }
> >
> > static int
> > waitread(int s)
> > {
> > char data[1];
> > ssize_t rv;
> >
> > rv = read(s, data, sizeof(data));
> > if (rv == -1) {
> > perror("read");
> > return -1;
> > }
> > printf("Read %d bytes\n", (int) rv);
> > return 0;
> > }
> >
> > static int
> > do_server(void)
> > {
> > int err, ls, s, optval;
> > struct addrinfo *ai;
> >
> > printf("Server\n");
> >
> > err = getaddr("::", "3023", true, );
> > if (err)
> > return err;
> >
> > ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
> > if (ls == -1) {
> > perror("socket");
> > return -1;
> > }
> >
> > optval = 1;
> > if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR,
> >(void *), sizeof(optval)) == -1) {
> > perror("setsockopt reuseaddr");
> > return -1;
> > }
> >
> > /* Comment this out and it will work. */
> > if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, ,
> >sizeof(optval)) == -1) {
> > perror("setsockopt ipv6 only");
> > return -1;
> > }
> >
> > err = bind(ls, ai->ai_addr, ai->ai_addrlen);
> > if (err == -1) {
> > perror("bind");
> > return -1;
> > }
> >
> > err = listen(ls, 5);
> > if (err == -1) {
> > perror("

Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Michael Tuexen



> On 22. Jun 2020, at 14:01, Xin Long  wrote:
> 
> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
>> 
>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
>> then I make a connection to it using ::1, the connection will drop after
>> 2.5 seconds with an ECONNRESET error.
>> 
>> It only happens on SCTP, it doesn't have the issue if you connect to a
>> full IPv6 address instead of ::1, and it doesn't happen if you don't
>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
>> I tried on an ARM system and x86_64.
>> 
>> I haven't dug into the kernel to see if I could find anything yet, but I
>> thought I would go ahead and report it.  I am attaching a reproducer.
>> Basically, compile the following code:
> The code only set IPV6_V6ONLY on server side, so the client side will
> still bind all the local ipv4 addresses (as you didn't call bind() to
> bind any specific addresses ). Then after the connection is created,
Let's focus on the loopback addresses ::1 and 127.0.0.1.

So the server will only use ::1. The client will send an INIT from
::1 to ::1 and lists 127.0.0.1 and ::1. That is what I would expect.
Is that happening?

The server would respond with an INIT-ACK from ::1 to ::1 and would
not list any IP addresses. Especially not 127.0.0.1, since it is IPv6 only.

After the association has beed established, the client can't send
any IPv4 packet to the server, since the server did not announce
any. The server can't send any IPv4 packets since it is IPv6 only.

This is what I would expect and this scenario should just work.
What am I missing?

Best regards
Michael
> the client will send HB on the v4 paths to the server. The server
> will abort the connection, as it can't support v4.
> 
> So you can work around it by either:
> 
>  - set IPV6_V6ONLY on client side.
> 
> or
> 
>  - bind to the specific v6 addresses on the client side.
> 
> I don't see RFC said something about this.
> So it may not be a good idea to change the current behaviour
> to not establish the connection in this case, which may cause regression.
> 
>> 
>>  gcc -g -o sctptest -Wall sctptest.c
>> 
>> and run it in one window as a server:
>> 
>>  ./sctptest a
>> 
>> (Pass in any option to be the server) and run the following in another
>> window as the client:
>> 
>>  ./sctptest
>> 
>> It disconnects after about 2.5 seconds.  If it works, it should just sit
>> there forever.
>> 
>> -corey
>> 
>> 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> 
>> static int
>> getaddr(const char *addr, const char *port, bool listen,
>>struct addrinfo **rai)
>> {
>>struct addrinfo *ai, hints;
>> 
>>memset(, 0, sizeof(hints));
>>hints.ai_flags = AI_ADDRCONFIG;
>>if (listen)
>>hints.ai_flags |= AI_PASSIVE;
>>hints.ai_family = AF_UNSPEC;
>>hints.ai_socktype = SOCK_STREAM;
>>hints.ai_protocol = IPPROTO_SCTP;
>>if (getaddrinfo(addr, port, , )) {
>>perror("getaddrinfo");
>>return -1;
>>}
>> 
>>*rai = ai;
>>return 0;
>> }
>> 
>> static int
>> waitread(int s)
>> {
>>char data[1];
>>ssize_t rv;
>> 
>>rv = read(s, data, sizeof(data));
>>if (rv == -1) {
>>perror("read");
>>return -1;
>>}
>>printf("Read %d bytes\n", (int) rv);
>>return 0;
>> }
>> 
>> static int
>> do_server(void)
>> {
>>int err, ls, s, optval;
>>struct addrinfo *ai;
>> 
>>printf("Server\n");
>> 
>>err = getaddr("::", "3023", true, );
>>if (err)
>>return err;
>> 
>>ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
>>if (ls == -1) {
>>perror("socket");
>>return -1;
>>}
>> 
>>optval = 1;
>>if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR,
>>   (void *), sizeof(optval)) == -1) {
>>perror("setsockopt reuseaddr");
>>return -1;
>>}
>> 
>>/* Comment this out and it will work. */
>>if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, ,
>>   sizeof(opt

Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Xin Long
On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
>
> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
> then I make a connection to it using ::1, the connection will drop after
> 2.5 seconds with an ECONNRESET error.
>
> It only happens on SCTP, it doesn't have the issue if you connect to a
> full IPv6 address instead of ::1, and it doesn't happen if you don't
> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> I tried on an ARM system and x86_64.
>
> I haven't dug into the kernel to see if I could find anything yet, but I
> thought I would go ahead and report it.  I am attaching a reproducer.
> Basically, compile the following code:
The code only set IPV6_V6ONLY on server side, so the client side will
still bind all the local ipv4 addresses (as you didn't call bind() to
bind any specific addresses ). Then after the connection is created,
the client will send HB on the v4 paths to the server. The server
will abort the connection, as it can't support v4.

So you can work around it by either:

  - set IPV6_V6ONLY on client side.

or

  - bind to the specific v6 addresses on the client side.

I don't see RFC said something about this.
So it may not be a good idea to change the current behaviour
to not establish the connection in this case, which may cause regression.

>
>   gcc -g -o sctptest -Wall sctptest.c
>
> and run it in one window as a server:
>
>   ./sctptest a
>
> (Pass in any option to be the server) and run the following in another
> window as the client:
>
>   ./sctptest
>
> It disconnects after about 2.5 seconds.  If it works, it should just sit
> there forever.
>
> -corey
>
>
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
>
> static int
> getaddr(const char *addr, const char *port, bool listen,
> struct addrinfo **rai)
> {
> struct addrinfo *ai, hints;
>
> memset(, 0, sizeof(hints));
> hints.ai_flags = AI_ADDRCONFIG;
> if (listen)
> hints.ai_flags |= AI_PASSIVE;
> hints.ai_family = AF_UNSPEC;
> hints.ai_socktype = SOCK_STREAM;
> hints.ai_protocol = IPPROTO_SCTP;
> if (getaddrinfo(addr, port, , )) {
> perror("getaddrinfo");
> return -1;
> }
>
> *rai = ai;
> return 0;
> }
>
> static int
> waitread(int s)
> {
> char data[1];
> ssize_t rv;
>
> rv = read(s, data, sizeof(data));
> if (rv == -1) {
> perror("read");
> return -1;
> }
> printf("Read %d bytes\n", (int) rv);
> return 0;
> }
>
> static int
> do_server(void)
> {
> int err, ls, s, optval;
> struct addrinfo *ai;
>
> printf("Server\n");
>
> err = getaddr("::", "3023", true, );
> if (err)
> return err;
>
> ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
> if (ls == -1) {
> perror("socket");
> return -1;
> }
>
> optval = 1;
> if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR,
>(void *), sizeof(optval)) == -1) {
> perror("setsockopt reuseaddr");
> return -1;
> }
>
> /* Comment this out and it will work. */
> if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, ,
>sizeof(optval)) == -1) {
> perror("setsockopt ipv6 only");
> return -1;
> }
>
> err = bind(ls, ai->ai_addr, ai->ai_addrlen);
> if (err == -1) {
> perror("bind");
> return -1;
> }
>
> err = listen(ls, 5);
> if (err == -1) {
> perror("listen");
> return -1;
> }
>
> s = accept(ls, NULL, NULL);
> if (s == -1) {
> perror("accept");
> return -1;
> }
>
> close(ls);
>
> err = waitread(s);
> close(s);
> return err;
> }
>
> static int
> do_client(void)
> {
> int err, s;
> struct addrinfo *ai;
>
> printf("Client\n");
>
> err = getaddr("::1", "3023", false, );
> if (err)
> return err;
>
> s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
> if (s == -1) {
> perror("socket");
> return -1;
> }
>
> err = connect(s, ai->ai_addr, ai->ai_addrlen);
> if (err == -1) {
> perror("connect");
> return -1;
> }
>
> err = waitread(s);
> close(s);
> return err;
> }
>
> int
> main(int argc, char *argv[])
> {
> int err;
>
> if (argc > 1)
> err = do_server();
> else
> err = do_client();
> return !!err;
> }
>


Strange problem with SCTP+IPv6

2020-06-21 Thread Corey Minyard
I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
then I make a connection to it using ::1, the connection will drop after
2.5 seconds with an ECONNRESET error.

It only happens on SCTP, it doesn't have the issue if you connect to a
full IPv6 address instead of ::1, and it doesn't happen if you don't
set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
I tried on an ARM system and x86_64.

I haven't dug into the kernel to see if I could find anything yet, but I
thought I would go ahead and report it.  I am attaching a reproducer.
Basically, compile the following code:

  gcc -g -o sctptest -Wall sctptest.c

and run it in one window as a server:

  ./sctptest a

(Pass in any option to be the server) and run the following in another
window as the client:

  ./sctptest

It disconnects after about 2.5 seconds.  If it works, it should just sit
there forever.

-corey


#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

static int
getaddr(const char *addr, const char *port, bool listen,
struct addrinfo **rai)
{
struct addrinfo *ai, hints;

memset(, 0, sizeof(hints));
hints.ai_flags = AI_ADDRCONFIG;
if (listen)
hints.ai_flags |= AI_PASSIVE;
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
hints.ai_protocol = IPPROTO_SCTP;
if (getaddrinfo(addr, port, , )) {
perror("getaddrinfo");
return -1;
}

*rai = ai;
return 0;
}

static int
waitread(int s)
{
char data[1];
ssize_t rv;

rv = read(s, data, sizeof(data));
if (rv == -1) {
perror("read");
return -1;
}
printf("Read %d bytes\n", (int) rv);
return 0;
}

static int
do_server(void)
{
int err, ls, s, optval;
struct addrinfo *ai;

printf("Server\n");

err = getaddr("::", "3023", true, );
if (err)
return err;

ls = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
if (ls == -1) {
perror("socket");
return -1;
}

optval = 1;
if (setsockopt(ls, SOL_SOCKET, SO_REUSEADDR,
   (void *), sizeof(optval)) == -1) {
perror("setsockopt reuseaddr");
return -1;
}

/* Comment this out and it will work. */
if (setsockopt(ls, IPPROTO_IPV6, IPV6_V6ONLY, ,
   sizeof(optval)) == -1) {
perror("setsockopt ipv6 only");
return -1;
}

err = bind(ls, ai->ai_addr, ai->ai_addrlen);
if (err == -1) {
perror("bind");
return -1;
}

err = listen(ls, 5);
if (err == -1) {
perror("listen");
return -1;
}

s = accept(ls, NULL, NULL);
if (s == -1) {
perror("accept");
return -1;
}

close(ls);

err = waitread(s);
close(s);
return err;
}

static int
do_client(void)
{
int err, s;
struct addrinfo *ai;

printf("Client\n");

err = getaddr("::1", "3023", false, );
if (err)
return err;

s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
if (s == -1) {
perror("socket");
return -1;
}

err = connect(s, ai->ai_addr, ai->ai_addrlen);
if (err == -1) {
perror("connect");
return -1;
}

err = waitread(s);
close(s);
return err;
}

int
main(int argc, char *argv[])
{
int err;

if (argc > 1)
err = do_server();
else
err = do_client();
return !!err;
}



linux-next: strange set of patches in the y2038 tree

2019-09-11 Thread Stephen Rothwell
Hi Arnd,

You seem t have remerged a whole series of patches in your tree:

commits a55aa89aab90..ecc43067d9a5 are identical to commits
a55aa89aab90..846e9b109913 apart from the commit message for commit
e83dd16c24d8/654f7717ef51.  And you have both sets fo commits merged.

-- 
Cheers,
Stephen Rothwell


pgp9xpWQdmzrS.pgp
Description: OpenPGP digital signature


Re: [PATCH v2] fat: Add nobarrier to workaround the strange behavior of device

2019-07-02 Thread Andrew Morton
On Fri, 28 Jun 2019 23:32:06 +0900 OGAWA Hirofumi  
wrote:

> 
> v2:
> Just cleanup, changed the place of options under comment of fat.
> 
> ---

Please be careful with the "^---$" - it denotes "end of changelog", so
I ended up without a changelog!


> There was the report of strange behavior of device with recent
> blkdev_issue_flush() position change.

A Reported-by: would be nice, but not necessary.

> The following is simplified usbmon trace.
> 
>  4203   9.160230 host -> 1.25.1   USBMS 95 SCSI: Synchronize 
> Cache(10) LUN: 0x00 (LBA: 0x, Len: 0)
>  4206   9.164911   1.25.1 -> host USBMS 77 SCSI: Response LUN: 
> 0x00 (Synchronize Cache(10)) (Good)
>  4207   9.323927 host -> 1.25.1   USBMS 95 SCSI: Read(10) LUN: 
> 0x00 (LBA: 0x00279950, Len: 240)
>  4212   9.327138   1.25.1 -> host USBMS 77 SCSI: Response LUN: 
> 0x00 (Read(10)) (Good)
> 
> [...]
> 
>  7323  10.202167 host -> 1.25.1   USBMS 95 SCSI: Synchronize 
> Cache(10) LUN: 0x00 (LBA: 0x, Len: 0)
>  7326  10.432266   1.25.1 -> host USBMS 77 SCSI: Response LUN: 
> 0x00 (Synchronize Cache(10)) (Good)
>  7327  10.769092 host -> 1.25.1   USBMS 95 SCSI: Test Unit Ready 
> LUN: 0x00 
>  7330  10.769192   1.25.1 -> host USBMS 77 SCSI: Response LUN: 
> 0x00 (Test Unit Ready) (Good)
>  7335  12.849093 host -> 1.25.1   USBMS 95 SCSI: Test Unit Ready 
> LUN: 0x00 
>  7338  12.849206   1.25.1 -> host USBMS 77 SCSI: Response LUN: 
> 0x00 (Test Unit Ready) (Check Condition)
>  7339  12.849209 host -> 1.25.1   USBMS 95 SCSI: Request Sense 
> LUN: 0x00
>  
> If "Synchronize Cache" command issued then there is idle time, the
> device stop to process further commands, and behave as like no media.
> (it returns NOT_READY [MEDIUM NOT PRESENT] for SENSE command, and this
> happened on Kindle) [just a guess, the device is trying to detect the
> "safe-unplug" operation of Windows or such?]
> 
> To workaround those devices and provide flexibility, this adds
> "barrier"/"nobarrier" mount options to fat driver.

I think it would be helpful if the changelog were to at least describe
the longer-term plan which hch described.

> --- linux/fs/fat/fat.h~fat-nobarrier  2019-06-28 21:22:18.146191739 +0900
> +++ linux-hirofumi/fs/fat/fat.h   2019-06-28 23:26:04.881215721 +0900
> @@ -51,6 +51,7 @@ struct fat_mount_options {
>tz_set:1, /* Filesystem timestamps' offset set */
>rodir:1,  /* allow ATTR_RO for directory */
>discard:1,/* Issue discard requests on deletions */
> +  barrier:1,/* Issue FLUSH command */
>dos1xfloppy:1;/* Assume default BPB for DOS 1.x floppies */

Documentation/filesystems/vfat.txt should be updated to describe this
new option please.  And perhaps to mention that a device-level quirk should be
used in preference, if it is available.




Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device

2019-06-28 Thread OGAWA Hirofumi
Christoph Hellwig  writes:

> On Sat, Jun 29, 2019 at 12:03:46AM +0900, OGAWA Hirofumi wrote:
>> I see, sounds like good though. Does it work for all stable versions?
>> Can it disable only flush command without other effect? And it would be
>> better to be normal user controllable easily.
>
> The option was added in 2.6.17, so it's been around forever.  But
> no, it obviously is not user exposed as using it on a normal drive
> can lead to data loss.

I see. It sounds like good as long term solution though (if no effect
other than disabling flush command), it may need some monitor daemon and
detect the device, and apply user policy as root. (BTW, I meant,
workaround is normal user controllable with config by root or such)

I think, it may not be good as short term solution for especially stable
users. If there is no better short term solution, I would like to still
push this patch as short term workaround.

Thanks.
-- 
OGAWA Hirofumi 


Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device

2019-06-28 Thread Christoph Hellwig
On Sat, Jun 29, 2019 at 12:03:46AM +0900, OGAWA Hirofumi wrote:
> I see, sounds like good though. Does it work for all stable versions?
> Can it disable only flush command without other effect? And it would be
> better to be normal user controllable easily.

The option was added in 2.6.17, so it's been around forever.  But
no, it obviously is not user exposed as using it on a normal drive
can lead to data loss.


Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device

2019-06-28 Thread OGAWA Hirofumi
Christoph Hellwig  writes:

> On Fri, Jun 28, 2019 at 11:18:19PM +0900, OGAWA Hirofumi wrote:
>> To workaround those devices and provide flexibility, this adds
>> "barrier"/"nobarrier" mount options to fat driver.
>
> We have deprecated these rather misnamed options, and now instead allow
> tweaking the 'cache_type' attribute on the SCSI device.

I see, sounds like good though. Does it work for all stable versions?
Can it disable only flush command without other effect? And it would be
better to be normal user controllable easily.

This happened on normal user's calibre app that mount via udisks.  With
this option, user can workaround with /etc/fstab for immediate users.

> That being said if the device behave this buggy you should also report
> it to to the usb-storage and scsi maintainers so that we can add a
> quirk for it.

It might not be able to say as buggy simply. The device looks work if no
idle and not hit pattern of usage, so quirk can be overkill.

Anyway, I don't have the device, if you can get the device and
investigate, it can be fine.

Thanks.
-- 
OGAWA Hirofumi 


[PATCH v2] fat: Add nobarrier to workaround the strange behavior of device

2019-06-28 Thread OGAWA Hirofumi


v2:
Just cleanup, changed the place of options under comment of fat.

---

There was the report of strange behavior of device with recent
blkdev_issue_flush() position change.

The following is simplified usbmon trace.

 4203   9.160230 host -> 1.25.1   USBMS 95 SCSI: Synchronize 
Cache(10) LUN: 0x00 (LBA: 0x, Len: 0)
 4206   9.164911   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Synchronize Cache(10)) (Good)
 4207   9.323927 host -> 1.25.1   USBMS 95 SCSI: Read(10) LUN: 0x00 
(LBA: 0x00279950, Len: 240)
 4212   9.327138   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Read(10)) (Good)

[...]

 7323  10.202167 host -> 1.25.1   USBMS 95 SCSI: Synchronize 
Cache(10) LUN: 0x00 (LBA: 0x, Len: 0)
 7326  10.432266   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Synchronize Cache(10)) (Good)
 7327  10.769092 host -> 1.25.1   USBMS 95 SCSI: Test Unit Ready 
LUN: 0x00 
 7330  10.769192   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Test Unit Ready) (Good)
 7335  12.849093 host -> 1.25.1   USBMS 95 SCSI: Test Unit Ready 
LUN: 0x00 
 7338  12.849206   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Test Unit Ready) (Check Condition)
 7339  12.849209 host -> 1.25.1   USBMS 95 SCSI: Request Sense LUN: 
0x00
 
If "Synchronize Cache" command issued then there is idle time, the
device stop to process further commands, and behave as like no media.
(it returns NOT_READY [MEDIUM NOT PRESENT] for SENSE command, and this
happened on Kindle) [just a guess, the device is trying to detect the
"safe-unplug" operation of Windows or such?]

To workaround those devices and provide flexibility, this adds
"barrier"/"nobarrier" mount options to fat driver.

Cc: 
Signed-off-by: OGAWA Hirofumi 
---

 fs/fat/fat.h   |1 +
 fs/fat/file.c  |8 ++--
 fs/fat/inode.c |   22 +-
 3 files changed, 24 insertions(+), 7 deletions(-)

diff -puN fs/fat/fat.h~fat-nobarrier fs/fat/fat.h
--- linux/fs/fat/fat.h~fat-nobarrier2019-06-28 21:22:18.146191739 +0900
+++ linux-hirofumi/fs/fat/fat.h 2019-06-28 23:26:04.881215721 +0900
@@ -51,6 +51,7 @@ struct fat_mount_options {
 tz_set:1, /* Filesystem timestamps' offset set */
 rodir:1,  /* allow ATTR_RO for directory */
 discard:1,/* Issue discard requests on deletions */
+barrier:1,/* Issue FLUSH command */
 dos1xfloppy:1;/* Assume default BPB for DOS 1.x floppies */
 };
 
diff -puN fs/fat/file.c~fat-nobarrier fs/fat/file.c
--- linux/fs/fat/file.c~fat-nobarrier   2019-06-28 21:22:18.147191734 +0900
+++ linux-hirofumi/fs/fat/file.c2019-06-28 23:26:04.881215721 +0900
@@ -193,17 +193,21 @@ static int fat_file_release(struct inode
 int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
struct inode *inode = filp->f_mapping->host;
+   struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int err;
 
err = __generic_file_fsync(filp, start, end, datasync);
if (err)
return err;
 
-   err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+   err = sync_mapping_buffers(sbi->fat_inode->i_mapping);
if (err)
return err;
 
-   return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+   if (sbi->options.barrier)
+   err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+
+   return err;
 }
 
 
diff -puN fs/fat/inode.c~fat-nobarrier fs/fat/inode.c
--- linux/fs/fat/inode.c~fat-nobarrier  2019-06-28 21:22:18.148191730 +0900
+++ linux-hirofumi/fs/fat/inode.c   2019-06-28 23:26:28.029103863 +0900
@@ -1016,6 +1016,8 @@ static int fat_show_options(struct seq_f
seq_puts(m, ",nfs=stale_rw");
if (opts->discard)
seq_puts(m, ",discard");
+   if (!opts->barrier)
+   seq_puts(m, ",nobarrier");
if (opts->dos1xfloppy)
seq_puts(m, ",dos1xfloppy");
 
@@ -1031,8 +1033,9 @@ enum {
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
-   Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
-   Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
+   Opt_err_panic, Opt_err_ro, Opt_discard, Opt_barrier, Opt_nobarrier,
+   Opt_nfs, Opt_time_offset, Opt_nfs_stale_rw, Opt_nfs_nostale_ro,
+   Opt_err, Opt_dos1xfloppy,
 };
 
 static const match_table_t fat_tokens = {
@@ -1062,6 +1065,8 @@ static const matc

Re: [PATCH] fat: Add nobarrier to workaround the strange behavior of device

2019-06-28 Thread Christoph Hellwig
On Fri, Jun 28, 2019 at 11:18:19PM +0900, OGAWA Hirofumi wrote:
> To workaround those devices and provide flexibility, this adds
> "barrier"/"nobarrier" mount options to fat driver.

We have deprecated these rather misnamed options, and now instead allow
tweaking the 'cache_type' attribute on the SCSI device.

That being said if the device behave this buggy you should also report
it to to the usb-storage and scsi maintainers so that we can add a
quirk for it.


[PATCH] fat: Add nobarrier to workaround the strange behavior of device

2019-06-28 Thread OGAWA Hirofumi


There was the report of strange behavior of device with recent
blkdev_issue_flush() position change.

The following is simplified usbmon trace.

 4203   9.160230 host -> 1.25.1   USBMS 95 SCSI: Synchronize 
Cache(10) LUN: 0x00 (LBA: 0x, Len: 0)
 4206   9.164911   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Synchronize Cache(10)) (Good)
 4207   9.323927 host -> 1.25.1   USBMS 95 SCSI: Read(10) LUN: 0x00 
(LBA: 0x00279950, Len: 240)
 4212   9.327138   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Read(10)) (Good)

[...]

 7323  10.202167 host -> 1.25.1   USBMS 95 SCSI: Synchronize 
Cache(10) LUN: 0x00 (LBA: 0x, Len: 0)
 7326  10.432266   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Synchronize Cache(10)) (Good)
 7327  10.769092 host -> 1.25.1   USBMS 95 SCSI: Test Unit Ready 
LUN: 0x00 
 7330  10.769192   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Test Unit Ready) (Good)
 7335  12.849093 host -> 1.25.1   USBMS 95 SCSI: Test Unit Ready 
LUN: 0x00 
 7338  12.849206   1.25.1 -> host USBMS 77 SCSI: Response LUN: 0x00 
(Test Unit Ready) (Check Condition)
 7339  12.849209 host -> 1.25.1   USBMS 95 SCSI: Request Sense LUN: 
0x00
 
If "Synchronize Cache" command issued then there is idle time, the
device stop to process further commands, and behave as like no media.
(it returns NOT_READY [MEDIUM NOT PRESENT] for SENSE command, and this
happened on Kindle) [just a guess, the device is trying to detect the
"safe-unplug" operation of Windows or such?]

To workaround those devices and provide flexibility, this adds
"barrier"/"nobarrier" mount options to fat driver.

Cc: 
Signed-off-by: OGAWA Hirofumi 
---

 fs/fat/fat.h   |1 +
 fs/fat/file.c  |8 ++--
 fs/fat/inode.c |   16 ++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff -puN fs/fat/fat.h~fat-nobarrier fs/fat/fat.h
--- linux/fs/fat/fat.h~fat-nobarrier2019-06-28 21:22:18.146191739 +0900
+++ linux-hirofumi/fs/fat/fat.h 2019-06-28 21:59:11.693934616 +0900
@@ -51,6 +51,7 @@ struct fat_mount_options {
 tz_set:1, /* Filesystem timestamps' offset set */
 rodir:1,  /* allow ATTR_RO for directory */
 discard:1,/* Issue discard requests on deletions */
+barrier:1,/* Issue FLUSH command */
 dos1xfloppy:1;/* Assume default BPB for DOS 1.x floppies */
 };
 
diff -puN fs/fat/file.c~fat-nobarrier fs/fat/file.c
--- linux/fs/fat/file.c~fat-nobarrier   2019-06-28 21:22:18.147191734 +0900
+++ linux-hirofumi/fs/fat/file.c2019-06-28 21:59:11.693934616 +0900
@@ -193,17 +193,21 @@ static int fat_file_release(struct inode
 int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
struct inode *inode = filp->f_mapping->host;
+   struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int err;
 
err = __generic_file_fsync(filp, start, end, datasync);
if (err)
return err;
 
-   err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+   err = sync_mapping_buffers(sbi->fat_inode->i_mapping);
if (err)
return err;
 
-   return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+   if (sbi->options.barrier)
+   err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+
+   return err;
 }
 
 
diff -puN fs/fat/inode.c~fat-nobarrier fs/fat/inode.c
--- linux/fs/fat/inode.c~fat-nobarrier  2019-06-28 21:22:18.148191730 +0900
+++ linux-hirofumi/fs/fat/inode.c   2019-06-28 21:59:11.694934611 +0900
@@ -1016,6 +1016,8 @@ static int fat_show_options(struct seq_f
seq_puts(m, ",nfs=stale_rw");
if (opts->discard)
seq_puts(m, ",discard");
+   if (!opts->barrier)
+   seq_puts(m, ",nobarrier");
if (opts->dos1xfloppy)
seq_puts(m, ",dos1xfloppy");
 
@@ -1031,8 +1033,9 @@ enum {
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
-   Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
-   Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
+   Opt_err_panic, Opt_err_ro, Opt_discard, Opt_barrier, Opt_nobarrier,
+   Opt_nfs, Opt_time_offset, Opt_nfs_stale_rw, Opt_nfs_nostale_ro,
+   Opt_err, Opt_dos1xfloppy,
 };
 
 static const match_table_t fat_tokens = {
@@ -1062,6 +1065,8 @@ static const match_table_t fat_tokens =
{Opt_err_panic, "errors=panic"},
{

Re: Strange regression in hid_llogitech_dj (was: Re: Linux 5.2-rc4)

2019-06-12 Thread Jiri Kosina
On Wed, 12 Jun 2019, Rafael J. Wysocki wrote:

> > > hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe 
> > > it manually and that
> > > appears to be blocking (apparently indefinitely) until terminated with 
> > > ^C.  But then it turns
> > > out that hid-logitech-dj is there in the list of modules and it is in use 
> > > (by usbhid) and the
> > > mouse works.
> > 
> > My bad, I should've asked you to test with the complete 'for-5.2/fixes' 
> > branch which contains two reverts [1] [2] that should fix this issue as 
> > well.
> > 
> > [1] 
> > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=e0b7f9bc0246bc642d1de2ff3ff133730584c956
> > [2] 
> > https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=f9482dabfd1686987cc6044e06ae0e4c05915518
> 
> Yes, with the two reverts applied in addition to the fix, it all works as 
> expected.

Rafael, thanks a lot for testing. I am sending the pile to Linus today or 
tomorrow latest.

-- 
Jiri Kosina
SUSE Labs



Re: Strange regression in hid_llogitech_dj (was: Re: Linux 5.2-rc4)

2019-06-12 Thread Rafael J. Wysocki
On Wednesday, June 12, 2019 10:31:45 AM CEST Jiri Kosina wrote:
> On Wed, 12 Jun 2019, Rafael J. Wysocki wrote:
> 
> > It kind of helps, but there is a catch.
> > 
> > hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe it 
> > manually and that
> > appears to be blocking (apparently indefinitely) until terminated with ^C.  
> > But then it turns
> > out that hid-logitech-dj is there in the list of modules and it is in use 
> > (by usbhid) and the
> > mouse works.
> 
> My bad, I should've asked you to test with the complete 'for-5.2/fixes' 
> branch which contains two reverts [1] [2] that should fix this issue as 
> well.
> 
> [1] 
> https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=e0b7f9bc0246bc642d1de2ff3ff133730584c956
> [2] 
> https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=f9482dabfd1686987cc6044e06ae0e4c05915518

Yes, with the two reverts applied in addition to the fix, it all works as 
expected.

Thanks!





Re: Strange regression in hid_llogitech_dj (was: Re: Linux 5.2-rc4)

2019-06-12 Thread Jiri Kosina
On Wed, 12 Jun 2019, Rafael J. Wysocki wrote:

> It kind of helps, but there is a catch.
> 
> hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe it 
> manually and that
> appears to be blocking (apparently indefinitely) until terminated with ^C.  
> But then it turns
> out that hid-logitech-dj is there in the list of modules and it is in use (by 
> usbhid) and the
> mouse works.

My bad, I should've asked you to test with the complete 'for-5.2/fixes' 
branch which contains two reverts [1] [2] that should fix this issue as 
well.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=e0b7f9bc0246bc642d1de2ff3ff133730584c956
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=f9482dabfd1686987cc6044e06ae0e4c05915518

-- 
Jiri Kosina
SUSE Labs



Re: Strange regression in hid_llogitech_dj (was: Re: Linux 5.2-rc4)

2019-06-12 Thread Rafael J. Wysocki
On Wednesday, June 12, 2019 12:02:21 AM CEST Jiri Kosina wrote:
> On Tue, 11 Jun 2019, Rafael J. Wysocki wrote:
> 
> > I noticed that the cordless mouse used by me with one of the machines here
> > stopped to work in 5.2-rc (up to and including the -rc4).
> > 
> > Bisection turned up commit 74808f9115ce ("HID: logitech-dj: add support for 
> > non
> > unifying receivers").
> > 
> > Of course, that commit does not revert cleanly from 5.2-rc4, but I have 
> > reverted
> > the changes made by it in hid/hid-ids.h and I took the version of 
> > hid/hid-logitech-dj.c
> > from commit b6aeeddef68d ("HID: logitech-dj: add 
> > logi_dj_recv_queue_unknown_work
> > helper"), which is the parent of commit 74808f9115ce, and that made the 
> > mouse
> > work again for me.
> > 
> > Here's the output of "dmesg | grep -i logitech" from 5.2-rc4 with the above 
> > changes:
> > 
> > [4.288905] usb 1-2: Manufacturer: Logitech
> > [5.444621] input: Logitech USB Receiver as 
> > /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.0/0003:046D:C52F.0002/input/input23
> > [5.446960] hid-generic 0003:046D:C52F.0002: input,hidraw1: USB HID 
> > v1.11 Mouse [Logitech USB Receiver] on usb-:00:14.0-2/input0
> > [5.451265] input: Logitech USB Receiver Consumer Control as 
> > /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.1/0003:046D:C52F.0003/input/input24
> > [5.507545] hid-generic 0003:046D:C52F.0003: input,hiddev96,hidraw2: USB 
> > HID v1.11 Device [Logitech USB Receiver] on usb-:00:14.0-2/input1
> 
> Hi Rafael,
> 
> 0x046d/0xc52f is known to have issues in 5.2-rcX. There is a patch queued 
> [1] that is believed to fix all this; my plan is to send it to Linus in 
> the coming 1-2 days. If you could report whether it fixes the issues 
> you've been seeing yourself as well, it'd be helpful.
> 
> Thanks.
> 
> [1] 
> https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=3ed224e273ac5880eeab4c3043a6b06b0478dd56

It kind of helps, but there is a catch.

hid-logitech-dj is not loaded after a fresh boot, so I need to modprobe it 
manually and that
appears to be blocking (apparently indefinitely) until terminated with ^C.  But 
then it turns
out that hid-logitech-dj is there in the list of modules and it is in use (by 
usbhid) and the
mouse works.

I guess I need to update the mkinitrd configuration, but even so that is not 
exactly
straightforward IMO. :-)

Cheers!





Re: Strange regression in hid_llogitech_dj (was: Re: Linux 5.2-rc4)

2019-06-11 Thread Jiri Kosina
On Tue, 11 Jun 2019, Rafael J. Wysocki wrote:

> I noticed that the cordless mouse used by me with one of the machines here
> stopped to work in 5.2-rc (up to and including the -rc4).
> 
> Bisection turned up commit 74808f9115ce ("HID: logitech-dj: add support for 
> non
> unifying receivers").
> 
> Of course, that commit does not revert cleanly from 5.2-rc4, but I have 
> reverted
> the changes made by it in hid/hid-ids.h and I took the version of 
> hid/hid-logitech-dj.c
> from commit b6aeeddef68d ("HID: logitech-dj: add 
> logi_dj_recv_queue_unknown_work
> helper"), which is the parent of commit 74808f9115ce, and that made the mouse
> work again for me.
> 
> Here's the output of "dmesg | grep -i logitech" from 5.2-rc4 with the above 
> changes:
> 
> [4.288905] usb 1-2: Manufacturer: Logitech
> [5.444621] input: Logitech USB Receiver as 
> /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.0/0003:046D:C52F.0002/input/input23
> [5.446960] hid-generic 0003:046D:C52F.0002: input,hidraw1: USB HID v1.11 
> Mouse [Logitech USB Receiver] on usb-:00:14.0-2/input0
> [5.451265] input: Logitech USB Receiver Consumer Control as 
> /devices/pci:00/:00:14.0/usb1/1-2/1-2:1.1/0003:046D:C52F.0003/input/input24
> [5.507545] hid-generic 0003:046D:C52F.0003: input,hiddev96,hidraw2: USB 
> HID v1.11 Device [Logitech USB Receiver] on usb-:00:14.0-2/input1

Hi Rafael,

0x046d/0xc52f is known to have issues in 5.2-rcX. There is a patch queued 
[1] that is believed to fix all this; my plan is to send it to Linus in 
the coming 1-2 days. If you could report whether it fixes the issues 
you've been seeing yourself as well, it'd be helpful.

Thanks.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/hid/hid.git/commit/?h=for-5.2/fixes=3ed224e273ac5880eeab4c3043a6b06b0478dd56

-- 
Jiri Kosina
SUSE Labs



Strange regression in hid_llogitech_dj (was: Re: Linux 5.2-rc4)

2019-06-11 Thread Rafael J. Wysocki
On Sunday, June 9, 2019 5:46:48 AM CEST Linus Torvalds wrote:
> No, I'm not confused, and I haven't lost track of what day it is, I do
> actually know that it's still Saturday here, not Sunday, and I'm just
> doing rc4 a bit early because I'll be on an airplane during my normal
> release time. And while I've done releases on airports and airplanes
> before, I looked at my empty queue of pull requests and went "let's
> just do it now".
> 
> We've had a fairly calm release so far, and on the whole that seems to
> hold. rc4 isn't smaller than rc3 was (it's a bit bigger), but rc3 was
> fairly small, so the size increase isn't all that worrisome. I do hope
> that we'll start actually shrinking now, though.

I noticed that the cordless mouse used by me with one of the machines here
stopped to work in 5.2-rc (up to and including the -rc4).

Bisection turned up commit 74808f9115ce ("HID: logitech-dj: add support for non
unifying receivers").

Of course, that commit does not revert cleanly from 5.2-rc4, but I have reverted
the changes made by it in hid/hid-ids.h and I took the version of 
hid/hid-logitech-dj.c
from commit b6aeeddef68d ("HID: logitech-dj: add logi_dj_recv_queue_unknown_work
helper"), which is the parent of commit 74808f9115ce, and that made the mouse
work again for me.

Here's the output of "dmesg | grep -i logitech" from 5.2-rc4 with the above 
changes:

[4.288905] usb 1-2: Manufacturer: Logitech
[5.444621] input: Logitech USB Receiver as 
/devices/pci:00/:00:14.0/usb1/1-2/1-2:1.0/0003:046D:C52F.0002/input/input23
[5.446960] hid-generic 0003:046D:C52F.0002: input,hidraw1: USB HID v1.11 
Mouse [Logitech USB Receiver] on usb-:00:14.0-2/input0
[5.451265] input: Logitech USB Receiver Consumer Control as 
/devices/pci:00/:00:14.0/usb1/1-2/1-2:1.1/0003:046D:C52F.0003/input/input24
[5.507545] hid-generic 0003:046D:C52F.0003: input,hiddev96,hidraw2: USB HID 
v1.11 Device [Logitech USB Receiver] on usb-:00:14.0-2/input1

Please let me know what you need to diagnose this.

Thanks,
Rafael






Re: Strange issues with epoll since 5.0

2019-05-02 Thread Davidlohr Bueso

On Thu, 02 May 2019, Deepa Dinamani wrote:


Reported-by: Omar Kilani 


Do we actually know if this was the issue Omar was hitting?

Thanks,
Davidlohr


Re: Strange issues with epoll since 5.0

2019-05-02 Thread Eric Wong
Deepa Dinamani  wrote:
> Eric,
> Can you please help test this?

Nope, that was _really_ badly whitespace-damaged.
(C'mon, it's not like you're new to this)


Re: Strange issues with epoll since 5.0

2019-05-02 Thread Deepa Dinamani
Eric,
Can you please help test this?
If this solves your problem, I can post the fix.
Thanks,
- Deepa

-8<---

Subject: [PATCH] signal: Adjust error codes according to restore_user_sigmask()

For all the syscalls that receive a sigmask from the userland,
the user sigmask is to be in effect through the syscall execution.
At the end of syscall, sigmask of the current process is restored
to what it was before the switch over to user sigmask.
But, for this to be true in practice, the sigmask should be restored
only at the the point we change the saved_sigmask. Anything before
that loses signals. And, anything after is just pointless as the
signal is already lost by restoring the sigmask.

The issue was detected because of a regression caused by 854a6ed56839a.
The patch moved the signal_pending() check closer to restoring of the
user sigmask. But, it failed to update the error code accordingly.

Detailed issue discussion permalink:
https://lore.kernel.org/linux-fsdevel/20190427093319.sgicqik2oqkez3wk@dcvr/

Note that the patch returns interrupted errors (EINTR, ERESTARTNOHAND, etc)
only when there is no other error. If there is a signal and an error like
EINVAL, the syscalls return -EINVAL rather than the interrupted error codes.

Reported-by: Omar Kilani 
Reported-by: Eric Wong 
Fixes: 854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
restore_user_sigmask()")
Signed-off-by: Deepa Dinamani 
---
 fs/aio.c   | 24 
 fs/eventpoll.c | 14 ++
 fs/io_uring.c  |  9 ++---
 fs/select.c| 37 +
 include/linux/signal.h |  2 +-
 kernel/signal.c|  8 +---
 6 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 38b741aef0bf..7de2f7573d55 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2133,7 +2133,7 @@ SYSCALL_DEFINE6(io_pgetevents,
 struct __aio_sigsetksig = { NULL, };
 sigset_tksigmask, sigsaved;
 struct timespec64ts;
-int ret;
+int ret, signal_detected;

 if (timeout && unlikely(get_timespec64(, timeout)))
 return -EFAULT;
@@ -2146,8 +2146,8 @@ SYSCALL_DEFINE6(io_pgetevents,
 return ret;

 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-restore_user_sigmask(ksig.sigmask, );
-if (signal_pending(current) && !ret)
+signal_detected = restore_user_sigmask(ksig.sigmask, );
+if (signal_detected && !ret)
 ret = -ERESTARTNOHAND;

 return ret;
@@ -2166,7 +2166,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 struct __aio_sigsetksig = { NULL, };
 sigset_tksigmask, sigsaved;
 struct timespec64ts;
-int ret;
+int ret, signal_detected;

 if (timeout && unlikely(get_old_timespec32(, timeout)))
 return -EFAULT;
@@ -2180,8 +2180,8 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
 return ret;

 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-restore_user_sigmask(ksig.sigmask, );
-if (signal_pending(current) && !ret)
+signal_detected = restore_user_sigmask(ksig.sigmask, );
+if (signal_detected && !ret)
 ret = -ERESTARTNOHAND;

 return ret;
@@ -2231,7 +2231,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 struct __compat_aio_sigset ksig = { NULL, };
 sigset_t ksigmask, sigsaved;
 struct timespec64 t;
-int ret;
+int ret, signal_detected;

 if (timeout && get_old_timespec32(, timeout))
 return -EFAULT;
@@ -2244,8 +2244,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 return ret;

 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-restore_user_sigmask(ksig.sigmask, );
-if (signal_pending(current) && !ret)
+signal_detected = restore_user_sigmask(ksig.sigmask, );
+if (signal_detected && !ret)
 ret = -ERESTARTNOHAND;

 return ret;
@@ -2264,7 +2264,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 struct __compat_aio_sigset ksig = { NULL, };
 sigset_t ksigmask, sigsaved;
 struct timespec64 t;
-int ret;
+int ret, signal_detected;

 if (timeout && get_timespec64(, timeout))
 return -EFAULT;
@@ -2277,8 +2277,8 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 return ret;

 ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-restore_user_sigmask(ksig.sigmask, );
-if (signal_pending(current) && !ret)
+signal_detected = restore_user_sigmask(ksig.sigmask, );
+if (signal_detected && !ret)
 ret = -ERESTARTNOHAND;

 return ret;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a0e98d87fcc..fe5a0724b417 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
epoll_event __user *, events,
 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
 size_t, sigsetsize)
 {
-int error;
+int error, signal_detected;
 sigset_t 

Re: Strange issues with epoll since 5.0

2019-05-01 Thread Deepa Dinamani
On Wed, May 1, 2019 at 1:48 PM Eric Wong  wrote:
>
> Deepa Dinamani  wrote:
> > So here is my analysis:
>
> 
>
> > So the 854a6ed56839a40f6 seems to be better than the original code in
> > that it detects the signal.
>
> OTOH, does matter to anybody that a signal is detected slightly
> sooner than it would've been, otherwise?

The original code drops the signal altogether. This is because it
overwrites the current's sigmask with the provided
one(set_current_blocked()). If a signal bit was set, it is lost
forever. It does not detect it sooner. The check for pending signal is
sooner and not just before the syscall returns.
This is what the patch in discussion does: check for signals just
before returning.

>
> > But, the problem is that it doesn't
> > communicate it to the userspace.
>
> Yup, that's a big problem :)
>
> > So a patch like below solves the problem. This is incomplete. I'll
> > verify and send you a proper fix you can test soon. This is just for
> > the sake of discussion:
> >
> > diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> > index 4a0e98d87fcc..63a387329c3d 100644
> > --- a/fs/eventpoll.c
> > +++ b/fs/eventpoll.c
> > @@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
> > epoll_event __user *, events,
> > int, maxevents, int, timeout, const sigset_t __user *, 
> > sigmask,
> > size_t, sigsetsize)
> >  {
> > -   int error;
> > +   int error, signal_detected;
> > sigset_t ksigmask, sigsaved;
> >
> > /*
> > @@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
> > epoll_event __user *, events,
> >
> > error = do_epoll_wait(epfd, events, maxevents, timeout);
> >
> > -   restore_user_sigmask(sigmask, );
> > +   signal_detected = restore_user_sigmask(sigmask, );
> > +
> > +   if (signal_detected && !error)
> > +   return -EITNR;
> >
> > return error;
>
> Looks like a reasonable API.
>
> > @@ -2862,7 +2862,7 @@ void restore_user_sigmask(const void __user
> > *usigmask, sigset_t *sigsaved)
> > if (signal_pending(current)) {
> > current->saved_sigmask = *sigsaved;
> > set_restore_sigmask();
> > -   return;
> > +   return 0;
>
> Shouldn't that "return 1" if a signal is pending?

Yep, I meant this to be 1.

-Deepa


Re: Strange issues with epoll since 5.0

2019-05-01 Thread Eric Wong
Deepa Dinamani  wrote:
> So here is my analysis:



> So the 854a6ed56839a40f6 seems to be better than the original code in
> that it detects the signal.

OTOH, does matter to anybody that a signal is detected slightly
sooner than it would've been, otherwise?

> But, the problem is that it doesn't
> communicate it to the userspace.

Yup, that's a big problem :)
 
> So a patch like below solves the problem. This is incomplete. I'll
> verify and send you a proper fix you can test soon. This is just for
> the sake of discussion:
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 4a0e98d87fcc..63a387329c3d 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
> epoll_event __user *, events,
> int, maxevents, int, timeout, const sigset_t __user *, 
> sigmask,
> size_t, sigsetsize)
>  {
> -   int error;
> +   int error, signal_detected;
> sigset_t ksigmask, sigsaved;
> 
> /*
> @@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
> epoll_event __user *, events,
> 
> error = do_epoll_wait(epfd, events, maxevents, timeout);
> 
> -   restore_user_sigmask(sigmask, );
> +   signal_detected = restore_user_sigmask(sigmask, );
> +
> +   if (signal_detected && !error)
> +   return -EITNR;
> 
> return error;

Looks like a reasonable API.

> @@ -2862,7 +2862,7 @@ void restore_user_sigmask(const void __user
> *usigmask, sigset_t *sigsaved)
> if (signal_pending(current)) {
> current->saved_sigmask = *sigsaved;
> set_restore_sigmask();
> -   return;
> +   return 0;

Shouldn't that "return 1" if a signal is pending?


Re: Strange issues with epoll since 5.0

2019-05-01 Thread Deepa Dinamani
Thanks for trying the fix.

So here is my analysis:

Let's start with epoll_pwait:

ep_poll() is what checks for signal_pending() and is responsible for
setting errno to -EINTR when there is a signal.

So if a signal is received after ep_poll(), it is never noticed by the
syscall during execution.

Moreover, the original code before
854a6ed56839a40f6b5d02a2962f48841482eec4 ("signal: Add
restore_user_sigmask()"), had the following call flow:

 error = do_epoll_wait(epfd, events, maxevents, timeout);

 Here error = 0 if the signal is received after ep_poll().

-   /*
-* If we changed the signal mask, we need to restore the original one.
-* In case we've got a signal while waiting, we do not restore the
-* signal mask yet, and we allow do_signal() to deliver the signal on
-* the way back to userspace, before the signal mask is restored.
-*/
-   if (sigmask) {
-   if (error == -EINTR) {
-   memcpy(>saved_sigmask, ,
-  sizeof(sigsaved));
-   set_restore_sigmask();
-   } else

 Execution reaches this else statement and the sigmask is restored
directly, ignoring the newly generated signal. The signal is never
handled.

-   set_current_blocked();
-   }

In the current execution flow:

error = do_epoll_wait(epfd, events, maxevents, timeout);

 error is still 0 as ep_poll() did not detect the signal.

restore_user_sigmask(sigmask, , error == -EITNR);

void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
{

if (!usigmask)
return;
/*
 * When signals are pending, do not restore them here.
 * Restoring sigmask here can lead to delivering signals that the above
 * syscalls are intended to block because of the sigmask passed in.
 */
if (signal_pending(current)) {

 execution path reaches here and do_signal() actually delivers the
signal to userspace. But the errno is not set. So the userspace fails
to notice it.

current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
}

/*
 * This is needed because the fast syscall return path does not restore
 * saved_sigmask when signals are not pending.
 */
set_current_blocked(sigsaved);
}

For other syscalls in the same commit:

sys_io_pgetevents() does not seem to have this problem as we are still
checking signal_pending() here.
sys_pselect6() seems to have a similar problem. The changes to
sys_pselect6() also impact sys_select() as the changes are in the
common code path.

So the 854a6ed56839a40f6 seems to be better than the original code in
that it detects the signal. But, the problem is that it doesn't
communicate it to the userspace.

So a patch like below solves the problem. This is incomplete. I'll
verify and send you a proper fix you can test soon. This is just for
the sake of discussion:

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a0e98d87fcc..63a387329c3d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2317,7 +2317,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
 {
-   int error;
+   int error, signal_detected;
sigset_t ksigmask, sigsaved;

/*
@@ -2330,7 +2330,10 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
epoll_event __user *, events,

error = do_epoll_wait(epfd, events, maxevents, timeout);

-   restore_user_sigmask(sigmask, );
+   signal_detected = restore_user_sigmask(sigmask, );
+
+   if (signal_detected && !error)
+   return -EITNR;

return error;
 }
@@ -2342,7 +2345,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
const compat_sigset_t __user *, sigmask,
compat_size_t, sigsetsize)
 {
-   long err;
+   long err, signal_detected;
sigset_t ksigmask, sigsaved;

/*
@@ -2355,7 +2358,10 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,

err = do_epoll_wait(epfd, events, maxevents, timeout);

-   restore_user_sigmask(sigmask, );
+   signal_detected = restore_user_sigmask(sigmask, );
+
+   if (signal_detected && !err)
+   return -EITNR;

return err;
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 3a9e41197d46..c76ab2a52ebf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2849,11 +2849,11 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
  * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
  * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
  */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+int restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
 {

Re: Strange issues with epoll since 5.0

2019-05-01 Thread Eric Wong
Eric Wong  wrote:
> (didn't test AIO, but everything else seems good)

"seems" != "is"

Now that I understand the fix for epoll, the fs/select.c changes
would hit the same problem and not return -EINTR when it should.

I'll let you guys decide how to fix this, but there's definitely
a problem when "(errno == EINTR)" comparisons in userspace
stop working.


Re: Strange issues with epoll since 5.0

2019-04-30 Thread Eric Wong
Eric Wong  wrote:
> Deepa Dinamani  wrote:
> > I'm not sure what the hang in the userspace is about. Is it because
> > the syscall did not return an error or the particular signal was
> > blocked etc.
> 
> Uh, ok; that's less comforting.

Nevermind, I think I understand everything, now.  epoll_pwait
never set errno without our patch.

cmogstored does this in notify.c:

/* wait_intr calls epoll_pwait: */
mfd = mog_idleq_wait_intr(mog_notify_queue, timeout);
if (mfd)
notify_queue_step(mfd);
else if (errno == EINTR) /* EINTR fails to be noticed */
note_run();


Re: Strange issues with epoll since 5.0

2019-04-30 Thread Eric Wong
Deepa Dinamani  wrote:
> I was also not able to reproduce this.
> Arnd and I were talking about this today morning. Here is something
> Arnd noticed:
> 
> If there was a signal after do_epoll_wait(), we never were not
> entering the if (err = -EINTR) at all before.

I'm not sure which `if' statement you're talking about, but ok...

> But, now we do.
> We could try with the below patch:

Wasn't close to applying or being buildable, but I put a
working version together (below).

epoll_pwait wakes up as expected, now :>

> If this works that means we know what is busted.

OK, good to know...

> I'm not sure what the hang in the userspace is about. Is it because
> the syscall did not return an error or the particular signal was
> blocked etc.

Uh, ok; that's less comforting.

> There are also a few timing differences also. But, can we try this first?

Anyways, the following patch works and builds cleanly for me
(didn't test AIO, but everything else seems good)

Thanks!

-8<---
Subject: [PATCH] test fix from Deepa

TBD
---
 fs/aio.c   |  8 
 fs/eventpoll.c |  4 ++--
 fs/select.c| 12 ++--
 include/linux/signal.h |  2 +-
 kernel/signal.c|  6 --
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 3d9669d011b9..d54513ca11b6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2146,7 +2146,7 @@ SYSCALL_DEFINE6(io_pgetevents,
return ret;
 
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-   restore_user_sigmask(ksig.sigmask, );
+   restore_user_sigmask(ksig.sigmask, , -1);
if (signal_pending(current) && !ret)
ret = -ERESTARTNOHAND;
 
@@ -2180,7 +2180,7 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return ret;
 
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-   restore_user_sigmask(ksig.sigmask, );
+   restore_user_sigmask(ksig.sigmask, , -1);
if (signal_pending(current) && !ret)
ret = -ERESTARTNOHAND;
 
@@ -2244,7 +2244,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
return ret;
 
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-   restore_user_sigmask(ksig.sigmask, );
+   restore_user_sigmask(ksig.sigmask, , -1);
if (signal_pending(current) && !ret)
ret = -ERESTARTNOHAND;
 
@@ -2277,7 +2277,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
return ret;
 
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ?  : NULL);
-   restore_user_sigmask(ksig.sigmask, );
+   restore_user_sigmask(ksig.sigmask, , -1);
if (signal_pending(current) && !ret)
ret = -ERESTARTNOHAND;
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..bd84ec54a8fb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2247,7 +2247,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct 
epoll_event __user *, events,
 
error = do_epoll_wait(epfd, events, maxevents, timeout);
 
-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , error == -EINTR);
 
return error;
 }
@@ -2272,7 +2272,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 
err = do_epoll_wait(epfd, events, maxevents, timeout);
 
-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , err == -EINTR);
 
return err;
 }
diff --git a/fs/select.c b/fs/select.c
index d0f35dbc0e8f..04720e5856ed 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -760,7 +760,7 @@ static long do_pselect(int n, fd_set __user *inp, fd_set 
__user *outp,
ret = core_sys_select(n, inp, outp, exp, to);
ret = poll_select_copy_remaining(_time, tsp, type, ret);
 
-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , -1);
 
return ret;
 }
@@ -1106,7 +1106,7 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, 
unsigned int, nfds,
 
ret = do_sys_poll(ufds, nfds, to);
 
-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , -1);
 
/* We can restart this syscall, usually */
if (ret == -EINTR)
@@ -1142,7 +1142,7 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, 
ufds, unsigned int, nfds,
 
ret = do_sys_poll(ufds, nfds, to);
 
-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , -1);
 
/* We can restart this syscall, usually */
if (ret == -EINTR)
@@ -1352,7 +1352,7 @@ static long do_compat_pselect(int n, compat_ulong_t 
__user *inp,
ret = compat_core_sys_select(n, inp, outp, exp, to);
ret = poll_select_copy_remaining(_time, tsp, type, ret);
 
-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , -1);
 
return ret;
 }
@@ -1425,7 +1425,7 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, 
ufds,
 
ret = do_sys_poll(ufds, nfds, to);
 
-   

Re: Strange issues with epoll since 5.0

2019-04-30 Thread Deepa Dinamani
I was also not able to reproduce this.
Arnd and I were talking about this today morning. Here is something
Arnd noticed:

If there was a signal after do_epoll_wait(), we never were not
entering the if (err = -EINTR) at all before. But, now we do.
We could try with the below patch:

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4a0e98d87fcc..5cfb800cf598 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2330,7 +2330,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct
epoll_event __user *, events,

error = do_epoll_wait(epfd, events, maxevents, timeout);

-   restore_user_sigmask(sigmask, );
+   restore_user_sigmask(sigmask, , error == -EITNR);

return error;
 }

diff --git a/kernel/signal.c b/kernel/signal.c
index 3a9e41197d46..4a8f96f5c1c0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2849,7 +2849,7 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
  * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
  * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
  */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+void restore_user_sigmask(const void __user *usigmask, sigset_t
*sigsaved, int sig_pending)
 {

if (!usigmask)
@@ -2859,7 +2859,7 @@ void restore_user_sigmask(const void __user
*usigmask, sigset_t *sigsaved)
 * Restoring sigmask here can lead to delivering signals that the above
 * syscalls are intended to block because of the sigmask passed in.
 */
-   if (signal_pending(current)) {
+   if (sig_pending) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();


If this works that means we know what is busted.
I'm not sure what the hang in the userspace is about. Is it because
the syscall did not return an error or the particular signal was
blocked etc.

There are also a few timing differences also. But, can we try this first?

-Deepa


Re: Strange issues with epoll since 5.0

2019-04-29 Thread Eric Wong
Davidlohr Bueso  wrote:
> On Sun, 28 Apr 2019, Eric Wong wrote:
> 
> > Just running one test won't trigger since it needs a busy
> > machine; but:
> > 
> > make test/mgmt_auto_adjust.log
> > (and "rm make test/mgmt_auto_adjust.log" if you want to rerun)
> 
> fyi no luck reproducing on both either a large (280) or small (4 cpu)
> machine, I'm running it along with a kernel build overcommiting cpus x2.

Thanks for taking a look.

> Is there any workload in particular you are using to stress the box?

Just the "make -j$N check" I mentioned up-thread which spawns a
lot of tests in parallel.  For the small 4 CPU machine,
-j32 or -j16 ought to be reproducing the problem.

I'll try to set aside some time this week to get familiar with
kernel internals w.r.t. signal handling.


Re: Strange issues with epoll since 5.0

2019-04-29 Thread Davidlohr Bueso

On Sun, 28 Apr 2019, Eric Wong wrote:


Just running one test won't trigger since it needs a busy
machine; but:

make test/mgmt_auto_adjust.log
(and "rm make test/mgmt_auto_adjust.log" if you want to rerun)


fyi no luck reproducing on both either a large (280) or small (4 cpu)
machine, I'm running it along with a kernel build overcommiting cpus x2.

Is there any workload in particular you are using to stress the box?

Thanks,
Davidlohr


Re: Strange issues with epoll since 5.0

2019-04-27 Thread Eric Wong
Deepa Dinamani  wrote:
> I tried to replicate the failure on qemu.
> I do not see the failure with N=32.

> Does it work for N < 32?

Depends on number of cores you have; I have 4 cores, 8 threads
with HT; so I needed to have a lot of load on the machine to get
it to fail (it takes about 1 minute).

cmogstored is intended to run on machines that were already
saturated in CPU/memory from other processes, but not HDD I/O
bandwidth.

> Does any other signal work?

SIGCONT does, via:

   perl -i -p -e 's/SIGURG/SIGCONT/g' `git ls-files`

> Are there any other architectures that fail?

I don't have other arches (well, 32-bit x86, but I've never
really tried cmogstored on that, even).

> Could you help me figure out how to run just the one test that is failing?

Just running one test won't trigger since it needs a busy
machine; but:

make test/mgmt_auto_adjust.log
(and "rm make test/mgmt_auto_adjust.log" if you want to rerun)

Thanks for looking into this.  Fwiw, cmogstored uses epoll in
strange and uncommon ways which has led to kernel bugfixes
in the past.


Re: Strange issues with epoll since 5.0

2019-04-27 Thread Deepa Dinamani
I tried to replicate the failure on qemu.
I do not see the failure with N=32.

Does it work for N < 32?
Does any other signal work?
Are there any other architectures that fail?

Could you help me figure out how to run just the one test that is failing?

-Deepa


Re: Strange issues with epoll since 5.0

2019-04-27 Thread Eric Wong
Eric Wong  wrote:
> Omar Kilani  wrote:
> > Hi there,
> > 
> > I’m still trying to piece together a reproducible test that triggers
> > this, but I wanted to post in case someone goes “hmmm... change X
> > might have done this”.
> 
> Maybe Davidlohr knows, since he's responsible for most of the
> epoll changes in 5.0.

Well, I am not sure if I am hitting the same problem Omar is
hitting.  But I did find an epoll_pwait regression in 5.0:

epoll_pwait seems unresponsive to SIGURG in my
heavily-parallelized use case[1] on 5.0.9.  I bisected it to
commit 854a6ed56839a40f6b5d02a2962f48841482eec4
("signal: Add restore_user_sigmask()")

Just reverting the fs/eventpoll.c change in 854a6ed56 seems
enough to fix the non-responsive epoll_pwait for me.  I have not
looked deeply into this, but perhaps the signal_pending check in
restore_user_sigmask is racy w.r.t. epoll.  It is been a while
since I have looked at kernel stuff, myself.

Anyways, this revert works; but I'm not 100% sure why...

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a5d219d920e7..151739d76801 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2247,7 +2247,20 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct 
epoll_event __user *, events,
 
error = do_epoll_wait(epfd, events, maxevents, timeout);
 
-   restore_user_sigmask(sigmask, );
+   /*
+* If we changed the signal mask, we need to restore the original one.
+* In case we've got a signal while waiting, we do not restore the
+* signal mask yet, and we allow do_signal() to deliver the signal on
+* the way back to userspace, before the signal mask is restored.
+*/
+   if (sigmask) {
+   if (error == -EINTR) {
+   memcpy(>saved_sigmask, ,
+  sizeof(sigsaved));
+   set_restore_sigmask();
+   } else
+   set_current_blocked();
+   }
 
return error;
 }
@@ -2272,7 +2285,20 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 
err = do_epoll_wait(epfd, events, maxevents, timeout);
 
-   restore_user_sigmask(sigmask, );
+   /*
+* If we changed the signal mask, we need to restore the original one.
+* In case we've got a signal while waiting, we do not restore the
+* signal mask yet, and we allow do_signal() to deliver the signal on
+* the way back to userspace, before the signal mask is restored.
+*/
+   if (sigmask) {
+   if (err == -EINTR) {
+   memcpy(>saved_sigmask, ,
+  sizeof(sigsaved));
+   set_restore_sigmask();
+   } else
+   set_current_blocked();
+   }
 
return err;
 }

Comments and/or a proper fix would be greatly appreciated.

[1] my test case is running the cmogstored 1.7.0 test suite
in amd64 Debian stable environment.
test/mgmt_auto_adjust would get stuck and time-out after 60s
on vanilla v5.0.9

tgz: https://bogomips.org/cmogstored/files/cmogstored-1.7.0.tar.gz
# Standard autotools install,  N=32 or some high-ish number
./configure
make -j$N
make check -j$N

# OR git clone https://bogomips.org/cmogstored.git

So, requoting the rest of Omar's original report, here; since
I am not sure if his use case involves epoll_pwait like mine does:

> Omar Kilani  wrote:
> > Basically, something’s broken (or at least, has changed enough to
> > cause problems in user space) in epoll since 5.0. It’s still broken in
> > 5.1-rc5.
> > 
> > It doesn’t happen 100% of the time. It’s sort of hard to pin down but
> > I’ve observed the following:
> > 
> > * nginx not accepting connections under load
> > * A java app which uses netty / NIO having strange writability
> > semantics on channels, which confuses netty / java enough to not
> > properly flush written data on the socket.
> > 
> > I went and tested these Linux kernels:
> > 
> > 4.20.17
> > 4.19.32
> > 4.14.111
> > 
> > And the issue(s) do not show up there.
> > 
> > I’m still actively chasing this up, and will report back — I haven’t
> > touched kernel code in 15 years so I’m a little rusty. :)
> > 
> > Regards,
> > Omar


Re: Strange issues with epoll since 5.0

2019-04-24 Thread Davidlohr Bueso

On Wed, 24 Apr 2019, Davidlohr Bueso wrote:


On Wed, 24 Apr 2019, Eric Wong wrote:


Omar Kilani  wrote:

Hi there,

I???m still trying to piece together a reproducible test that triggers
this, but I wanted to post in case someone goes ???hmmm... change X
might have done this???.


Maybe Davidlohr knows, since he's responsible for most of the
epoll changes in 5.0.


Not really, I have not been made aware of any issues until now.




Basically, something???s broken (or at least, has changed enough to
cause problems in user space) in epoll since 5.0. It???s still broken in
5.1-rc5.

It doesn???t happen 100% of the time. It???s sort of hard to pin down but
I???ve observed the following:

* nginx not accepting connections under load
* A java app which uses netty / NIO having strange writability
semantics on channels, which confuses netty / java enough to not
properly flush written data on the socket.


Off the top of my head, could the following be responsible?

c5a282e9635 (fs/epoll: reduce the scope of wq lock in epoll_wait())


Re: Strange issues with epoll since 5.0

2019-04-24 Thread Davidlohr Bueso

On Wed, 24 Apr 2019, Eric Wong wrote:


Omar Kilani  wrote:

Hi there,

I???m still trying to piece together a reproducible test that triggers
this, but I wanted to post in case someone goes ???hmmm... change X
might have done this???.


Maybe Davidlohr knows, since he's responsible for most of the
epoll changes in 5.0.


Not really, I have not been made aware of any issues until now.




Basically, something???s broken (or at least, has changed enough to
cause problems in user space) in epoll since 5.0. It???s still broken in
5.1-rc5.

It doesn???t happen 100% of the time. It???s sort of hard to pin down but
I???ve observed the following:

* nginx not accepting connections under load
* A java app which uses netty / NIO having strange writability
semantics on channels, which confuses netty / java enough to not
properly flush written data on the socket.

I went and tested these Linux kernels:

4.20.17
4.19.32
4.14.111

And the issue(s) do not show up there.

I???m still actively chasing this up, and will report back ??? I haven???t
touched kernel code in 15 years so I???m a little rusty. :)


A bisection and/or workload that triggers the issue would be great to
see what's going on.

Thanks,
Davidlohr


Re: Strange issues with epoll since 5.0

2019-04-24 Thread Eric Wong
Omar Kilani  wrote:
> Hi there,
> 
> I’m still trying to piece together a reproducible test that triggers
> this, but I wanted to post in case someone goes “hmmm... change X
> might have done this”.

Maybe Davidlohr knows, since he's responsible for most of the
epoll changes in 5.0.

> Basically, something’s broken (or at least, has changed enough to
> cause problems in user space) in epoll since 5.0. It’s still broken in
> 5.1-rc5.
> 
> It doesn’t happen 100% of the time. It’s sort of hard to pin down but
> I’ve observed the following:
> 
> * nginx not accepting connections under load
> * A java app which uses netty / NIO having strange writability
> semantics on channels, which confuses netty / java enough to not
> properly flush written data on the socket.
> 
> I went and tested these Linux kernels:
> 
> 4.20.17
> 4.19.32
> 4.14.111
> 
> And the issue(s) do not show up there.
> 
> I’m still actively chasing this up, and will report back — I haven’t
> touched kernel code in 15 years so I’m a little rusty. :)
> 
> Regards,
> Omar


Strange issues with epoll since 5.0

2019-04-15 Thread Omar Kilani
Hi there,

I’m still trying to piece together a reproducible test that triggers
this, but I wanted to post in case someone goes “hmmm... change X
might have done this”.

Basically, something’s broken (or at least, has changed enough to
cause problems in user space) in epoll since 5.0. It’s still broken in
5.1-rc5.

It doesn’t happen 100% of the time. It’s sort of hard to pin down but
I’ve observed the following:

* nginx not accepting connections under load
* A java app which uses netty / NIO having strange writability
semantics on channels, which confuses netty / java enough to not
properly flush written data on the socket.

I went and tested these Linux kernels:

4.20.17
4.19.32
4.14.111

And the issue(s) do not show up there.

I’m still actively chasing this up, and will report back — I haven’t
touched kernel code in 15 years so I’m a little rusty. :)

Regards,
Omar


Re: Kernel 4.9: strange behavior with fifo scheduler

2019-02-07 Thread Dietmar Eggemann

On 2/6/19 2:25 PM, Frédéric Mathieu wrote:

Hi Dietmar,

Attention !, these tests were executed on a kernel with the patch RT and the 
option CONFIG_PREEMPT_RT_FULL = y. I confirm the truth of my priority settings
On a vanilla kernel, I get the same results as you.

After talking with mike Galbraith, I turned my attention to the priority of 
kernel threads. The following link explains in the behavior of the scheduler :
https://wiki.linuxfoundation.org/realtime/documentation/technical_details/hr_timers

Contrary to what I thought, there is no dynamic adjustment of the priority 
according to the priority of the calling task.

-Message d'origine-
De : linux-kernel-ow...@vger.kernel.org 
[mailto:linux-kernel-ow...@vger.kernel.org] De la part de Dietmar Eggemann
Envoyé : mercredi 6 février 2019 11:55
À : Frédéric Mathieu ; linux-kernel@vger.kernel.org
Objet : Re: Kernel 4.9: strange behavior with fifo scheduler

Hi Frédéric,

On 2/5/19 11:47 AM, Frédéric Mathieu wrote:

Hi,

on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz),
I use the linux kernel 4.9.146 with patch rt 125.
uname -a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55
CET 2019
x86_64 GNU/Linux


Ah, OK, I should have read it more carefully! So you're talking about 
the ktimersoftd/0 thread in this case which makes cyclictest show the 
latency of ~6ms every 5th time.


v4.9 + rt 125 on ARM64 Juno: (test prio (49) > cyclictest prio (50)

   ...
   ktimersoftd/0-4 [000]54.948862: sched_pi_setprio:
comm=ktimersoftd/0 pid=4 oldprio=49 newprio=98
   ktimersoftd/0-4 [000]54.948867: sched_waking: 
comm=cyclictest pid=2730 prio=49 target_cpu=000
   ktimersoftd/0-4 [000]54.948870: sched_wakeup: 
cyclictest:2730 [49] success=1 CPU:000
   ktimersoftd/0-4 [000]54.948874: sched_switch: 
ktimersoftd/0:4 [98] R ==> cyclictest:2730 [49]
  cyclictest-2730  [000]54.948890: sched_switch: 
cyclictest:2730 [49] S ==> ktimersoftd/0:4 [98]
   ktimersoftd/0-4 [000]54.948906: sched_switch: 
ktimersoftd/0:4 [98] S ==> swapper/0:0 [120]
  -0 [000]54.949087: sched_waking: 
comm=test pid=2723 prio=50 target_cpu=000
  -0 [000]54.949090: sched_wakeup: 
test:2723 [50] success=1 CPU:000
  -0 [000]54.949098: sched_switch: 
swapper/0:0 [120] R ==> test:2723 [50]
test-2723  [000]54.949784: sched_waking: 
comm=ktimersoftd/0 pid=4 prio=98 target_cpu=000
test-2723  [000]54.949787: sched_wakeup: 
ktimersoftd/0:4 [98] success=1 CPU:000
test-2723  [000]54.951979: sched_waking: 
comm=ksoftirqd/0 pid=3 prio=120 target_cpu=000
test-2723  [000]54.951983: sched_wakeup: 
ksoftirqd/0:3 [120] success=1 CPU:000
test-2723  [000]54.955111: sched_switch: 
test:2723 [50] S ==> ktimersoftd/0:4 [98]
   ktimersoftd/0-4 [000]54.955125: sched_waking: 
comm=cyclictest pid=2730 prio=49 target_cpu=000
   ktimersoftd/0-4 [000]54.955128: sched_wakeup: 
cyclictest:2730 [49] success=1 CPU:000
   ktimersoftd/0-4 [000]54.955132: sched_switch: 
ktimersoftd/0:4 [98] R ==> cyclictest:2730 [49]
  cyclictest-2730  [000]54.955138: sched_pi_setprio: 
comm=ktimersoftd/0 pid=4 oldprio=98 newprio=49
  cyclictest-2730  [000]54.955145: sched_switch: 
cyclictest:2730 [49] D ==> ktimersoftd/0:4 [49]
   ktimersoftd/0-4 [000]54.955149: sched_pi_setprio: 
comm=ktimersoftd/0 pid=4 oldprio=49 newprio=98
   ktimersoftd/0-4 [000]54.955154: sched_waking: 
comm=cyclictest pid=2730 prio=49 target_cpu=000
   ktimersoftd/0-4 [000]54.955157: sched_wakeup: 
cyclictest:2730 [49] success=1 CPU:000

   ...

[...]


RE: Kernel 4.9: strange behavior with fifo scheduler

2019-02-06 Thread Frédéric Mathieu
Hi Dietmar,

Attention !, these tests were executed on a kernel with the patch RT and the 
option CONFIG_PREEMPT_RT_FULL = y. I confirm the truth of my priority settings
On a vanilla kernel, I get the same results as you.

After talking with mike Galbraith, I turned my attention to the priority of 
kernel threads. The following link explains in the behavior of the scheduler : 
https://wiki.linuxfoundation.org/realtime/documentation/technical_details/hr_timers

Contrary to what I thought, there is no dynamic adjustment of the priority 
according to the priority of the calling task.

-Message d'origine-
De : linux-kernel-ow...@vger.kernel.org 
[mailto:linux-kernel-ow...@vger.kernel.org] De la part de Dietmar Eggemann
Envoyé : mercredi 6 février 2019 11:55
À : Frédéric Mathieu ; linux-kernel@vger.kernel.org
Objet : Re: Kernel 4.9: strange behavior with fifo scheduler

Hi Frédéric,

On 2/5/19 11:47 AM, Frédéric Mathieu wrote:
> Hi,
> 
> on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), 
> I use the linux kernel 4.9.146 with patch rt 125.
> uname -a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 
> CET 2019
> x86_64 GNU/Linux
> 
> I observed a strange behavior of the scheduler when several tasks are 
> executed in FIFO mode on a CPU core and a significant CPU activity.
> 
> first test (reference: cpu load=0%):
> cyclictest -m -D 5 -i 1000 -p 50 -a 0
>   # / dev / cpu_dma_latency set to 0us
>   policy: fifo: loadavg: 1.95 1.06 0.43 1/159 14305
>   T: 0 (14145) P: 50 I: 1000 C: 4997 Min: 7 Act: 7 Avg: 7 Max: 
> 18 work fine
> 
> now, i'm loading the system on the cpu core 0 with a homemade process:
>cpu load 60%, sched FIFO prio 1, cpu 0

Are you sure that your test app runs with prio 1? Is this in the range of the 
SCHED_FIFO (userspace) priorities shown by chrt -m?

...
SCHED_FIFO min/max priority : 1/99
...

If I run your setup (test and cyclictest affine to CPU0) on 4.15.0-43
(i7-4750HQ) with:

(1) test prio > cyclictest prio

# chrt -p $PID_TEST

pid 28489's current scheduling policy: SCHED_FIFO pid 28489's current 
scheduling priority: 51

# cat /proc/$PID_TEST/stat

28489 (test) R 28488 28487 8664 34828 28487 4194304 86 0 0 0 0 0 0 0 -52

I get your behaviour:

# /dev/cpu_dma_latency set to 0us
Thread 0 Interval: 1500
0:   0:   6
0:   1:   5
0:   2:   2
0:   3:5419
0:   4:   3
0:   5:   2
0:   6:   2
0:   7:   2
0:   8:5422
0:   9:   3
...

whereas with:

(2) test prio < cyclictest prio

# chrt -p $PID_TEST

pid 28811's current scheduling policy: SCHED_FIFO pid 28811's current 
scheduling priority: 49

# cat /proc/$PID_TEST/stat

28811 (test) S 28810 28809 8664 34828 28809 1077936128 90 0 0 0 0 0 0 0 -50

I get:

# /dev/cpu_dma_latency set to 0us
Thread 0 Interval: 1500
0:   0:   7
0:   1:   4
0:   2:   3
0:   3:   5
0:   4:   4
0:   5:   2
0:   6:   2
0:   7:   2
0:   8:   2
0:   9:   3
...

[...]

> In this case cyclictest detects very long latencies
>cyclictest -m -D 5 -i 1000 -p 50 -a 0 -v
> 
> Max CPUs = 2
> # /dev/cpu_dma_latency set to 0us
> Thread 0 Interval: 1500
> 0:   0:  13
> 0:   1:   8
> 0:   2:   7
> 0:   3:5648
> 0:   4:   8
> 0:   5:   7
> 0:   6:   7
> 0:   7:   7
> 0:   8:5649

[...]

> After verification, although no other process is running with real 
> time scheduler, I see a latency of about 5.6 ms at regular intervals.
> 
> it seems that the priority task 1 (fifo) is not pre-empted by the 
> cyclictest process with a priority of 50 (fifo) when the low priority task is 
> active.
> This  corresponds to the cycle recorded in the file: 6 ms of latency 
> followed by 4 "normal" latencies of 7 us.
> 
> Does anyone have any idea of this problem?
> 
> Best regards
> Frederic MATHIEU



Re: Kernel 4.9: strange behavior with fifo scheduler

2019-02-06 Thread Dietmar Eggemann

Hi Frédéric,

On 2/5/19 11:47 AM, Frédéric Mathieu wrote:

Hi,

on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), I use
the linux kernel 4.9.146 with patch rt 125.
uname -a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 CET 2019
x86_64 GNU/Linux

I observed a strange behavior of the scheduler when several tasks are
executed in FIFO mode on a CPU core and a significant CPU activity.

first test (reference: cpu load=0%):
cyclictest -m -D 5 -i 1000 -p 50 -a 0
# / dev / cpu_dma_latency set to 0us
policy: fifo: loadavg: 1.95 1.06 0.43 1/159 14305
T: 0 (14145) P: 50 I: 1000 C: 4997 Min: 7 Act: 7 Avg: 7 Max: 18
work fine

now, i'm loading the system on the cpu core 0 with a homemade process:
   cpu load 60%, sched FIFO prio 1, cpu 0


Are you sure that your test app runs with prio 1? Is this in the range 
of the SCHED_FIFO (userspace) priorities shown by chrt -m?


...
SCHED_FIFO min/max priority : 1/99
...

If I run your setup (test and cyclictest affine to CPU0) on 4.15.0-43 
(i7-4750HQ) with:


(1) test prio > cyclictest prio

# chrt -p $PID_TEST

pid 28489's current scheduling policy: SCHED_FIFO
pid 28489's current scheduling priority: 51

# cat /proc/$PID_TEST/stat

28489 (test) R 28488 28487 8664 34828 28487 4194304 86 0 0 0 0 0 0 0 -52

I get your behaviour:

# /dev/cpu_dma_latency set to 0us
Thread 0 Interval: 1500
   0:   0:   6
   0:   1:   5
   0:   2:   2
   0:   3:5419
   0:   4:   3
   0:   5:   2
   0:   6:   2
   0:   7:   2
   0:   8:5422
   0:   9:   3
...

whereas with:

(2) test prio < cyclictest prio

# chrt -p $PID_TEST

pid 28811's current scheduling policy: SCHED_FIFO
pid 28811's current scheduling priority: 49

# cat /proc/$PID_TEST/stat

28811 (test) S 28810 28809 8664 34828 28809 1077936128 90 0 0 0 0 0 0 0 -50

I get:

# /dev/cpu_dma_latency set to 0us
Thread 0 Interval: 1500
   0:   0:   7
   0:   1:   4
   0:   2:   3
   0:   3:   5
   0:   4:   4
   0:   5:   2
   0:   6:   2
   0:   7:   2
   0:   8:   2
   0:   9:   3
...

[...]


In this case cyclictest detects very long latencies
   cyclictest -m -D 5 -i 1000 -p 50 -a 0 -v

Max CPUs = 2
# /dev/cpu_dma_latency set to 0us
Thread 0 Interval: 1500
0:   0:  13
0:   1:   8
0:   2:   7
0:   3:5648
0:   4:   8
0:   5:   7
0:   6:   7
0:   7:   7
0:   8:5649


[...]


After verification, although no other process is running with real time
scheduler, I see a latency of about 5.6 ms at regular intervals.

it seems that the priority task 1 (fifo) is not pre-empted by the cyclictest
process with a priority of 50 (fifo) when the low priority task is active.
This  corresponds to the cycle recorded in the file: 6 ms of latency
followed by 4 "normal" latencies of 7 us.

Does anyone have any idea of this problem?

Best regards
Frederic MATHIEU


Kernel 4.9: strange behavior with fifo scheduler

2019-02-05 Thread Frédéric Mathieu
Hi,

on an X86_64 architecture (Intel(R) Core(TM) i3-6100U CPU @ 2.30GHz), I use
the linux kernel 4.9.146 with patch rt 125.
uname –a: Linux 4.9.146-rt125 #1 SMP PREEMPT RT Tue Jan 29 14:17:55 CET 2019
x86_64 GNU/Linux

I observed a strange behavior of the scheduler when several tasks are
executed in FIFO mode on a CPU core and a significant CPU activity.

first test (reference: cpu load=0%):
cyclictest -m -D 5 -i 1000 -p 50 -a 0
# / dev / cpu_dma_latency set to 0us
policy: fifo: loadavg: 1.95 1.06 0.43 1/159 14305
T: 0 (14145) P: 50 I: 1000 C: 4997 Min: 7 Act: 7 Avg: 7 Max: 18
work fine

now, i'm loading the system on the cpu core 0 with a homemade process:
  cpu load 60%, sched FIFO prio 1, cpu 0
code snippet 
active_time = 6000;
sleep_time = 4000;
    do {
   clock_gettime(CLOCK_MONOTONIC, _time);
   // active loop    
   do {
       for( cptdelay=0; cptdelay <
100; cptdelay++ );
  
clock_gettime(CLOCK_MONOTONIC, );
   diff  = timespec_diff_us(
_time,  );
   } while (diff < active_time);
   // sleep : suspend time
   usleep(sleep_time);
    } while( 1 );


In this case cyclictest detects very long latencies
  cyclictest -m -D 5 -i 1000 -p 50 -a 0 -v 

Max CPUs = 2
# /dev/cpu_dma_latency set to 0us
Thread 0 Interval: 1500
   0:   0:  13
   0:   1:   8
   0:   2:   7
   0:   3:5648
   0:   4:   8
   0:   5:   7
   0:   6:   7
   0:   7:   7
   0:   8:5649
   0:   9:   7
   0:  10:   7
   0:  11:   7
   0:  12:   7
   0:  13:5651
   0:  14:   7
   0:  15:   7
   0:  16:   7
   0:  17:   7
   0:  18:5655
   0:  19:   7
   0:  20:   7
   0:  21:   7
   0:  22:   7
   0:  23:5658
   0:  24:   7
   0:  25:   7
   0:  26:   7
   0:  27:   7
   0:  28:5663
   0:  29:   7
   0:  30:   7
   0:  31:   7
   0:  32:   7
   0:  33:5664
   0:  34:   7
   0:  35:   7
   0:  36:   7
   0:  37:   7
   0:  38:5667
   0:  39:   7
   0:  40:   7
   0:  41:   7
   0:  42:   7
   0:  43:5671
   0:  44:   8
   0:  45:   7
   0:  46:   7
   0:  47:   7
   0:  48:5673
   0:  49:   7
   0:  50:   7
   0:  51:   7
   0:  52:   7
   0:  53:5676
   0:  54:   7
   0:  55:   7
   0:  56:   7
   0:  57:   7
   0:  58:5679
   0:  59:   7
   0:  60:   7
   0:  61:   7
   0:  62:   7
   0:  63:5682
   0:  64:   7
   0:  65:   7
   0:  66:   7
   0:  67:   7
   0:  68:5685


After verification, although no other process is running with real time
scheduler, I see a latency of about 5.6 ms at regular intervals.

it seems that the priority task 1 (fifo) is not pre-empted by the cyclictest
process with a priority of 50 (fifo) when the low priority task is active. 
This  corresponds to the cycle recorded in the file: 6 ms of latency
followed by 4 "normal" latencies of 7 us.

Does anyone have any idea of this problem?

Best regards
Frederic MATHIEU




Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-09 Thread Ingo Molnar


* Steven Rostedt  wrote:

> On Tue, 4 Dec 2018 09:15:06 +0100
> Ingo Molnar  wrote:
> 
> > * Masami Hiramatsu  wrote:
> > 
> > > I remember I have fixed this, and actually WE did it :-D
> > > 
> > > https://lkml.org/lkml/2018/8/23/1203
> > > 
> > > Ah, we hit a same bug...
> > > 
> > > Ingo, could you pick the patch? Should I resend it?  
> > 
> > Indeed: I just picked it up into tip:perf/urgent.
> > 
> > It's my bad: I missed the original submission due to Steve's feedback 
> > which I mistook as a request for another iteration, while he only 
> > commented on the reason for the original breakage and gave his 
> > Reviewed-by ...
> > 
> 
> Sorry for the confusion. The patch and code was quite complex, and I
> was documenting what I thought of the patch (and the bug), so that my
> Reviewed-by had a bit more meaning.

It was totally welcome when I read it the second time - and it's my bad I 
was reading too quickly first time around ;-)

So please keep doing it whenever you find the time! :)

Thanks,

Ingo


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-04 Thread Steven Rostedt
On Tue, 4 Dec 2018 09:15:06 +0100
Ingo Molnar  wrote:

> * Masami Hiramatsu  wrote:
> 
> > I remember I have fixed this, and actually WE did it :-D
> > 
> > https://lkml.org/lkml/2018/8/23/1203
> > 
> > Ah, we hit a same bug...
> > 
> > Ingo, could you pick the patch? Should I resend it?  
> 
> Indeed: I just picked it up into tip:perf/urgent.
> 
> It's my bad: I missed the original submission due to Steve's feedback 
> which I mistook as a request for another iteration, while he only 
> commented on the reason for the original breakage and gave his 
> Reviewed-by ...
> 

Sorry for the confusion. The patch and code was quite complex, and I
was documenting what I thought of the patch (and the bug), so that my
Reviewed-by had a bit more meaning.

-- Steve


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-04 Thread Steven Rostedt
On Tue, 4 Dec 2018 09:15:06 +0100
Ingo Molnar  wrote:

> * Masami Hiramatsu  wrote:
> 
> > I remember I have fixed this, and actually WE did it :-D
> > 
> > https://lkml.org/lkml/2018/8/23/1203
> > 
> > Ah, we hit a same bug...
> > 
> > Ingo, could you pick the patch? Should I resend it?  
> 
> Indeed: I just picked it up into tip:perf/urgent.
> 
> It's my bad: I missed the original submission due to Steve's feedback 
> which I mistook as a request for another iteration, while he only 
> commented on the reason for the original breakage and gave his 
> Reviewed-by ...
> 

Sorry for the confusion. The patch and code was quite complex, and I
was documenting what I thought of the patch (and the bug), so that my
Reviewed-by had a bit more meaning.

-- Steve


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-04 Thread Ingo Molnar


* Masami Hiramatsu  wrote:

> I remember I have fixed this, and actually WE did it :-D
> 
> https://lkml.org/lkml/2018/8/23/1203
> 
> Ah, we hit a same bug...
> 
> Ingo, could you pick the patch? Should I resend it?

Indeed: I just picked it up into tip:perf/urgent.

It's my bad: I missed the original submission due to Steve's feedback 
which I mistook as a request for another iteration, while he only 
commented on the reason for the original breakage and gave his 
Reviewed-by ...

Thanks,

Ingo


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-04 Thread Ingo Molnar


* Masami Hiramatsu  wrote:

> I remember I have fixed this, and actually WE did it :-D
> 
> https://lkml.org/lkml/2018/8/23/1203
> 
> Ah, we hit a same bug...
> 
> Ingo, could you pick the patch? Should I resend it?

Indeed: I just picked it up into tip:perf/urgent.

It's my bad: I missed the original submission due to Steve's feedback 
which I mistook as a request for another iteration, while he only 
commented on the reason for the original breakage and gave his 
Reviewed-by ...

Thanks,

Ingo


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-04 Thread Masami Hiramatsu
On Tue, 4 Dec 2018 16:40:07 +0900
Masami Hiramatsu  wrote:

> Hi Steve,
> 
> On Mon, 3 Dec 2018 21:18:07 -0500
> Steven Rostedt  wrote:
> 
> > Hi Masami,
> > 
> > I started testing some of my new code and the system got into a
> > strange state. Debugging further, I found the cause came from the
> > kprobe tests. It became stranger to me that I could reproduce it with
> > older kernels. I went back as far as 4.16 and it triggered. I thought
> > this very strange because I ran this test on all those kernels in the
> > past.
> > 
> > After a bit of hair pulling, I figured out what changed. I upgraded to
> > gcc 8.1 (and I reproduce it with 8.2 as well). I convert back to gcc 7
> > and the tests pass without issue.
> 
> OK, let me see.
> 
> > The issue that I notice when the system gets into this strange state is
> > that I can't log into the box. Nor can I reboot. Basically it's
> > anything to do with systemd just doesn't work (insert your jokes here
> > now, and then let's move on).
> > 
> > I was able to narrow down what the exact function was that caused the
> > issues and it is: update_vsyscall()
> > 
> > gcc 7 looks like this:
> > 
> > 81004bf0 :
> > 81004bf0:   e8 0b cc 9f 00  callq  81a01800 
> > <__fentry__>
> > 81004bf1: R_X86_64_PC32 __fentry__-0x4
> > 81004bf5:   48 8b 07mov(%rdi),%rax
> > 81004bf8:   8b 15 96 5f 34 01   mov0x1345f96(%rip),%edx 
> ># 8234ab94 
> > 81004bfa: R_X86_64_PC32 vclocks_used-0x4
> > 81004bfe:   83 05 7b 84 6f 01 01addl   $0x1,0x16f847b(%rip) 
> ># 826fd080 
> > 81004c00: R_X86_64_PC32 
> > vsyscall_gtod_data-0x5
> > 81004c05:   8b 48 24mov0x24(%rax),%ecx
> > 81004c08:   b8 01 00 00 00  mov$0x1,%eax
> > 81004c0d:   d3 e0   shl%cl,%eax
> > 
> > And gcc 8 looks like this:
> > 
> > 81004c90 :
> > 81004c90:   e8 6b cb 9f 00  callq  81a01800 
> > <__fentry__>
> > 81004c91: R_X86_64_PC32 __fentry__-0x4
> > 81004c95:   48 8b 07mov(%rdi),%rax
> > 81004c98:   83 05 e1 93 6f 01 01addl   $0x1,0x16f93e1(%rip) 
> ># 826fe080 
> 
> Hm this is a RIP relative instruction, it should be modified by kprobes.
> 
> > 81004c9a: R_X86_64_PC32 
> > vsyscall_gtod_data-0x5
> > 81004c9f:   8b 50 24mov0x24(%rax),%edx
> > 81004ca2:   8b 05 ec 5e 34 01   mov0x1345eec(%rip),%eax 
> ># 8234ab94 
> > 81004ca4: R_X86_64_PC32 vclocks_used-0x4
> > 
> > The test adds a kprobe (optimized) at udpate_vsyscall+5. And will
> > insert a jump on the two instructions after fentry. The difference
> > between v7 and v8 is that v7 is touching vclocks_used and v8 is
> > touching vsyscall_gtod_data.
> > 
> > Is there some black magic going on with the vsyscall area with
> > vsyscall_gtod_data that is causing havoc when a kprobe is added there?
> 
> I think it might miss something when preprocessing RIP relative instruction.

Ah, I got it. I thought it had been fixed, but not...

In arch/x86/kernel/kprobes/opt.c, copy_optimized_instructions() does a copy
loop, but only update src and dest cursors, but not update real address
which is used for adjusting RIP relative instruction.

while (len < RELATIVEJUMP_SIZE) {
ret = __copy_instruction(dest + len, src + len, real, );
if (!ret || !can_boost(, src + len))
return -EINVAL;
len += ret;
}

I remember I have fixed this, and actually WE did it :-D

https://lkml.org/lkml/2018/8/23/1203

Ah, we hit a same bug...

Ingo, could you pick the patch? Should I resend it?

Thank you,


-- 
Masami Hiramatsu 


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-04 Thread Masami Hiramatsu
On Tue, 4 Dec 2018 16:40:07 +0900
Masami Hiramatsu  wrote:

> Hi Steve,
> 
> On Mon, 3 Dec 2018 21:18:07 -0500
> Steven Rostedt  wrote:
> 
> > Hi Masami,
> > 
> > I started testing some of my new code and the system got into a
> > strange state. Debugging further, I found the cause came from the
> > kprobe tests. It became stranger to me that I could reproduce it with
> > older kernels. I went back as far as 4.16 and it triggered. I thought
> > this very strange because I ran this test on all those kernels in the
> > past.
> > 
> > After a bit of hair pulling, I figured out what changed. I upgraded to
> > gcc 8.1 (and I reproduce it with 8.2 as well). I convert back to gcc 7
> > and the tests pass without issue.
> 
> OK, let me see.
> 
> > The issue that I notice when the system gets into this strange state is
> > that I can't log into the box. Nor can I reboot. Basically it's
> > anything to do with systemd just doesn't work (insert your jokes here
> > now, and then let's move on).
> > 
> > I was able to narrow down what the exact function was that caused the
> > issues and it is: update_vsyscall()
> > 
> > gcc 7 looks like this:
> > 
> > 81004bf0 :
> > 81004bf0:   e8 0b cc 9f 00  callq  81a01800 
> > <__fentry__>
> > 81004bf1: R_X86_64_PC32 __fentry__-0x4
> > 81004bf5:   48 8b 07mov(%rdi),%rax
> > 81004bf8:   8b 15 96 5f 34 01   mov0x1345f96(%rip),%edx 
> ># 8234ab94 
> > 81004bfa: R_X86_64_PC32 vclocks_used-0x4
> > 81004bfe:   83 05 7b 84 6f 01 01addl   $0x1,0x16f847b(%rip) 
> ># 826fd080 
> > 81004c00: R_X86_64_PC32 
> > vsyscall_gtod_data-0x5
> > 81004c05:   8b 48 24mov0x24(%rax),%ecx
> > 81004c08:   b8 01 00 00 00  mov$0x1,%eax
> > 81004c0d:   d3 e0   shl%cl,%eax
> > 
> > And gcc 8 looks like this:
> > 
> > 81004c90 :
> > 81004c90:   e8 6b cb 9f 00  callq  81a01800 
> > <__fentry__>
> > 81004c91: R_X86_64_PC32 __fentry__-0x4
> > 81004c95:   48 8b 07mov(%rdi),%rax
> > 81004c98:   83 05 e1 93 6f 01 01addl   $0x1,0x16f93e1(%rip) 
> ># 826fe080 
> 
> Hm this is a RIP relative instruction, it should be modified by kprobes.
> 
> > 81004c9a: R_X86_64_PC32 
> > vsyscall_gtod_data-0x5
> > 81004c9f:   8b 50 24mov0x24(%rax),%edx
> > 81004ca2:   8b 05 ec 5e 34 01   mov0x1345eec(%rip),%eax 
> ># 8234ab94 
> > 81004ca4: R_X86_64_PC32 vclocks_used-0x4
> > 
> > The test adds a kprobe (optimized) at udpate_vsyscall+5. And will
> > insert a jump on the two instructions after fentry. The difference
> > between v7 and v8 is that v7 is touching vclocks_used and v8 is
> > touching vsyscall_gtod_data.
> > 
> > Is there some black magic going on with the vsyscall area with
> > vsyscall_gtod_data that is causing havoc when a kprobe is added there?
> 
> I think it might miss something when preprocessing RIP relative instruction.

Ah, I got it. I thought it had been fixed, but not...

In arch/x86/kernel/kprobes/opt.c, copy_optimized_instructions() does a copy
loop, but only update src and dest cursors, but not update real address
which is used for adjusting RIP relative instruction.

while (len < RELATIVEJUMP_SIZE) {
ret = __copy_instruction(dest + len, src + len, real, );
if (!ret || !can_boost(, src + len))
return -EINVAL;
len += ret;
}

I remember I have fixed this, and actually WE did it :-D

https://lkml.org/lkml/2018/8/23/1203

Ah, we hit a same bug...

Ingo, could you pick the patch? Should I resend it?

Thank you,


-- 
Masami Hiramatsu 


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-03 Thread Masami Hiramatsu
Hi Steve,

On Mon, 3 Dec 2018 21:18:07 -0500
Steven Rostedt  wrote:

> Hi Masami,
> 
> I started testing some of my new code and the system got into a
> strange state. Debugging further, I found the cause came from the
> kprobe tests. It became stranger to me that I could reproduce it with
> older kernels. I went back as far as 4.16 and it triggered. I thought
> this very strange because I ran this test on all those kernels in the
> past.
> 
> After a bit of hair pulling, I figured out what changed. I upgraded to
> gcc 8.1 (and I reproduce it with 8.2 as well). I convert back to gcc 7
> and the tests pass without issue.

OK, let me see.

> The issue that I notice when the system gets into this strange state is
> that I can't log into the box. Nor can I reboot. Basically it's
> anything to do with systemd just doesn't work (insert your jokes here
> now, and then let's move on).
> 
> I was able to narrow down what the exact function was that caused the
> issues and it is: update_vsyscall()
> 
> gcc 7 looks like this:
> 
> 81004bf0 :
> 81004bf0:   e8 0b cc 9f 00  callq  81a01800 
> <__fentry__>
> 81004bf1: R_X86_64_PC32 __fentry__-0x4
> 81004bf5:   48 8b 07mov(%rdi),%rax
> 81004bf8:   8b 15 96 5f 34 01   mov0x1345f96(%rip),%edx   
>  # 8234ab94 
> 81004bfa: R_X86_64_PC32 vclocks_used-0x4
> 81004bfe:   83 05 7b 84 6f 01 01addl   $0x1,0x16f847b(%rip)   
>  # 826fd080 
> 81004c00: R_X86_64_PC32 vsyscall_gtod_data-0x5
> 81004c05:   8b 48 24mov0x24(%rax),%ecx
> 81004c08:   b8 01 00 00 00  mov$0x1,%eax
> 81004c0d:   d3 e0   shl%cl,%eax
> 
> And gcc 8 looks like this:
> 
> 81004c90 :
> 81004c90:   e8 6b cb 9f 00  callq  81a01800 
> <__fentry__>
> 81004c91: R_X86_64_PC32 __fentry__-0x4
> 81004c95:   48 8b 07mov(%rdi),%rax
> 81004c98:   83 05 e1 93 6f 01 01addl   $0x1,0x16f93e1(%rip)   
>  # 826fe080 

Hm this is a RIP relative instruction, it should be modified by kprobes.

> 81004c9a: R_X86_64_PC32 vsyscall_gtod_data-0x5
> 81004c9f:   8b 50 24mov0x24(%rax),%edx
> 81004ca2:   8b 05 ec 5e 34 01   mov0x1345eec(%rip),%eax   
>  # 8234ab94 
> 81004ca4: R_X86_64_PC32 vclocks_used-0x4
> 
> The test adds a kprobe (optimized) at udpate_vsyscall+5. And will
> insert a jump on the two instructions after fentry. The difference
> between v7 and v8 is that v7 is touching vclocks_used and v8 is
> touching vsyscall_gtod_data.
> 
> Is there some black magic going on with the vsyscall area with
> vsyscall_gtod_data that is causing havoc when a kprobe is added there?

I think it might miss something when preprocessing RIP relative instruction.
Could you disable jump optimization as below and test what happen on
update_vsyscall+5 AND update_vsyscall+8? (RIP relative preprocess must
happen even if the jump optimization is disabled)

# echo 0 > /proc/sys/debug/kprobes-optimization


> I can dig a little more into this, but I'm currently at my HQ office
> with a lot of other objectives that I must get done, and I can't work
> on this much more this week.

OK, let me try to reproduce it in my environment.

> 
> I included my config (for my virt machine, which I was also able to
> trigger it with).

Thanks, but I think it should not depend on the kconfig.

> 
> The test that triggers this bug is:
> 
>  tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc
> 
> It runs the test fine, but other things just start to act up after I
> run it.

Yeah, thank you for digging it down. It is now much easier to me.

> 
> I notice that when I get into the state, journald and the dbus_daemon
> are constantly running. Perhaps the userspace time keeping went bad?

Yeah, I think so. Maybe addl instruction becomes broken.

Thank you,

-- 
Masami Hiramatsu 


Re: Strange hang with gcc 8 of kprobe multiple_kprobes test

2018-12-03 Thread Masami Hiramatsu
Hi Steve,

On Mon, 3 Dec 2018 21:18:07 -0500
Steven Rostedt  wrote:

> Hi Masami,
> 
> I started testing some of my new code and the system got into a
> strange state. Debugging further, I found the cause came from the
> kprobe tests. It became stranger to me that I could reproduce it with
> older kernels. I went back as far as 4.16 and it triggered. I thought
> this very strange because I ran this test on all those kernels in the
> past.
> 
> After a bit of hair pulling, I figured out what changed. I upgraded to
> gcc 8.1 (and I reproduce it with 8.2 as well). I convert back to gcc 7
> and the tests pass without issue.

OK, let me see.

> The issue that I notice when the system gets into this strange state is
> that I can't log into the box. Nor can I reboot. Basically it's
> anything to do with systemd just doesn't work (insert your jokes here
> now, and then let's move on).
> 
> I was able to narrow down what the exact function was that caused the
> issues and it is: update_vsyscall()
> 
> gcc 7 looks like this:
> 
> 81004bf0 :
> 81004bf0:   e8 0b cc 9f 00  callq  81a01800 
> <__fentry__>
> 81004bf1: R_X86_64_PC32 __fentry__-0x4
> 81004bf5:   48 8b 07mov(%rdi),%rax
> 81004bf8:   8b 15 96 5f 34 01   mov0x1345f96(%rip),%edx   
>  # 8234ab94 
> 81004bfa: R_X86_64_PC32 vclocks_used-0x4
> 81004bfe:   83 05 7b 84 6f 01 01addl   $0x1,0x16f847b(%rip)   
>  # 826fd080 
> 81004c00: R_X86_64_PC32 vsyscall_gtod_data-0x5
> 81004c05:   8b 48 24mov0x24(%rax),%ecx
> 81004c08:   b8 01 00 00 00  mov$0x1,%eax
> 81004c0d:   d3 e0   shl%cl,%eax
> 
> And gcc 8 looks like this:
> 
> 81004c90 :
> 81004c90:   e8 6b cb 9f 00  callq  81a01800 
> <__fentry__>
> 81004c91: R_X86_64_PC32 __fentry__-0x4
> 81004c95:   48 8b 07mov(%rdi),%rax
> 81004c98:   83 05 e1 93 6f 01 01addl   $0x1,0x16f93e1(%rip)   
>  # 826fe080 

Hm this is a RIP relative instruction, it should be modified by kprobes.

> 81004c9a: R_X86_64_PC32 vsyscall_gtod_data-0x5
> 81004c9f:   8b 50 24mov0x24(%rax),%edx
> 81004ca2:   8b 05 ec 5e 34 01   mov0x1345eec(%rip),%eax   
>  # 8234ab94 
> 81004ca4: R_X86_64_PC32 vclocks_used-0x4
> 
> The test adds a kprobe (optimized) at udpate_vsyscall+5. And will
> insert a jump on the two instructions after fentry. The difference
> between v7 and v8 is that v7 is touching vclocks_used and v8 is
> touching vsyscall_gtod_data.
> 
> Is there some black magic going on with the vsyscall area with
> vsyscall_gtod_data that is causing havoc when a kprobe is added there?

I think it might miss something when preprocessing RIP relative instruction.
Could you disable jump optimization as below and test what happen on
update_vsyscall+5 AND update_vsyscall+8? (RIP relative preprocess must
happen even if the jump optimization is disabled)

# echo 0 > /proc/sys/debug/kprobes-optimization


> I can dig a little more into this, but I'm currently at my HQ office
> with a lot of other objectives that I must get done, and I can't work
> on this much more this week.

OK, let me try to reproduce it in my environment.

> 
> I included my config (for my virt machine, which I was also able to
> trigger it with).

Thanks, but I think it should not depend on the kconfig.

> 
> The test that triggers this bug is:
> 
>  tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc
> 
> It runs the test fine, but other things just start to act up after I
> run it.

Yeah, thank you for digging it down. It is now much easier to me.

> 
> I notice that when I get into the state, journald and the dbus_daemon
> are constantly running. Perhaps the userspace time keeping went bad?

Yeah, I think so. Maybe addl instruction becomes broken.

Thank you,

-- 
Masami Hiramatsu 


Re: CHECKPATCH: strange warning on alignment modifier

2018-10-08 Thread Joe Perches
On Mon, 2018-10-08 at 10:56 +0300, Igor Stoppa wrote:
> Hi,
> 
> I have the following fragment of code:
> 
> +struct my_struct {
> +   atomic_long_t l __aligned(sizeof(atomic_long_t));
> +} __aligned(sizeof(atomic_long_t));
> 
> 
> triggering this warning, when fed to checkpatch.pl:
> 
> WARNING: function definition argument 'atomic_long_t' should also have 
> an identifier name
> #19: FILE: path/to/file.h
> + atomic_long_t l __aligned(sizeof(atomic_long_t));
> 
> 
> gcc [(Ubuntu 7.3.0-16ubuntu3) 7.3.0] seems to be happy about it
> 
> I am using the HEAD from mainline.
> 
> My intent is to specify the alignment of both the field and the 
> structure (yes, probably redundant in this single-field case).
> 
> If I am doing something wrong, I can't figure out what it is, but I 
> don't understand why the WARNING is mentioning a function definition.

It's a defect in checkpatch.
For now, just ignore the message.
I will work on it later.




Re: CHECKPATCH: strange warning on alignment modifier

2018-10-08 Thread Joe Perches
On Mon, 2018-10-08 at 10:56 +0300, Igor Stoppa wrote:
> Hi,
> 
> I have the following fragment of code:
> 
> +struct my_struct {
> +   atomic_long_t l __aligned(sizeof(atomic_long_t));
> +} __aligned(sizeof(atomic_long_t));
> 
> 
> triggering this warning, when fed to checkpatch.pl:
> 
> WARNING: function definition argument 'atomic_long_t' should also have 
> an identifier name
> #19: FILE: path/to/file.h
> + atomic_long_t l __aligned(sizeof(atomic_long_t));
> 
> 
> gcc [(Ubuntu 7.3.0-16ubuntu3) 7.3.0] seems to be happy about it
> 
> I am using the HEAD from mainline.
> 
> My intent is to specify the alignment of both the field and the 
> structure (yes, probably redundant in this single-field case).
> 
> If I am doing something wrong, I can't figure out what it is, but I 
> don't understand why the WARNING is mentioning a function definition.

It's a defect in checkpatch.
For now, just ignore the message.
I will work on it later.




CHECKPATCH: strange warning on alignment modifier

2018-10-08 Thread Igor Stoppa

Hi,

I have the following fragment of code:

+struct my_struct {
+   atomic_long_t l __aligned(sizeof(atomic_long_t));
+} __aligned(sizeof(atomic_long_t));


triggering this warning, when fed to checkpatch.pl:

WARNING: function definition argument 'atomic_long_t' should also have 
an identifier name

#19: FILE: path/to/file.h
+   atomic_long_t l __aligned(sizeof(atomic_long_t));


gcc [(Ubuntu 7.3.0-16ubuntu3) 7.3.0] seems to be happy about it

I am using the HEAD from mainline.

My intent is to specify the alignment of both the field and the 
structure (yes, probably redundant in this single-field case).


If I am doing something wrong, I can't figure out what it is, but I 
don't understand why the WARNING is mentioning a function definition.


--
igor


CHECKPATCH: strange warning on alignment modifier

2018-10-08 Thread Igor Stoppa

Hi,

I have the following fragment of code:

+struct my_struct {
+   atomic_long_t l __aligned(sizeof(atomic_long_t));
+} __aligned(sizeof(atomic_long_t));


triggering this warning, when fed to checkpatch.pl:

WARNING: function definition argument 'atomic_long_t' should also have 
an identifier name

#19: FILE: path/to/file.h
+   atomic_long_t l __aligned(sizeof(atomic_long_t));


gcc [(Ubuntu 7.3.0-16ubuntu3) 7.3.0] seems to be happy about it

I am using the HEAD from mainline.

My intent is to specify the alignment of both the field and the 
structure (yes, probably redundant in this single-field case).


If I am doing something wrong, I can't figure out what it is, but I 
don't understand why the WARNING is mentioning a function definition.


--
igor


[PATCH 4.14 053/156] s390/zcrypt: Fix wrong comparison leading to strange load balancing

2018-02-02 Thread Greg Kroah-Hartman
4.14-stable review patch.  If anyone has any objections, please let me know.

--

From: Harald Freudenberger 


[ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ]

The function to decide if one zcrypt queue is better than
another one compared two pointers instead of comparing the
values where the pointers refer to. So within the same
zcrypt card when load of each queue was equal just one queue
was used. This effect only appears on relatively lite load,
typically with one thread applications.

This patch fixes the wrong comparison and now the counters
show that requests are balanced equally over all available
queues within the cards.

There is no performance improvement coming with this fix.
As long as the queue depth for an APQN queue is not touched,
processing is not faster when requests are spread over
queues within the same card hardware. So this fix only
beautifies the lszcrypt counter printouts.

Signed-off-by: Harald Freudenberger 
Signed-off-by: Martin Schwidefsky 
Signed-off-by: Sasha Levin 
Signed-off-by: Greg Kroah-Hartman 
---
 drivers/s390/crypto/zcrypt_api.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare(
weight += atomic_read(>load);
pref_weight += atomic_read(_zq->load);
if (weight == pref_weight)
-   return >queue->total_request_count >
-   _zq->queue->total_request_count;
+   return zq->queue->total_request_count >
+   pref_zq->queue->total_request_count;
return weight > pref_weight;
 }
 




[PATCH 4.14 053/156] s390/zcrypt: Fix wrong comparison leading to strange load balancing

2018-02-02 Thread Greg Kroah-Hartman
4.14-stable review patch.  If anyone has any objections, please let me know.

--

From: Harald Freudenberger 


[ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ]

The function to decide if one zcrypt queue is better than
another one compared two pointers instead of comparing the
values where the pointers refer to. So within the same
zcrypt card when load of each queue was equal just one queue
was used. This effect only appears on relatively lite load,
typically with one thread applications.

This patch fixes the wrong comparison and now the counters
show that requests are balanced equally over all available
queues within the cards.

There is no performance improvement coming with this fix.
As long as the queue depth for an APQN queue is not touched,
processing is not faster when requests are spread over
queues within the same card hardware. So this fix only
beautifies the lszcrypt counter printouts.

Signed-off-by: Harald Freudenberger 
Signed-off-by: Martin Schwidefsky 
Signed-off-by: Sasha Levin 
Signed-off-by: Greg Kroah-Hartman 
---
 drivers/s390/crypto/zcrypt_api.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare(
weight += atomic_read(>load);
pref_weight += atomic_read(_zq->load);
if (weight == pref_weight)
-   return >queue->total_request_count >
-   _zq->queue->total_request_count;
+   return zq->queue->total_request_count >
+   pref_zq->queue->total_request_count;
return weight > pref_weight;
 }
 




[PATCH AUTOSEL for 4.14 024/100] s390/zcrypt: Fix wrong comparison leading to strange load balancing

2018-01-23 Thread Sasha Levin
From: Harald Freudenberger 

[ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ]

The function to decide if one zcrypt queue is better than
another one compared two pointers instead of comparing the
values where the pointers refer to. So within the same
zcrypt card when load of each queue was equal just one queue
was used. This effect only appears on relatively lite load,
typically with one thread applications.

This patch fixes the wrong comparison and now the counters
show that requests are balanced equally over all available
queues within the cards.

There is no performance improvement coming with this fix.
As long as the queue depth for an APQN queue is not touched,
processing is not faster when requests are spread over
queues within the same card hardware. So this fix only
beautifies the lszcrypt counter printouts.

Signed-off-by: Harald Freudenberger 
Signed-off-by: Martin Schwidefsky 
Signed-off-by: Sasha Levin 
---
 drivers/s390/crypto/zcrypt_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index b5f4006198b9..a9a56aa9c26b 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare(struct zcrypt_queue 
*zq,
weight += atomic_read(>load);
pref_weight += atomic_read(_zq->load);
if (weight == pref_weight)
-   return >queue->total_request_count >
-   _zq->queue->total_request_count;
+   return zq->queue->total_request_count >
+   pref_zq->queue->total_request_count;
return weight > pref_weight;
 }
 
-- 
2.11.0


[PATCH AUTOSEL for 4.14 024/100] s390/zcrypt: Fix wrong comparison leading to strange load balancing

2018-01-23 Thread Sasha Levin
From: Harald Freudenberger 

[ Upstream commit 0b0882672640ced4deeebf84da0b88b6389619c4 ]

The function to decide if one zcrypt queue is better than
another one compared two pointers instead of comparing the
values where the pointers refer to. So within the same
zcrypt card when load of each queue was equal just one queue
was used. This effect only appears on relatively lite load,
typically with one thread applications.

This patch fixes the wrong comparison and now the counters
show that requests are balanced equally over all available
queues within the cards.

There is no performance improvement coming with this fix.
As long as the queue depth for an APQN queue is not touched,
processing is not faster when requests are spread over
queues within the same card hardware. So this fix only
beautifies the lszcrypt counter printouts.

Signed-off-by: Harald Freudenberger 
Signed-off-by: Martin Schwidefsky 
Signed-off-by: Sasha Levin 
---
 drivers/s390/crypto/zcrypt_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index b5f4006198b9..a9a56aa9c26b 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -218,8 +218,8 @@ static inline bool zcrypt_queue_compare(struct zcrypt_queue 
*zq,
weight += atomic_read(>load);
pref_weight += atomic_read(_zq->load);
if (weight == pref_weight)
-   return >queue->total_request_count >
-   _zq->queue->total_request_count;
+   return zq->queue->total_request_count >
+   pref_zq->queue->total_request_count;
return weight > pref_weight;
 }
 
-- 
2.11.0


[PATCH 02/22] signal: Document the strange si_codes used by ptrace event stops

2018-01-15 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 include/uapi/asm-generic/siginfo.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/asm-generic/siginfo.h 
b/include/uapi/asm-generic/siginfo.h
index cbe1b0cc7a6a..7158421ac911 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -225,6 +225,11 @@ typedef struct siginfo {
 #define TRAP_HWBKPT 4  /* hardware breakpoint/watchpoint */
 #define NSIGTRAP   4
 
+/*
+ * There are an additional set of SIGTRAP si_codes used by ptrace
+ * that of the form: ((PTRACE_EVENT_XXX << 8) | SIGTRAP)
+ */
+
 /*
  * SIGCHLD si_codes
  */
-- 
2.14.1



[PATCH 02/22] signal: Document the strange si_codes used by ptrace event stops

2018-01-15 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 include/uapi/asm-generic/siginfo.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/asm-generic/siginfo.h 
b/include/uapi/asm-generic/siginfo.h
index cbe1b0cc7a6a..7158421ac911 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -225,6 +225,11 @@ typedef struct siginfo {
 #define TRAP_HWBKPT 4  /* hardware breakpoint/watchpoint */
 #define NSIGTRAP   4
 
+/*
+ * There are an additional set of SIGTRAP si_codes used by ptrace
+ * that of the form: ((PTRACE_EVENT_XXX << 8) | SIGTRAP)
+ */
+
 /*
  * SIGCHLD si_codes
  */
-- 
2.14.1



Re: linux-next: strange update of the sound tree

2017-06-05 Thread Stephen Rothwell
Hi Takashi,

On Mon, 05 Jun 2017 18:32:54 +0200 Takashi Iwai  wrote:
>
> Doh, I pushed out a wrong branch (master) to for-next, as I had to
> work on another machine at home today due to a holiday.  Now the
> correct branch was pushed.

Much better, thanks :-)

-- 
Cheers,
Stephen Rothwell


Re: linux-next: strange update of the sound tree

2017-06-05 Thread Stephen Rothwell
Hi Takashi,

On Mon, 05 Jun 2017 18:32:54 +0200 Takashi Iwai  wrote:
>
> Doh, I pushed out a wrong branch (master) to for-next, as I had to
> work on another machine at home today due to a holiday.  Now the
> correct branch was pushed.

Much better, thanks :-)

-- 
Cheers,
Stephen Rothwell


Re: linux-next: strange update of the sound tree

2017-06-05 Thread Takashi Iwai
On Mon, 05 Jun 2017 16:17:23 +0200,
Stephen Rothwell wrote:
> 
> Hi Takashi,
> 
> I don't think you have the result in the sound tree
> (git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git
> branch for-next).  Yesterday it was a fairly linear tree with head
> commit 7421a1671abe ("ALSA: pcm: include pcm_local.h and remove some
> extraneous tabs") based on commit f83914fdfcc3 ("ALSA: usb-audio: fix
> Amanero Combo384 quirk on big-endian hosts").  Now it is a tree with
> 687 commits (realative to Linus' tree) with 651 merge commits in it. :-(

Doh, I pushed out a wrong branch (master) to for-next, as I had to
work on another machine at home today due to a holiday.  Now the
correct branch was pushed.

Sorry for inconvenience, and thanks for catching it.


Takashi


Re: linux-next: strange update of the sound tree

2017-06-05 Thread Takashi Iwai
On Mon, 05 Jun 2017 16:17:23 +0200,
Stephen Rothwell wrote:
> 
> Hi Takashi,
> 
> I don't think you have the result in the sound tree
> (git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git
> branch for-next).  Yesterday it was a fairly linear tree with head
> commit 7421a1671abe ("ALSA: pcm: include pcm_local.h and remove some
> extraneous tabs") based on commit f83914fdfcc3 ("ALSA: usb-audio: fix
> Amanero Combo384 quirk on big-endian hosts").  Now it is a tree with
> 687 commits (realative to Linus' tree) with 651 merge commits in it. :-(

Doh, I pushed out a wrong branch (master) to for-next, as I had to
work on another machine at home today due to a holiday.  Now the
correct branch was pushed.

Sorry for inconvenience, and thanks for catching it.


Takashi


linux-next: strange update of the sound tree

2017-06-05 Thread Stephen Rothwell
Hi Takashi,

I don't think you have the result in the sound tree
(git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git
branch for-next).  Yesterday it was a fairly linear tree with head
commit 7421a1671abe ("ALSA: pcm: include pcm_local.h and remove some
extraneous tabs") based on commit f83914fdfcc3 ("ALSA: usb-audio: fix
Amanero Combo384 quirk on big-endian hosts").  Now it is a tree with
687 commits (realative to Linus' tree) with 651 merge commits in it. :-(

-- 
Cheers,
Stephen Rothwell


linux-next: strange update of the sound tree

2017-06-05 Thread Stephen Rothwell
Hi Takashi,

I don't think you have the result in the sound tree
(git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git
branch for-next).  Yesterday it was a fairly linear tree with head
commit 7421a1671abe ("ALSA: pcm: include pcm_local.h and remove some
extraneous tabs") based on commit f83914fdfcc3 ("ALSA: usb-audio: fix
Amanero Combo384 quirk on big-endian hosts").  Now it is a tree with
687 commits (realative to Linus' tree) with 651 merge commits in it. :-(

-- 
Cheers,
Stephen Rothwell


Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-06-02 Thread Tom Lendacky

On 6/2/2017 9:43 AM, Michal Hocko wrote:

On Fri 02-06-17 09:20:54, Tom Lendacky wrote:

On 5/31/2017 11:04 AM, Michal Hocko wrote:

Hi Tom,


Hi Michal,


I have stumbled over the following construct in xgbe_map_rx_buffer
order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?



The driver tries to allocate a number of pages to be used as receive
buffers.  Based on what I could find in documentation, the value of
PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations
(could) get expensive.  So I decrease by one the order requested. The
max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever
gets defined as 0, 0 would be used.


So you have fallen into a carefully prepared trap ;). The thing is that
orders _larger_ than PAGE_ALLOC_COSTLY_ORDER are costly actually. I can
completely see how this can be confusing.

Moreover xgbe_map_rx_buffer does an atomic allocation which doesn't do
any direct reclaim/compaction attempts so the costly vs. non-costly
doesn't apply here at all.

I would be much happier if no code outside of mm used
PAGE_ALLOC_COSTLY_ORDER directly but that requires a deeper
consideration. E.g. what would be the largest size that would be
useful for this path? xgbe_alloc_pages does the order fallback so
PAGE_ALLOC_COSTLY_ORDER sounds like an artificial limit to me.
I guess we can at least simplify the xgbe right away though.
---
 From c7d5ca637b889c4e3779f8d2a84ade6448a76ef9 Mon Sep 17 00:00:00 2001
From: Michal Hocko 
Date: Fri, 2 Jun 2017 16:34:28 +0200
Subject: [PATCH] amd-xgbe: use PAGE_ALLOC_COSTLY_ORDER in xgbe_map_rx_buffer

xgbe_map_rx_buffer is rather confused about what PAGE_ALLOC_COSTLY_ORDER
means. It uses PAGE_ALLOC_COSTLY_ORDER-1 assuming that
PAGE_ALLOC_COSTLY_ORDER is the first costly order which is not the case
actually because orders larger than that are costly. And even that
applies only to sleeping allocations which is not the case here. We
simply do not perform any costly operations like reclaim or compaction
for those. Simplify the code by dropping the order calculation and use
PAGE_ALLOC_COSTLY_ORDER directly.

Signed-off-by: Michal Hocko 
---
  drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index b3bc87fe3764..5ded10eba418 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -333,9 +333,8 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata,
}
  
  	if (!ring->rx_buf_pa.pages) {

-   order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
ret = xgbe_alloc_pages(pdata, >rx_buf_pa, GFP_ATOMIC,
-  order);
+  PAGE_ALLOC_COSTLY_ORDER);


You'll need to also remove the variable definition to avoid an un-used
variable warning.  You should also send this to the netdev mailing list
to send this through the net-next tree (or net tree if you want it fixed
in the current version of the Linux kernel).

Thanks,
Tom


if (ret)
return ret;
}



Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-06-02 Thread Tom Lendacky

On 6/2/2017 9:43 AM, Michal Hocko wrote:

On Fri 02-06-17 09:20:54, Tom Lendacky wrote:

On 5/31/2017 11:04 AM, Michal Hocko wrote:

Hi Tom,


Hi Michal,


I have stumbled over the following construct in xgbe_map_rx_buffer
order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?



The driver tries to allocate a number of pages to be used as receive
buffers.  Based on what I could find in documentation, the value of
PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations
(could) get expensive.  So I decrease by one the order requested. The
max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever
gets defined as 0, 0 would be used.


So you have fallen into a carefully prepared trap ;). The thing is that
orders _larger_ than PAGE_ALLOC_COSTLY_ORDER are costly actually. I can
completely see how this can be confusing.

Moreover xgbe_map_rx_buffer does an atomic allocation which doesn't do
any direct reclaim/compaction attempts so the costly vs. non-costly
doesn't apply here at all.

I would be much happier if no code outside of mm used
PAGE_ALLOC_COSTLY_ORDER directly but that requires a deeper
consideration. E.g. what would be the largest size that would be
useful for this path? xgbe_alloc_pages does the order fallback so
PAGE_ALLOC_COSTLY_ORDER sounds like an artificial limit to me.
I guess we can at least simplify the xgbe right away though.
---
 From c7d5ca637b889c4e3779f8d2a84ade6448a76ef9 Mon Sep 17 00:00:00 2001
From: Michal Hocko 
Date: Fri, 2 Jun 2017 16:34:28 +0200
Subject: [PATCH] amd-xgbe: use PAGE_ALLOC_COSTLY_ORDER in xgbe_map_rx_buffer

xgbe_map_rx_buffer is rather confused about what PAGE_ALLOC_COSTLY_ORDER
means. It uses PAGE_ALLOC_COSTLY_ORDER-1 assuming that
PAGE_ALLOC_COSTLY_ORDER is the first costly order which is not the case
actually because orders larger than that are costly. And even that
applies only to sleeping allocations which is not the case here. We
simply do not perform any costly operations like reclaim or compaction
for those. Simplify the code by dropping the order calculation and use
PAGE_ALLOC_COSTLY_ORDER directly.

Signed-off-by: Michal Hocko 
---
  drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index b3bc87fe3764..5ded10eba418 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -333,9 +333,8 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata,
}
  
  	if (!ring->rx_buf_pa.pages) {

-   order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
ret = xgbe_alloc_pages(pdata, >rx_buf_pa, GFP_ATOMIC,
-  order);
+  PAGE_ALLOC_COSTLY_ORDER);


You'll need to also remove the variable definition to avoid an un-used
variable warning.  You should also send this to the netdev mailing list
to send this through the net-next tree (or net tree if you want it fixed
in the current version of the Linux kernel).

Thanks,
Tom


if (ret)
return ret;
}



Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-06-02 Thread Michal Hocko
On Fri 02-06-17 09:20:54, Tom Lendacky wrote:
> On 5/31/2017 11:04 AM, Michal Hocko wrote:
> >Hi Tom,
> 
> Hi Michal,
> 
> >I have stumbled over the following construct in xgbe_map_rx_buffer
> > order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
> >which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
> >And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?
> >
> 
> The driver tries to allocate a number of pages to be used as receive
> buffers.  Based on what I could find in documentation, the value of
> PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations
> (could) get expensive.  So I decrease by one the order requested. The
> max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever
> gets defined as 0, 0 would be used.

So you have fallen into a carefully prepared trap ;). The thing is that
orders _larger_ than PAGE_ALLOC_COSTLY_ORDER are costly actually. I can
completely see how this can be confusing.

Moreover xgbe_map_rx_buffer does an atomic allocation which doesn't do
any direct reclaim/compaction attempts so the costly vs. non-costly
doesn't apply here at all.

I would be much happier if no code outside of mm used
PAGE_ALLOC_COSTLY_ORDER directly but that requires a deeper
consideration. E.g. what would be the largest size that would be
useful for this path? xgbe_alloc_pages does the order fallback so
PAGE_ALLOC_COSTLY_ORDER sounds like an artificial limit to me.
I guess we can at least simplify the xgbe right away though.
---
>From c7d5ca637b889c4e3779f8d2a84ade6448a76ef9 Mon Sep 17 00:00:00 2001
From: Michal Hocko 
Date: Fri, 2 Jun 2017 16:34:28 +0200
Subject: [PATCH] amd-xgbe: use PAGE_ALLOC_COSTLY_ORDER in xgbe_map_rx_buffer

xgbe_map_rx_buffer is rather confused about what PAGE_ALLOC_COSTLY_ORDER
means. It uses PAGE_ALLOC_COSTLY_ORDER-1 assuming that
PAGE_ALLOC_COSTLY_ORDER is the first costly order which is not the case
actually because orders larger than that are costly. And even that
applies only to sleeping allocations which is not the case here. We
simply do not perform any costly operations like reclaim or compaction
for those. Simplify the code by dropping the order calculation and use
PAGE_ALLOC_COSTLY_ORDER directly.

Signed-off-by: Michal Hocko 
---
 drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index b3bc87fe3764..5ded10eba418 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -333,9 +333,8 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata,
}
 
if (!ring->rx_buf_pa.pages) {
-   order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
ret = xgbe_alloc_pages(pdata, >rx_buf_pa, GFP_ATOMIC,
-  order);
+  PAGE_ALLOC_COSTLY_ORDER);
if (ret)
return ret;
}
-- 
2.11.0


-- 
Michal Hocko
SUSE Labs


Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-06-02 Thread Michal Hocko
On Fri 02-06-17 09:20:54, Tom Lendacky wrote:
> On 5/31/2017 11:04 AM, Michal Hocko wrote:
> >Hi Tom,
> 
> Hi Michal,
> 
> >I have stumbled over the following construct in xgbe_map_rx_buffer
> > order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
> >which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
> >And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?
> >
> 
> The driver tries to allocate a number of pages to be used as receive
> buffers.  Based on what I could find in documentation, the value of
> PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations
> (could) get expensive.  So I decrease by one the order requested. The
> max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever
> gets defined as 0, 0 would be used.

So you have fallen into a carefully prepared trap ;). The thing is that
orders _larger_ than PAGE_ALLOC_COSTLY_ORDER are costly actually. I can
completely see how this can be confusing.

Moreover xgbe_map_rx_buffer does an atomic allocation which doesn't do
any direct reclaim/compaction attempts so the costly vs. non-costly
doesn't apply here at all.

I would be much happier if no code outside of mm used
PAGE_ALLOC_COSTLY_ORDER directly but that requires a deeper
consideration. E.g. what would be the largest size that would be
useful for this path? xgbe_alloc_pages does the order fallback so
PAGE_ALLOC_COSTLY_ORDER sounds like an artificial limit to me.
I guess we can at least simplify the xgbe right away though.
---
>From c7d5ca637b889c4e3779f8d2a84ade6448a76ef9 Mon Sep 17 00:00:00 2001
From: Michal Hocko 
Date: Fri, 2 Jun 2017 16:34:28 +0200
Subject: [PATCH] amd-xgbe: use PAGE_ALLOC_COSTLY_ORDER in xgbe_map_rx_buffer

xgbe_map_rx_buffer is rather confused about what PAGE_ALLOC_COSTLY_ORDER
means. It uses PAGE_ALLOC_COSTLY_ORDER-1 assuming that
PAGE_ALLOC_COSTLY_ORDER is the first costly order which is not the case
actually because orders larger than that are costly. And even that
applies only to sleeping allocations which is not the case here. We
simply do not perform any costly operations like reclaim or compaction
for those. Simplify the code by dropping the order calculation and use
PAGE_ALLOC_COSTLY_ORDER directly.

Signed-off-by: Michal Hocko 
---
 drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index b3bc87fe3764..5ded10eba418 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -333,9 +333,8 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata,
}
 
if (!ring->rx_buf_pa.pages) {
-   order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
ret = xgbe_alloc_pages(pdata, >rx_buf_pa, GFP_ATOMIC,
-  order);
+  PAGE_ALLOC_COSTLY_ORDER);
if (ret)
return ret;
}
-- 
2.11.0


-- 
Michal Hocko
SUSE Labs


Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-06-02 Thread Tom Lendacky

On 5/31/2017 11:04 AM, Michal Hocko wrote:

Hi Tom,


Hi Michal,


I have stumbled over the following construct in xgbe_map_rx_buffer
order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?



The driver tries to allocate a number of pages to be used as receive
buffers.  Based on what I could find in documentation, the value of
PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations
(could) get expensive.  So I decrease by one the order requested. The
max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever
gets defined as 0, 0 would be used.

I believe there have been some enhancements relative to speed in
allocating 0-order pages recently that may make this unnecessary. I
haven't run any performance tests yet to determine if I can just go to
a 0-order allocation, though.

Thanks,
Tom


Thanks!



Re: strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-06-02 Thread Tom Lendacky

On 5/31/2017 11:04 AM, Michal Hocko wrote:

Hi Tom,


Hi Michal,


I have stumbled over the following construct in xgbe_map_rx_buffer
order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?



The driver tries to allocate a number of pages to be used as receive
buffers.  Based on what I could find in documentation, the value of
PAGE_ALLOC_COSTLY_ORDER is the point at which order allocations
(could) get expensive.  So I decrease by one the order requested. The
max_t test is just to insure that in case PAGE_ALLOC_COSTLY_ORDER ever
gets defined as 0, 0 would be used.

I believe there have been some enhancements relative to speed in
allocating 0-order pages recently that may make this unnecessary. I
haven't run any performance tests yet to determine if I can just go to
a 0-order allocation, though.

Thanks,
Tom


Thanks!



strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-05-31 Thread Michal Hocko
Hi Tom,
I have stumbled over the following construct in xgbe_map_rx_buffer
order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?

Thanks!
-- 
Michal Hocko
SUSE Labs


strange PAGE_ALLOC_COSTLY_ORDER usage in xgbe_map_rx_buffer

2017-05-31 Thread Michal Hocko
Hi Tom,
I have stumbled over the following construct in xgbe_map_rx_buffer
order = max_t(int, PAGE_ALLOC_COSTLY_ORDER - 1, 0);
which looks quite suspicious. Why does it PAGE_ALLOC_COSTLY_ORDER - 1?
And why do you depend on PAGE_ALLOC_COSTLY_ORDER at all?

Thanks!
-- 
Michal Hocko
SUSE Labs


Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.

2017-05-03 Thread NeilBrown
On Wed, May 03 2017, Mikulas Patocka wrote:

> On Mon, 24 Apr 2017, NeilBrown wrote:
>> 
>> I had a look at how the allocation 'dm_region' objects are used,
>> and it would take a bit of work to make it really safe.
>> My guess is __rh_find() should be allowed to fail, and the various
>> callers need to handle failure.
>> For example, dm_rh_inc_pending() would be given a second bio_list,
>> and would move any bios for which rh_inc() fails, onto that list.
>> Then do_writes() would merge that list back into ms->writes.
>> That way do_mirror() would not block indefinitely and forward progress
>> could be assured ... maybe.
>> It would take more work than I'm able to give at the moment, so
>> I'm happy to just drop this patch.
>> 
>> Thanks,
>> NeilBrown
>
> I think that the only way how to fix this would be to preallocate the all 
> the regions when the target is created.
>
> But, with the default region size 512kiB, it would cause high memory 
> consumption (approximatelly 1GB of RAM for 20TB device).

Two reflections:
 1/ This is close to what md/bitmap does.
   It actually uses a 2-level array for the 'pending' field from
   dm_region, combined with something similar to 'state'.
   The top level is allocated when the device is created.
   Entries in this table are either
 - pointers to a second-level array for 2048 regions
 - entries for 2 giant regions, 1024 times the normal size.

   So if we cannot allocate a page when we need that second level,
we just use an enormous region and so risk resync taking a bit
longer if there is a crash.

 2/ Even though md does pre-alloc to a degree, I'm not convinced that it
is necessary.
We only need a region to be recorded when it is actively being
written to, or when it is being recovered.
We could, in theory, have just one region that is written to and one
region that is being recovered.  If a writes request arrives for a
different region it blocks until the current region has no active
requests.  Then that region is forgotten and the new region
activated, and the new write allowed to proceed.
Obviously this would be horribly slow, but it should be
deadlock-free.
Using a mempool instead of a single region would then allow multiple
regions to be active in parallel, which would improve throughput
without affecting the deadlock status.

Maybe I'll try to code it and see what happens ... maybe not.

NeilBrown

 


signature.asc
Description: PGP signature


Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.

2017-05-03 Thread NeilBrown
On Wed, May 03 2017, Mikulas Patocka wrote:

> On Mon, 24 Apr 2017, NeilBrown wrote:
>> 
>> I had a look at how the allocation 'dm_region' objects are used,
>> and it would take a bit of work to make it really safe.
>> My guess is __rh_find() should be allowed to fail, and the various
>> callers need to handle failure.
>> For example, dm_rh_inc_pending() would be given a second bio_list,
>> and would move any bios for which rh_inc() fails, onto that list.
>> Then do_writes() would merge that list back into ms->writes.
>> That way do_mirror() would not block indefinitely and forward progress
>> could be assured ... maybe.
>> It would take more work than I'm able to give at the moment, so
>> I'm happy to just drop this patch.
>> 
>> Thanks,
>> NeilBrown
>
> I think that the only way how to fix this would be to preallocate the all 
> the regions when the target is created.
>
> But, with the default region size 512kiB, it would cause high memory 
> consumption (approximatelly 1GB of RAM for 20TB device).

Two reflections:
 1/ This is close to what md/bitmap does.
   It actually uses a 2-level array for the 'pending' field from
   dm_region, combined with something similar to 'state'.
   The top level is allocated when the device is created.
   Entries in this table are either
 - pointers to a second-level array for 2048 regions
 - entries for 2 giant regions, 1024 times the normal size.

   So if we cannot allocate a page when we need that second level,
we just use an enormous region and so risk resync taking a bit
longer if there is a crash.

 2/ Even though md does pre-alloc to a degree, I'm not convinced that it
is necessary.
We only need a region to be recorded when it is actively being
written to, or when it is being recovered.
We could, in theory, have just one region that is written to and one
region that is being recovered.  If a writes request arrives for a
different region it blocks until the current region has no active
requests.  Then that region is forgotten and the new region
activated, and the new write allowed to proceed.
Obviously this would be horribly slow, but it should be
deadlock-free.
Using a mempool instead of a single region would then allow multiple
regions to be active in parallel, which would improve throughput
without affecting the deadlock status.

Maybe I'll try to code it and see what happens ... maybe not.

NeilBrown

 


signature.asc
Description: PGP signature


Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.

2017-05-03 Thread Mikulas Patocka


On Mon, 24 Apr 2017, NeilBrown wrote:

> On Fri, Apr 21 2017, Mikulas Patocka wrote:
> 
> > On Mon, 10 Apr 2017, NeilBrown wrote:
> >
> >> mempool_alloc() should only be called with GFP_ATOMIC when
> >> it is not safe to wait. Passing __GFP_NOFAIL to kmalloc()
> >> says that it is safe to wait indefinitely.  So this code is
> >> inconsistent.
> >> 
> >> Clearly it is OK to wait indefinitely in this code, and
> >> mempool_alloc() is able to do that.  So just use
> >> mempool_alloc, and allow it to sleep.  If no memory is
> >> available it will wait for something to be returned to the
> >> pool, and will retry a normal allocation regularly.
> >
> > The region hash code is buggy anyway, because it may allocate more entries 
> > than the size of the pool and not give them back.
> >
> > That kmalloc was introduced in the commit c06aad854 to work around a 
> > deadlock due to incorrect mempool usage.
> >
> > Your patch slightly increases the probability of the deadlock because 
> > mempool_alloc does all allocations with __GFP_NOMEMALLOC, so it uses 
> > higher limit than kmalloc(GFP_NOIO).
> >
> 
> Thanks for the review.
> 
> I had a look at how the allocation 'dm_region' objects are used,
> and it would take a bit of work to make it really safe.
> My guess is __rh_find() should be allowed to fail, and the various
> callers need to handle failure.
> For example, dm_rh_inc_pending() would be given a second bio_list,
> and would move any bios for which rh_inc() fails, onto that list.
> Then do_writes() would merge that list back into ms->writes.
> That way do_mirror() would not block indefinitely and forward progress
> could be assured ... maybe.
> It would take more work than I'm able to give at the moment, so
> I'm happy to just drop this patch.
> 
> Thanks,
> NeilBrown

I think that the only way how to fix this would be to preallocate the all 
the regions when the target is created.

But, with the default region size 512kiB, it would cause high memory 
consumption (approximatelly 1GB of RAM for 20TB device).

Mikulas


Re: [dm-devel] [PATCH] dm-region-hash: fix strange usage of mempool_alloc.

2017-05-03 Thread Mikulas Patocka


On Mon, 24 Apr 2017, NeilBrown wrote:

> On Fri, Apr 21 2017, Mikulas Patocka wrote:
> 
> > On Mon, 10 Apr 2017, NeilBrown wrote:
> >
> >> mempool_alloc() should only be called with GFP_ATOMIC when
> >> it is not safe to wait. Passing __GFP_NOFAIL to kmalloc()
> >> says that it is safe to wait indefinitely.  So this code is
> >> inconsistent.
> >> 
> >> Clearly it is OK to wait indefinitely in this code, and
> >> mempool_alloc() is able to do that.  So just use
> >> mempool_alloc, and allow it to sleep.  If no memory is
> >> available it will wait for something to be returned to the
> >> pool, and will retry a normal allocation regularly.
> >
> > The region hash code is buggy anyway, because it may allocate more entries 
> > than the size of the pool and not give them back.
> >
> > That kmalloc was introduced in the commit c06aad854 to work around a 
> > deadlock due to incorrect mempool usage.
> >
> > Your patch slightly increases the probability of the deadlock because 
> > mempool_alloc does all allocations with __GFP_NOMEMALLOC, so it uses 
> > higher limit than kmalloc(GFP_NOIO).
> >
> 
> Thanks for the review.
> 
> I had a look at how the allocation 'dm_region' objects are used,
> and it would take a bit of work to make it really safe.
> My guess is __rh_find() should be allowed to fail, and the various
> callers need to handle failure.
> For example, dm_rh_inc_pending() would be given a second bio_list,
> and would move any bios for which rh_inc() fails, onto that list.
> Then do_writes() would merge that list back into ms->writes.
> That way do_mirror() would not block indefinitely and forward progress
> could be assured ... maybe.
> It would take more work than I'm able to give at the moment, so
> I'm happy to just drop this patch.
> 
> Thanks,
> NeilBrown

I think that the only way how to fix this would be to preallocate the all 
the regions when the target is created.

But, with the default region size 512kiB, it would cause high memory 
consumption (approximatelly 1GB of RAM for 20TB device).

Mikulas


  1   2   3   4   5   6   7   8   9   10   >