[PATCH v3 0/1] aacraid: Host adapter Adaptec 6405 constantly resets under high io load

2019-08-19 Thread Konstantin Khorenko
Problem description:

A node with Adaptec 6405 controller, latest BIOS V5.3-0[19204]
A lot of disks attached to the controller.
Simple test: running mkfs.ext4 on many disks on the same controller in
parallel (mkfs is not important here, any serious io load triggers controller
aborts)

Results:
* no problems (controller resets) with kernels prior to
  395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

* latest ms kernel v5.2-rc6-15-g249155c20f9b - mkfs processes are in D state,
  lot of complains in logs like:

  [  654.894633] aacraid: Host adapter abort request.
  aacraid: Outstanding commands on (0,1,43,0):
  [  699.441034] aacraid: Host adapter abort request.
  aacraid: Outstanding commands on (0,1,40,0):
  [  699.442950] aacraid: Host adapter reset request. SCSI hang ?
  [  714.457428] aacraid: Host adapter reset request. SCSI hang ?
  ...
  [  759.514759] aacraid: Host adapter reset request. SCSI hang ?
  [  759.514869] aacraid :03:00.0: outstanding cmd: midlevel-0
  [  759.514870] aacraid :03:00.0: outstanding cmd: lowlevel-0
  [  759.514872] aacraid :03:00.0: outstanding cmd: error handler-498
  [  759.514873] aacraid :03:00.0: outstanding cmd: firmware-471
  [  759.514875] aacraid :03:00.0: outstanding cmd: kernel-60
  [  759.514912] aacraid :03:00.0: Controller reset type is 3
  [  759.515013] aacraid :03:00.0: Issuing IOP reset
  [  850.296705] aacraid :03:00.0: IOP reset succeeded

Same complains on Ubuntu kernel 4.15.0-50-generic:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586

Controller:
===
03:00.0 RAID bus controller: Adaptec Series 6 - 6G SAS/PCIe 2 (rev 01)
 Subsystem: Adaptec Series 6 - ASR-6405 - 4 internal 6G SAS ports

Test:
=
# cat dev.list
/dev/sdq1
/dev/sde1
/dev/sds1
/dev/sdb1
/dev/sdk1
/dev/sdaj1
/dev/sdaf1
/dev/sdd1
/dev/sdac1
/dev/sdai1
/dev/sdz1
/dev/sdj1
/dev/sdy1
/dev/sdn1
/dev/sdae1
/dev/sdg1
/dev/sdi1
/dev/sdc1
/dev/sdf1
/dev/sdl1
/dev/sda1
/dev/sdab1
/dev/sdr1
/dev/sdo1
/dev/sdah1
/dev/sdm1
/dev/sdt1
/dev/sdp1
/dev/sdad1
/dev/sdh1

===
# cat run_mkfs.sh
#!/bin/bash

while read i; do
   mkfs.ext4 $i -q -E lazy_itable_init=1 -O uninit_bg -m 0 &
done

=
# cat dev.list | ./run_mkfs.sh

The issue is 100% reproducible.

i've bisected to the culprit patch, it's
395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

it changes arc ctrl checks for Series-6 controllers
and i've checked that resurrection of original logic in arc ctrl checks
eliminates controller hangs/resets.

Konstantin Khorenko (1):
  scsi: aacraid: resurrect correct arc ctrl checks for Series-6

--
v3 changes:
 * introduced another wrapper to check for devices except for Series 6
   controllers upon request from Sagar Biradar (Microchip)

 * dropped mentions of private bug ids


 drivers/scsi/aacraid/aacraid.h  | 11 +++
 drivers/scsi/aacraid/comminit.c |  5 ++---
 drivers/scsi/aacraid/linit.c|  2 +-
 3 files changed, 14 insertions(+), 4 deletions(-)

-- 
2.15.1



[PATCH v3 1/1] scsi: aacraid: resurrect correct arc ctrl checks for Series-6

2019-08-19 Thread Konstantin Khorenko
The patch introduces another wrapper similar to aac_is_src()
which avoids checking for Series 6 devices.

Use this new wrapper in order to revert original arc ctrl checks for
Series-6 controllers which were occasionally changed by commit
395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

The patch above not only drops Series-9 cards checks but also
changes logic for Series-6 controllers which lead to controller
hungs/resets under high io load.

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586

Signed-off-by: Konstantin Khorenko 
---
 drivers/scsi/aacraid/aacraid.h  | 11 +++
 drivers/scsi/aacraid/comminit.c |  5 ++---
 drivers/scsi/aacraid/linit.c|  2 +-
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 3fa03230f6ba..ddfa78c05728 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -2740,6 +2740,17 @@ static inline int aac_is_src(struct aac_dev *dev)
return 0;
 }
 
+static inline int aac_is_srcv(struct aac_dev *dev)
+{
+   u16 device = dev->pdev->device;
+
+   if (device == PMC_DEVICE_S7 ||
+   device == PMC_DEVICE_S8)
+   return 1;
+
+   return 0;
+}
+
 static inline int aac_supports_2T(struct aac_dev *dev)
 {
return (dev->adapter_info.options & AAC_OPT_NEW_COMM_64);
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index d4fcfa1e54e0..1918e46ae3ec 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -349,8 +349,7 @@ int aac_send_shutdown(struct aac_dev * dev)
/* FIB should be freed only after getting the response from the F/W */
if (status != -ERESTARTSYS)
aac_fib_free(fibctx);
-   if (aac_is_src(dev) &&
-dev->msi_enabled)
+   if (aac_is_srcv(dev) && dev->msi_enabled)
aac_set_intx_mode(dev);
return status;
 }
@@ -605,7 +604,7 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
dev->max_fib_size = status[1] & 0xFFE0;
host->sg_tablesize = status[2] >> 16;
dev->sg_tablesize = status[2] & 0x;
-   if (aac_is_src(dev)) {
+   if (aac_is_srcv(dev)) {
if (host->can_queue > (status[3] >> 16) -
AAC_NUM_MGT_FIB)
host->can_queue = (status[3] >> 16) -
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 644f7f5c61a2..c8badc9d9ae7 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1835,7 +1835,7 @@ static int aac_acquire_resources(struct aac_dev *dev)
aac_adapter_enable_int(dev);
 
 
-   if (aac_is_src(dev))
+   if (aac_is_srcv(dev))
aac_define_int_mode(dev);
 
if (dev->msi_enabled)
-- 
2.15.1



[PATCH v2 0/2] aacraid: Host adapter Adaptec 6405 constantly resets under high io load

2019-07-10 Thread Konstantin Khorenko
Problem description:

A node with Adaptec 6405 controller, latest BIOS V5.3-0[19204]
A lot of disks attached to the controller.
Simple test: running mkfs.ext4 on many disks on the same controller in
parallel
(mkfs is not important here, any serious io load triggers controller aborts)

Results:
* no problems (controller resets) with kernels prior to
  395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

* latest ms kernel v5.2-rc6-15-g249155c20f9b - mkfs processes are in D state,
  lot of complains in logs like:

  [  654.894633] aacraid: Host adapter abort request.
  aacraid: Outstanding commands on (0,1,43,0):
  [  699.441034] aacraid: Host adapter abort request.
  aacraid: Outstanding commands on (0,1,40,0):
  [  699.442950] aacraid: Host adapter reset request. SCSI hang ?
  [  714.457428] aacraid: Host adapter reset request. SCSI hang ?
  ...
  [  759.514759] aacraid: Host adapter reset request. SCSI hang ?
  [  759.514869] aacraid :03:00.0: outstanding cmd: midlevel-0
  [  759.514870] aacraid :03:00.0: outstanding cmd: lowlevel-0
  [  759.514872] aacraid :03:00.0: outstanding cmd: error handler-498
  [  759.514873] aacraid :03:00.0: outstanding cmd: firmware-471
  [  759.514875] aacraid :03:00.0: outstanding cmd: kernel-60
  [  759.514912] aacraid :03:00.0: Controller reset type is 3
  [  759.515013] aacraid :03:00.0: Issuing IOP reset
  [  850.296705] aacraid :03:00.0: IOP reset succeeded

Same complains on Ubuntu kernel 4.15.0-50-generic:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586

Controller:
===
03:00.0 RAID bus controller: Adaptec Series 6 - 6G SAS/PCIe 2 (rev 01)
 Subsystem: Adaptec Series 6 - ASR-6405 - 4 internal 6G SAS ports

Test:
=
# cat dev.list
/dev/sdq1
/dev/sde1
/dev/sds1
/dev/sdb1
/dev/sdk1
/dev/sdaj1
/dev/sdaf1
/dev/sdd1
/dev/sdac1
/dev/sdai1
/dev/sdz1
/dev/sdj1
/dev/sdy1
/dev/sdn1
/dev/sdae1
/dev/sdg1
/dev/sdi1
/dev/sdc1
/dev/sdf1
/dev/sdl1
/dev/sda1
/dev/sdab1
/dev/sdr1
/dev/sdo1
/dev/sdah1
/dev/sdm1
/dev/sdt1
/dev/sdp1
/dev/sdad1
/dev/sdh1

===
# cat run_mkfs.sh
#!/bin/bash

while read i; do
   mkfs.ext4 $i -q -E lazy_itable_init=1 -O uninit_bg -m 0 &
done

=
# cat dev.list | ./run_mkfs.sh

The issue is 100% reproducible.

i've bisected to the culprit patch, it's
395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

it changes arc ctrl checks for Series-6 controllers
and i've checked that resurrection of original logic in arc ctrl checks
eliminates controller hangs/resets.

Konstantin Khorenko (2):
  Revert "scsi: aacraid: Remove reference to Series-9"
  scsi: aacraid: Remove references to Series-9 (only)

 drivers/scsi/aacraid/aacraid.h  | 11 ---
 drivers/scsi/aacraid/comminit.c | 15 +++
 drivers/scsi/aacraid/commsup.c  |  4 +++-
 drivers/scsi/aacraid/linit.c|  8 +---
 4 files changed, 19 insertions(+), 19 deletions(-)

-- 
2.15.1



[PATCH v2 2/2] scsi: aacraid: Remove references to Series-9 (only)

2019-07-10 Thread Konstantin Khorenko
The patch removes references to Series 9 adapters following
395e5df79a95 ("scsi: aacraid: Remove reference to Series-9"),
but doesn't touch Series 6 adapters logic.

Leaving Series 6 adapters untouched avoids controller
hungs/resets under high io load.

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586
https://bugzilla.redhat.com/show_bug.cgi?id=1724077
https://jira.sw.ru/browse/PSBM-95736

Signed-off-by: Konstantin Khorenko 
---
 drivers/scsi/aacraid/aacraid.h  | 1 -
 drivers/scsi/aacraid/comminit.c | 9 +++--
 drivers/scsi/aacraid/commsup.c  | 3 +--
 drivers/scsi/aacraid/linit.c| 8 +++-
 4 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index aef47d0e718c..b674fb645523 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -416,7 +416,6 @@ struct aac_ciss_identify_pd {
 #define PMC_DEVICE_S6  0x28b
 #define PMC_DEVICE_S7  0x28c
 #define PMC_DEVICE_S8  0x28d
-#define PMC_DEVICE_S9  0x28f
 
 #define aac_phys_to_logical(x)  ((x)+1)
 #define aac_logical_to_phys(x)  ((x)?(x)-1:0)
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index edaa2d53e704..c8db6614b712 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -353,8 +353,7 @@ int aac_send_shutdown(struct aac_dev * dev)
if (status != -ERESTARTSYS)
aac_fib_free(fibctx);
if ((dev->pdev->device == PMC_DEVICE_S7 ||
-dev->pdev->device == PMC_DEVICE_S8 ||
-dev->pdev->device == PMC_DEVICE_S9) &&
+dev->pdev->device == PMC_DEVICE_S8) &&
 dev->msi_enabled)
aac_set_intx_mode(dev);
return status;
@@ -611,8 +610,7 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
host->sg_tablesize = status[2] >> 16;
dev->sg_tablesize = status[2] & 0x;
if (dev->pdev->device == PMC_DEVICE_S7 ||
-   dev->pdev->device == PMC_DEVICE_S8 ||
-   dev->pdev->device == PMC_DEVICE_S9) {
+   dev->pdev->device == PMC_DEVICE_S8) {
if (host->can_queue > (status[3] >> 16) -
AAC_NUM_MGT_FIB)
host->can_queue = (status[3] >> 16) -
@@ -633,8 +631,7 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
 
if (dev->pdev->device == PMC_DEVICE_S6 ||
dev->pdev->device == PMC_DEVICE_S7 ||
-   dev->pdev->device == PMC_DEVICE_S8 ||
-   dev->pdev->device == PMC_DEVICE_S9)
+   dev->pdev->device == PMC_DEVICE_S8)
aac_define_int_mode(dev);
/*
 *  Ok now init the communication subsystem
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index b047b1e2215a..705e003caa95 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -2576,8 +2576,7 @@ void aac_free_irq(struct aac_dev *dev)
 
if (dev->pdev->device == PMC_DEVICE_S6 ||
dev->pdev->device == PMC_DEVICE_S7 ||
-   dev->pdev->device == PMC_DEVICE_S8 ||
-   dev->pdev->device == PMC_DEVICE_S9) {
+   dev->pdev->device == PMC_DEVICE_S8) {
if (dev->max_msix > 1) {
for (i = 0; i < dev->max_msix; i++)
free_irq(pci_irq_vector(dev->pdev, i),
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index f669a4405217..d5082b191aa8 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1561,8 +1561,7 @@ static void __aac_shutdown(struct aac_dev * aac)
aac_adapter_disable_int(aac);
if (aac->pdev->device == PMC_DEVICE_S6 ||
aac->pdev->device == PMC_DEVICE_S7 ||
-   aac->pdev->device == PMC_DEVICE_S8 ||
-   aac->pdev->device == PMC_DEVICE_S9) {
+   aac->pdev->device == PMC_DEVICE_S8) {
if (aac->max_msix > 1) {
for (i = 0; i < aac->max_msix; i++) {
free_irq(pci_irq_vector(aac->pdev, i),
@@ -1837,9 +1836,8 @@ static int aac_acquire_resources(struct aac_dev *dev)
aac_adapter_enable_int(dev);
 
 
-   if ((dev->pdev->device == PMC_DEVICE_S7 ||
-dev->pdev->device == PMC_DEVICE_S8 ||
-dev->pdev->device == PMC_DEVICE_S9))
+   if (dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8)
aac_define_int_mode(dev);
 
if (dev->msi_enabled)
-- 
2.15.1



[PATCH v2 1/2] Revert "scsi: aacraid: Remove reference to Series-9"

2019-07-10 Thread Konstantin Khorenko
This reverts commit 395e5df79a9588abf1099ea746f11872c9086252.

The patch being reverted not only drops Series-9 cards
checks but also changes logic for Series-6 controllers which
lead to controller hungs/resets under high io load.

So revert the original patch, references to Series 9 are to
be removed by next patch.

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586
https://bugzilla.redhat.com/show_bug.cgi?id=1724077
https://jira.sw.ru/browse/PSBM-95736

Signed-off-by: Konstantin Khorenko 
---
 drivers/scsi/aacraid/aacraid.h  | 12 +---
 drivers/scsi/aacraid/comminit.c | 18 ++
 drivers/scsi/aacraid/commsup.c  |  5 -
 drivers/scsi/aacraid/linit.c| 10 +++---
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 3fa03230f6ba..aef47d0e718c 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -416,6 +416,7 @@ struct aac_ciss_identify_pd {
 #define PMC_DEVICE_S6  0x28b
 #define PMC_DEVICE_S7  0x28c
 #define PMC_DEVICE_S8  0x28d
+#define PMC_DEVICE_S9  0x28f
 
 #define aac_phys_to_logical(x)  ((x)+1)
 #define aac_logical_to_phys(x)  ((x)?(x)-1:0)
@@ -2729,17 +2730,6 @@ int _aac_rx_init(struct aac_dev *dev);
 int aac_rx_select_comm(struct aac_dev *dev, int comm);
 int aac_rx_deliver_producer(struct fib * fib);
 
-static inline int aac_is_src(struct aac_dev *dev)
-{
-   u16 device = dev->pdev->device;
-
-   if (device == PMC_DEVICE_S6 ||
-   device == PMC_DEVICE_S7 ||
-   device == PMC_DEVICE_S8)
-   return 1;
-   return 0;
-}
-
 static inline int aac_supports_2T(struct aac_dev *dev)
 {
return (dev->adapter_info.options & AAC_OPT_NEW_COMM_64);
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index d4fcfa1e54e0..edaa2d53e704 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -41,8 +41,11 @@ static inline int aac_is_msix_mode(struct aac_dev *dev)
 {
u32 status = 0;
 
-   if (aac_is_src(dev))
+   if (dev->pdev->device == PMC_DEVICE_S6 ||
+   dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8) {
status = src_readl(dev, MUnit.OMR);
+   }
return (status & AAC_INT_MODE_MSIX);
 }
 
@@ -349,7 +352,9 @@ int aac_send_shutdown(struct aac_dev * dev)
/* FIB should be freed only after getting the response from the F/W */
if (status != -ERESTARTSYS)
aac_fib_free(fibctx);
-   if (aac_is_src(dev) &&
+   if ((dev->pdev->device == PMC_DEVICE_S7 ||
+dev->pdev->device == PMC_DEVICE_S8 ||
+dev->pdev->device == PMC_DEVICE_S9) &&
 dev->msi_enabled)
aac_set_intx_mode(dev);
return status;
@@ -605,7 +610,9 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
dev->max_fib_size = status[1] & 0xFFE0;
host->sg_tablesize = status[2] >> 16;
dev->sg_tablesize = status[2] & 0x;
-   if (aac_is_src(dev)) {
+   if (dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8 ||
+   dev->pdev->device == PMC_DEVICE_S9) {
if (host->can_queue > (status[3] >> 16) -
AAC_NUM_MGT_FIB)
host->can_queue = (status[3] >> 16) -
@@ -624,7 +631,10 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
pr_warn("numacb=%d ignored\n", numacb);
}
 
-   if (aac_is_src(dev))
+   if (dev->pdev->device == PMC_DEVICE_S6 ||
+   dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8 ||
+   dev->pdev->device == PMC_DEVICE_S9)
aac_define_int_mode(dev);
/*
 *  Ok now init the communication subsystem
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 2142a649e865..b047b1e2215a 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -2574,7 +2574,10 @@ void aac_free_irq(struct aac_dev *dev)
 {
int i;
 
-   if (aac_is_src(dev)) {
+   if (dev->pdev->device == PMC_DEVICE_S6 ||
+   dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8 ||
+   dev->pdev->device == PMC_DEVICE_S9) {
if (dev->max_msix > 1) {
for (i = 0; i < dev->max_msix; i++)
free_irq(pci_irq_vector(dev->pdev, i),
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
inde

Re: [PATCH 1/1] scsi: aacraid: resurrect correct arc ctrl checks for Series-6

2019-07-10 Thread Konstantin Khorenko
On 07/08/2019 02:49 AM, Finn Thain wrote:
> Andrey,
>
> It is helpful to send your review to the patch author. I've added
> Konstantin to the Cc list, as well as Raghava (who introduced the
> regression addressed by Konstantin's patch).
>
> If I'm not mistaken, your review misunderstands the patch description.
>
> FWIW, Konstantin's patch might have been easier to follow if it was a
> simple 'git revert'.

Hi Finn, Andrey,

Finn,
thank you for putting me back to the thread, appreciated.
And i agree with you, may be git revert followed by independent patch
which removes Series-9 mentions is easier to read, so sending the second
version - in that way.


Andrey,
please take a look at the new version patches, hope it's easier to understand.

And talking about the helper: i thought about leaving it, but we have several 
places which check for Series 7 and 8 only
and several places which check for Series 6,7,8, so either
- we need 2 helpers
- we have a helper to check for Series 7,8 only and in some places will have a 
check for Series 6 + helper
- introduce the helper with parameter

Honestly i don't like any of variants above, so just left the code without 
helper,
not that many checks and easier to read the code IMHO.

Thank you!

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


[PATCH 0/1] aacraid: Host adapter Adaptec 6405 constantly resets under high io load

2019-06-27 Thread Konstantin Khorenko
Problem description:

A node with Adaptec 6405 controller, latest BIOS V5.3-0[19204]
A lot of disks attached to the controller.
Simple test: running mkfs.ext4 on many disks on the same controller in
parallel
(mkfs is not important here, any serious io load triggers controller aborts)

Results:
* no problems (controller resets) with kernels prior to
  395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

* latest ms kernel v5.2-rc6-15-g249155c20f9b - mkfs processes are in D state,
  lot of complains in logs like:

  [  654.894633] aacraid: Host adapter abort request.
  aacraid: Outstanding commands on (0,1,43,0):
  [  699.441034] aacraid: Host adapter abort request.
  aacraid: Outstanding commands on (0,1,40,0):
  [  699.442950] aacraid: Host adapter reset request. SCSI hang ?
  [  714.457428] aacraid: Host adapter reset request. SCSI hang ?
  ...
  [  759.514759] aacraid: Host adapter reset request. SCSI hang ?
  [  759.514869] aacraid :03:00.0: outstanding cmd: midlevel-0
  [  759.514870] aacraid :03:00.0: outstanding cmd: lowlevel-0
  [  759.514872] aacraid :03:00.0: outstanding cmd: error handler-498
  [  759.514873] aacraid :03:00.0: outstanding cmd: firmware-471
  [  759.514875] aacraid :03:00.0: outstanding cmd: kernel-60
  [  759.514912] aacraid :03:00.0: Controller reset type is 3
  [  759.515013] aacraid :03:00.0: Issuing IOP reset
  [  850.296705] aacraid :03:00.0: IOP reset succeeded

Same complains on Ubuntu kernel 4.15.0-50-generic:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586

Controller:
===
03:00.0 RAID bus controller: Adaptec Series 6 - 6G SAS/PCIe 2 (rev 01)
 Subsystem: Adaptec Series 6 - ASR-6405 - 4 internal 6G SAS ports

Test:
=
# cat dev.list
/dev/sdq1
/dev/sde1
/dev/sds1
/dev/sdb1
/dev/sdk1
/dev/sdaj1
/dev/sdaf1
/dev/sdd1
/dev/sdac1
/dev/sdai1
/dev/sdz1
/dev/sdj1
/dev/sdy1
/dev/sdn1
/dev/sdae1
/dev/sdg1
/dev/sdi1
/dev/sdc1
/dev/sdf1
/dev/sdl1
/dev/sda1
/dev/sdab1
/dev/sdr1
/dev/sdo1
/dev/sdah1
/dev/sdm1
/dev/sdt1
/dev/sdp1
/dev/sdad1
/dev/sdh1

===
# cat run_mkfs.sh
#!/bin/bash

while read i; do
   mkfs.ext4 $i -q -E lazy_itable_init=1 -O uninit_bg -m 0 &
done

=
# cat dev.list | ./run_mkfs.sh

The issue is 100% reproducible.

i've bisected to the culprit patch, it's
395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

it changes arc ctrl checks for Series-6 controllers
and i've checked that resurrection of original logic in arc ctrl checks
eliminates controller hangs/resets.


Konstantin Khorenko (1):
  scsi: aacraid: resurrect correct arc ctrl checks for Series-6

 drivers/scsi/aacraid/aacraid.h  | 11 ---
 drivers/scsi/aacraid/comminit.c | 14 ++
 drivers/scsi/aacraid/commsup.c  |  4 +++-
 drivers/scsi/aacraid/linit.c|  7 +--
 4 files changed, 18 insertions(+), 18 deletions(-)

-- 
2.15.1



[PATCH 1/1] scsi: aacraid: resurrect correct arc ctrl checks for Series-6

2019-06-27 Thread Konstantin Khorenko
This partially reverts ms commit
395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")

The patch above not only drops Series-9 cards checks but also
changes logic for Series-6 controllers which leads to controller
hangs/resets under high io load.

So revert to original arc ctrl checks for Series-6 controllers.

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1777586
https://bugzilla.redhat.com/show_bug.cgi?id=1724077
https://jira.sw.ru/browse/PSBM-95736

Fixes: 395e5df79a95 ("scsi: aacraid: Remove reference to Series-9")
Cc: sta...@vger.kernel.org

Signed-off-by: Konstantin Khorenko 
---
 drivers/scsi/aacraid/aacraid.h  | 11 ---
 drivers/scsi/aacraid/comminit.c | 14 ++
 drivers/scsi/aacraid/commsup.c  |  4 +++-
 drivers/scsi/aacraid/linit.c|  7 +--
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 3fa03230f6ba..b674fb645523 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -2729,17 +2729,6 @@ int _aac_rx_init(struct aac_dev *dev);
 int aac_rx_select_comm(struct aac_dev *dev, int comm);
 int aac_rx_deliver_producer(struct fib * fib);
 
-static inline int aac_is_src(struct aac_dev *dev)
-{
-   u16 device = dev->pdev->device;
-
-   if (device == PMC_DEVICE_S6 ||
-   device == PMC_DEVICE_S7 ||
-   device == PMC_DEVICE_S8)
-   return 1;
-   return 0;
-}
-
 static inline int aac_supports_2T(struct aac_dev *dev)
 {
return (dev->adapter_info.options & AAC_OPT_NEW_COMM_64);
diff --git a/drivers/scsi/aacraid/comminit.c b/drivers/scsi/aacraid/comminit.c
index d4fcfa1e54e0..b8046b6c1239 100644
--- a/drivers/scsi/aacraid/comminit.c
+++ b/drivers/scsi/aacraid/comminit.c
@@ -41,7 +41,9 @@ static inline int aac_is_msix_mode(struct aac_dev *dev)
 {
u32 status = 0;
 
-   if (aac_is_src(dev))
+   if (dev->pdev->device == PMC_DEVICE_S6 ||
+   dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8)
status = src_readl(dev, MUnit.OMR);
return (status & AAC_INT_MODE_MSIX);
 }
@@ -349,7 +351,8 @@ int aac_send_shutdown(struct aac_dev * dev)
/* FIB should be freed only after getting the response from the F/W */
if (status != -ERESTARTSYS)
aac_fib_free(fibctx);
-   if (aac_is_src(dev) &&
+   if ((dev->pdev->device == PMC_DEVICE_S7 ||
+dev->pdev->device == PMC_DEVICE_S8) &&
 dev->msi_enabled)
aac_set_intx_mode(dev);
return status;
@@ -605,7 +608,8 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
dev->max_fib_size = status[1] & 0xFFE0;
host->sg_tablesize = status[2] >> 16;
dev->sg_tablesize = status[2] & 0x;
-   if (aac_is_src(dev)) {
+   if (dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8) {
if (host->can_queue > (status[3] >> 16) -
AAC_NUM_MGT_FIB)
host->can_queue = (status[3] >> 16) -
@@ -624,7 +628,9 @@ struct aac_dev *aac_init_adapter(struct aac_dev *dev)
pr_warn("numacb=%d ignored\n", numacb);
}
 
-   if (aac_is_src(dev))
+   if (dev->pdev->device == PMC_DEVICE_S6 ||
+   dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8)
aac_define_int_mode(dev);
/*
 *  Ok now init the communication subsystem
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index 2142a649e865..705e003caa95 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -2574,7 +2574,9 @@ void aac_free_irq(struct aac_dev *dev)
 {
int i;
 
-   if (aac_is_src(dev)) {
+   if (dev->pdev->device == PMC_DEVICE_S6 ||
+   dev->pdev->device == PMC_DEVICE_S7 ||
+   dev->pdev->device == PMC_DEVICE_S8) {
if (dev->max_msix > 1) {
for (i = 0; i < dev->max_msix; i++)
free_irq(pci_irq_vector(dev->pdev, i),
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 644f7f5c61a2..3b7968b17169 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -1560,7 +1560,9 @@ static void __aac_shutdown(struct aac_dev * aac)
 
aac_adapter_disable_int(aac);
 
-   if (aac_is_src(aac)) {
+   if (aac->pdev->device == PMC_DEVICE_S6 ||
+   aac->pdev->device == PMC_DEVICE_S7 ||
+   aac->pdev->device == PMC_DEVICE_S8) {
if (aa

Re: [PATCH RFC 1/1] kernfs: keep kernfs node alive for __kernfs_remove()

2019-04-17 Thread Konstantin Khorenko
On 04/16/2019 10:17 PM, Tejun Heo wrote:
> On Tue, Apr 16, 2019 at 06:53:35PM +0300, Konstantin Khorenko wrote:
>> __kernfs_remove() which is called under kernfs_mutex,
>> assumes nobody kills kernfs node whie it's working on it
>> and "get"s current kernfs node for that.
>>
>> But we hit a warning in kernfs_get(): kn->counter == 0 already:
>>   [ cut here ]
>>   WARNING: CPU: 2 PID: 63923 at fs/kernfs/dir.c:377 kernfs_get+0x2f/0x40
>>   ...
>>   Call Trace:
>>[] dump_stack+0x19/0x1b
>>[] __warn+0xd8/0x100
>>[] warn_slowpath_null+0x1d/0x20
>>[] kernfs_get+0x2f/0x40
>>[] __kernfs_remove+0x113/0x260
>>[] kernfs_remove+0x21/0x30
>>[] sysfs_remove_dir+0x50/0x80
>>[] kobject_del+0x18/0x50
>>[] sysfs_slab_remove+0x3d/0x50
>>[] do_kmem_cache_release+0x3b/0x70
>>[] memcg_destroy_kmem_caches+0xb1/0xf0
>>[] mem_cgroup_css_free+0x4c/0x280
>>[] cgroup_free_fn+0x4c/0x120
>>[] process_one_work+0x182/0x440
>>[] worker_thread+0x126/0x3c0
>>[] kthread+0xd1/0xe0
>>
>> This could be for example because of kernfs_notify_workfn() which
>> does kernfs_put(kn) out of kernfs_mutex held section,
>> so move kernfs_put(kn) under the mutex.
>
> This patch doesn't really make sense to me.  Can you give a more
> concrete scenario where this would help?

i don't know the full scenario unfortunately, but the idea is the following:

__kernfs_remove() is called under kernfs_mutex and if
   !(!kn || (kn->parent && RB_EMPTY_NODE(>rb)))

it assumes that nothing can change while we hold the mutex and
for each kernfs descendant should have kn->count > 0.

=
 /* deactivate and unlink the subtree node-by-node */
 do {
 pos = kernfs_leftmost_descendant(kn);

 /*
  * kernfs_drain() drops kernfs_mutex temporarily and @pos's
  * base ref could have been put by someone else by the time
  * the function returns.  Make sure it doesn't go away
  * underneath us.
  */
     kernfs_get(pos);
=

At the same time kernfs_notify_workfn() can do a kernfs_put() out of 
kernfs_mutex
which probably can be the last put and dec kn->count to 0 any moment.


Thank you.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


[PATCH RFC 0/1] kernfs: hit a warning in kernfs_get()

2019-04-16 Thread Konstantin Khorenko
We hit a warning on rather old kernel - RHEL7-based 3.10.0-xxx.
Please don't think it's really near 3.10 - both RedHat and we (Virtuozzo)
backport a lot of things from modern mainstream kernels and do own changes.

  [ cut here ]
  WARNING: CPU: 2 PID: 63923 at fs/kernfs/dir.c:377 kernfs_get+0x2f/0x40
  CPU: 2 PID: 63923 Comm: kworker/2:7 ve: 0 Kdump: loaded Not tainted
   3.10.0-957.10.1.vz7.85.12 #1 85.12
  ...
  Call Trace:
   [] dump_stack+0x19/0x1b
   [] __warn+0xd8/0x100
   [] warn_slowpath_null+0x1d/0x20
   [] kernfs_get+0x2f/0x40
   [] __kernfs_remove+0x113/0x260
   [] kernfs_remove+0x21/0x30
   [] sysfs_remove_dir+0x50/0x80
   [] kobject_del+0x18/0x50
   [] sysfs_slab_remove+0x3d/0x50
   [] do_kmem_cache_release+0x3b/0x70
   [] memcg_destroy_kmem_caches+0xb1/0xf0
   [] mem_cgroup_css_free+0x4c/0x280
   [] cgroup_free_fn+0x4c/0x120
   [] process_one_work+0x182/0x440
   [] worker_thread+0x126/0x3c0
   [] kthread+0xd1/0xe0

The warning has been triggered only once and so far i'm unable to reproduce it.

i'm not completely sure why __kernfs_remove() believes "pos" should always
have kn->counter > 0 as it holds kernfs_mutex, but kernfs_notify_workfn()
could definitely do a kernfs_put() out of kernfs_mutex.

So i suppose kernfs_put() should be put under kernfs_mutex() in
kernfs_notify_workfn().


Konstantin Khorenko (1):
  kernfs: keep kernfs node alive for __kernfs_remove()

 fs/kernfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.15.1



[PATCH RFC 1/1] kernfs: keep kernfs node alive for __kernfs_remove()

2019-04-16 Thread Konstantin Khorenko
__kernfs_remove() which is called under kernfs_mutex,
assumes nobody kills kernfs node whie it's working on it
and "get"s current kernfs node for that.

But we hit a warning in kernfs_get(): kn->counter == 0 already:
  [ cut here ]
  WARNING: CPU: 2 PID: 63923 at fs/kernfs/dir.c:377 kernfs_get+0x2f/0x40
  ...
  Call Trace:
   [] dump_stack+0x19/0x1b
   [] __warn+0xd8/0x100
   [] warn_slowpath_null+0x1d/0x20
   [] kernfs_get+0x2f/0x40
   [] __kernfs_remove+0x113/0x260
   [] kernfs_remove+0x21/0x30
   [] sysfs_remove_dir+0x50/0x80
   [] kobject_del+0x18/0x50
   [] sysfs_slab_remove+0x3d/0x50
   [] do_kmem_cache_release+0x3b/0x70
   [] memcg_destroy_kmem_caches+0xb1/0xf0
   [] mem_cgroup_css_free+0x4c/0x280
   [] cgroup_free_fn+0x4c/0x120
   [] process_one_work+0x182/0x440
   [] worker_thread+0x126/0x3c0
   [] kthread+0xd1/0xe0

This could be for example because of kernfs_notify_workfn() which
does kernfs_put(kn) out of kernfs_mutex held section,
so move kernfs_put(kn) under the mutex.

Signed-off-by: Konstantin Khorenko 
---
 fs/kernfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ae948aaa4c53..ab9c7e2064cc 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -915,8 +915,8 @@ static void kernfs_notify_workfn(struct work_struct *work)
iput(inode);
}
 
-   mutex_unlock(_mutex);
kernfs_put(kn);
+   mutex_unlock(_mutex);
goto repeat;
 }
 
-- 
2.15.1



[PATCH] tty/vt: avoid high order pages allocation on GIO_UNIMAP ioctl

2019-04-15 Thread Konstantin Khorenko
GIO_UNIMAP can easily result in a high order allocation,
seen 6th order allocation on radeondrmfb:

  fbcon: radeondrmfb (fb0) is primary device
  Console: switching to colour frame buffer device 160x64
  radeon :01:05.0: fb0: radeondrmfb frame buffer device
  WARNING: CPU: 0 PID: 78661 at mm/page_alloc.c:3532
__alloc_pages_nodemask+0x1b1/0x600
  order 6 >= 3, gfp 0x40d0

The warning is generated by a debug patch.

At the same time it's safe to use kvmalloc() for allocation in
con_get_unimap(), so let's do the substitution.

And do the same for con_set_unimap().

Signed-off-by: Konstantin Khorenko 
---
 drivers/tty/vt/consolemap.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c
index 7c7ada0b3ea0..b28aa0d289f8 100644
--- a/drivers/tty/vt/consolemap.c
+++ b/drivers/tty/vt/consolemap.c
@@ -542,7 +542,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct 
unipair __user *list)
if (!ct)
return 0;
 
-   unilist = memdup_user(list, ct * sizeof(struct unipair));
+   unilist = vmemdup_user(list, ct * sizeof(struct unipair));
if (IS_ERR(unilist))
return PTR_ERR(unilist);
 
@@ -641,7 +641,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct 
unipair __user *list)
 
 out_unlock:
console_unlock();
-   kfree(unilist);
+   kvfree(unilist);
return err;
 }
 
@@ -743,7 +743,7 @@ int con_get_unimap(struct vc_data *vc, ushort ct, ushort 
__user *uct, struct uni
struct uni_pagedir *p;
struct unipair *unilist;
 
-   unilist = kmalloc_array(ct, sizeof(struct unipair), GFP_KERNEL);
+   unilist = kvmalloc_array(ct, sizeof(struct unipair), GFP_KERNEL);
if (!unilist)
return -ENOMEM;
 
@@ -775,7 +775,7 @@ int con_get_unimap(struct vc_data *vc, ushort ct, ushort 
__user *uct, struct uni
if (copy_to_user(list, unilist, min(ect, ct) * sizeof(struct unipair)))
ret = -EFAULT;
put_user(ect, uct);
-   kfree(unilist);
+   kvfree(unilist);
return ret ? ret : (ect <= ct) ? 0 : -ENOMEM;
 }
 
-- 
2.15.1



Re: [PATCH 1/1] mm/page_alloc: add a warning about high order allocations

2018-12-28 Thread Konstantin Khorenko
On 12/27/2018 07:50 PM, Michal Hocko wrote:
> On Thu 27-12-18 16:05:18, Konstantin Khorenko wrote:
>> On 12/26/2018 11:40 AM, Michal Hocko wrote:
>>> Appart from general comments as a reply to the cover (btw. this all
>>> should be in the changelog because this is the _why_ part of the
>>> justification which should be _always_ part of the changelog).
>>
>> Thank you, will add in the next version of the patch alltogether
>> with other changes if any.
>>
>>> On Tue 25-12-18 18:39:27, Konstantin Khorenko wrote:
>>> [...]
>>>> +config WARN_HIGH_ORDER
>>>> +  bool "Enable complains about high order memory allocations"
>>>> +  depends on !LOCKDEP
>>>
>>> Why?
>>
>> LOCKDEP makes structures big, so if we see a high order allocation warning
>> on a debug kernel with lockdep, it does not give us a lot - lockdep enabled
>> kernel performance is not our target.
>> i can remove !LOCKDEP dependence here, but then need to adjust default
>> warning level i think, or logs will be spammed.
>
> OK, I see but this just points to how this is not really a suitable
> solution for the problem you are looking for.

i have to admit, yes, it is inconvenient to have such a dependency.
i would be really glad to hear alternatives...

Just to mention: tracepoints are for issues investigation (when we already have 
any),
and i'm thinking about issues prevention. Gathering places which might
lead to problem and rework to prevent possible issues.

>>>> +static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
>>>> +{
>>>> +  static atomic_t warn_count = ATOMIC_INIT(32);
>>>> +
>>>> +  if (order >= warn_order && !(gfp_mask & __GFP_NOWARN))
>>>> +  WARN(atomic_dec_if_positive(_count) >= 0,
>>>> +   "order %d >= %d, gfp 0x%x\n",
>>>> +   order, warn_order, gfp_mask);
>>>> +}
>>>
>>> We do have ratelimit functionality, so why cannot you use it?
>>
>> Well, my idea was to really shut up the warning after some number of messages
>> (if a node is in production and its uptime, say, a year, i don't want to see
>> many warnings in logs, first several is enough - let's fix them first).
>
> OK, but it is quite likely that the system is perfectly healthy and
> unfragmented after fresh boot when doing a large order allocations is
> perfectly fine.

You are right again, Michal. Thus just switch those early allocations to
kvmalloc() and on boot on an unfragmented system same kmalloc() will be used.
And no new warning for that place. And no any chance this place ever trigger
compaction (in case we did not notice some way this allocation might also
happen later, not on a fresh node).

 > Note that it is smaller order allocations that generate
 > fragmentation in general.

And again - yes, that's true. Still i don't know how to avoid memory 
fragmentation,
but can avoid (ok, significantly decrease the number of) big allocations which
might trigger compaction because of that fragmentation.


--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


Re: [RFC PATCH 0/1] mm: add a warning about high order allocations

2018-12-28 Thread Konstantin Khorenko
On 12/27/2018 07:46 PM, Michal Hocko wrote:
> On Thu 27-12-18 15:18:54, Konstantin Khorenko wrote:
>> Hi Michal,
>>
>> thank you very much for your questions, please see my notes below.
>>
>> On 12/26/2018 11:35 AM, Michal Hocko wrote:
>>> On Tue 25-12-18 18:39:26, Konstantin Khorenko wrote:
>>>> Q: Why do we need to bother at all?
>>>> A: If a node is highly loaded and its memory is significantly fragmented
>>>> (unfortunately almost any node with serious load has highly fragmented 
>>>> memory)
>>>> then any high order memory allocation can trigger massive memory shrink and
>>>> result in quite a big allocation latency. And the node becomes less 
>>>> responsive
>>>> and users don't like it.
>>>> The ultimate solution here is to get rid of large allocations, but we need 
>>>> an
>>>> instrument to detect them.
>>>
>>> Can you point to an example of the problem you are referring here? At
>>> least for costly orders we do bail out early and try to not cause
>>> massive reclaim. So what is the order that you are concerned about?
>>
>> Well, this is the most difficult question to answer.
>> Unfortunately i don't have a reproducer for that, usually we get into 
>> situation
>> when someone experiences significant node slowdown, nodes most often have a 
>> lot of RAM,
>> we check what is going on there and see the node is busy with reclaim.
>> And almost every time the reason was - fragmented memory and high order 
>> allocations.
>> Mostly of 2nd and 3rd (which is still considered not costly) order.
>>
>> Recent related issues we faced were about FUSE dev pipe:
>> d6d931adce11 ("fuse: use kvmalloc to allocate array of pipe_buffer structs.")
>>
>> and about bnx driver + mtu 9000 which for each packet required page of 2nd 
>> order
>> (and it even failed sometimes, though it was not the root cause):
>>  kswapd0: page allocation failure: order:2, mode:0x4020
>>  Call Trace:
>>  dump_stack+0x19/0x1b
>>  warn_alloc_failed+0x110/0x180
>>  __alloc_pages_nodemask+0x7bf/0xc60
>>  alloc_pages_current+0x98/0x110
>>  kmalloc_order+0x18/0x40
>>  kmalloc_order_trace+0x26/0xa0
>>  __kmalloc+0x279/0x290
>>  bnx2x_frag_alloc.isra.61+0x2a/0x40 [bnx2x]
>>  bnx2x_rx_int+0x227/0x17c0 [bnx2x]
>>  bnx2x_poll+0x1dd/0x260 [bnx2x]
>>  net_rx_action+0x179/0x390
>>  __do_softirq+0x10f/0x2aa
>>  call_softirq+0x1c/0x30
>>  do_softirq+0x65/0xa0
>>  irq_exit+0x105/0x110
>>  do_IRQ+0x56/0xe0
>>  common_interrupt+0x6d/0x6d
>>
>> And as both places were called very often - the system latency was high.
>>
>> This warning can be also used to catch allocation of 4th order and higher 
>> which may
>> easily fail. Those places which are ready to get allocation errors and have
>> fallbacks are marked with __GFP_NOWARN.
>
> This is not true in general, though.

Right now - yep, this is not true. Sorry, i was not clear enough here.
i meant after we catch all high order allocations, we review them and either
switch to kvmalloc() or mark with NOWARN if we are ready to handle allocation 
errors
in that particular place. So this is an ideal situation when we reviewed all 
the things.

> [...]
>> But after it's done and there are no (almost) unmarked high order 
>> allocations -
>> why not? This will reveal new cases of high order allocations soon.
>
> There will always be legitimate high order allocations.

Sure. But after we review them we either switch them to kvmalloc() or mark them 
with
NOWARN. In both cases we won't get new warnings about that places.

 > I believe that
> for your particular use case it is much better to simply enable reclaim
> and page allocator tracepoints which will give you not only the source
> of the allocation but also a much better picture

Tracepoints are much better for issues investigation, right. And we do so.

And warnings are intended not for investigation but for prevention of possible 
future issues.
If we spot a big allocation, we can review it in advance, before we face any 
problem,
and in most cases just switch it to use kvmalloc() in 90% cases and we never 
ever have
a problem with unexpected reclaim due to this allocation. Ever.
With any reclaim algorithm - the compaction just won't be triggered.

>> i think people who run systems with "kernel.panic_on_warn" enabled do care
>> about reporting issues.
>
> You surely do not 

Re: [PATCH 1/1] mm/page_alloc: add a warning about high order allocations

2018-12-27 Thread Konstantin Khorenko
On 12/26/2018 11:40 AM, Michal Hocko wrote:
> Appart from general comments as a reply to the cover (btw. this all
> should be in the changelog because this is the _why_ part of the
> justification which should be _always_ part of the changelog).

Thank you, will add in the next version of the patch alltogether
with other changes if any.

> On Tue 25-12-18 18:39:27, Konstantin Khorenko wrote:
> [...]
>> +config WARN_HIGH_ORDER
>> +bool "Enable complains about high order memory allocations"
>> +depends on !LOCKDEP
>
> Why?

LOCKDEP makes structures big, so if we see a high order allocation warning
on a debug kernel with lockdep, it does not give us a lot - lockdep enabled
kernel performance is not our target.
i can remove !LOCKDEP dependence here, but then need to adjust default
warning level i think, or logs will be spammed.

>> +default n
>> +help
>> +  Enables warnings on high order memory allocations. This allows to
>> +  determine users of large memory chunks and rework them to decrease
>> +  allocation latency. Note, some debug options make kernel structures
>> +  fat.
>> +
>> +config WARN_HIGH_ORDER_LEVEL
>> +int "Define page order level considered as too high"
>> +depends on WARN_HIGH_ORDER
>> +default 3
>> +help
>> +  Defines page order starting which the system to complain about.
>> +  Default is current PAGE_ALLOC_COSTLY_ORDER.
>> +
>>  config HWPOISON_INJECT
>>  tristate "HWPoison pages injector"
>>  depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index e95b5b7c9c3d..258892adb861 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4341,6 +4341,30 @@ static inline void finalise_ac(gfp_t gfp_mask, struct 
>> alloc_context *ac)
>>  ac->high_zoneidx, ac->nodemask);
>>  }
>>
>> +#ifdef CONFIG_WARN_HIGH_ORDER
>> +int warn_order = CONFIG_WARN_HIGH_ORDER_LEVEL;
>> +
>> +/*
>> + * Complain if we allocate a high order page unless there is a __GFP_NOWARN
>> + * flag provided.
>> + *
>> + * Shuts up after 32 complains.
>> + */
>> +static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
>> +{
>> +static atomic_t warn_count = ATOMIC_INIT(32);
>> +
>> +if (order >= warn_order && !(gfp_mask & __GFP_NOWARN))
>> +WARN(atomic_dec_if_positive(_count) >= 0,
>> + "order %d >= %d, gfp 0x%x\n",
>> + order, warn_order, gfp_mask);
>> +}
>
> We do have ratelimit functionality, so why cannot you use it?

Well, my idea was to really shut up the warning after some number of messages
(if a node is in production and its uptime, say, a year, i don't want to see
many warnings in logs, first several is enough - let's fix them first).

If i use printk_ratelimited() i could get 24 days delay at most AFAIK,
but okay, i can switch to printk_ratelimited() if you prefer.

struct ratelimit_state {
 int interval;

(gdb) p ((1LL<<31) -1)/1000/60/60/24
$11 = 24

>> +#else
>> +static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
>> +{
>> +}
>> +#endif
>> +
>>  /*
>>   * This is the 'heart' of the zoned buddy allocator.
>>   */
>> @@ -4361,6 +4385,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
>> order, int preferred_nid,
>>  WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
>>  return NULL;
>>  }
>> +warn_high_order(order, gfp_mask);
>>
>>  gfp_mask &= gfp_allowed_mask;
>>  alloc_mask = gfp_mask;
>
> Why do you warn about all allocations in the hot path? I thought you
> want to catch expensive allocations so I would assume that you would
> stick that into a slow path after we are not able to allocate anything
> after the first round of compaction.

The idea is to catch big allocations soon and preferably during testing,
not on production nodes under high load, that's why i've chosen hot path.

And if we switch to the slow path, we'll have to run all tests under
additional serious load - to generate memory fragmentation.
Not so convenient.

> Also do you want to warn about opportunistic GFP_NOWAIT allocations that
> have a reasonable fallback?

Yes, i would like to. Sometimes allocation flags come from upper level and
it's not always evident if there is GFP_NOWAIT flag or not, so let's we
are noticed about such a case and verify if it's legal and not called often.
If yes - we'll just mark it with NO_WARN.


 > And forgot to mention other opportunistic allocations like THP of
 > course.

hugepages are allocated with NOWARN already, AFAIK.
alloc_fresh_huge_page_node(), __hugetlb_alloc_buddy_huge_page()


Thank you once again, Michal.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


Re: [RFC PATCH 0/1] mm: add a warning about high order allocations

2018-12-27 Thread Konstantin Khorenko
Hi Michal,

thank you very much for your questions, please see my notes below.

On 12/26/2018 11:35 AM, Michal Hocko wrote:
> On Tue 25-12-18 18:39:26, Konstantin Khorenko wrote:
>> Q: Why do we need to bother at all?
>> A: If a node is highly loaded and its memory is significantly fragmented
>> (unfortunately almost any node with serious load has highly fragmented 
>> memory)
>> then any high order memory allocation can trigger massive memory shrink and
>> result in quite a big allocation latency. And the node becomes less 
>> responsive
>> and users don't like it.
>> The ultimate solution here is to get rid of large allocations, but we need an
>> instrument to detect them.
>
> Can you point to an example of the problem you are referring here? At
> least for costly orders we do bail out early and try to not cause
> massive reclaim. So what is the order that you are concerned about?

Well, this is the most difficult question to answer.
Unfortunately i don't have a reproducer for that, usually we get into situation
when someone experiences significant node slowdown, nodes most often have a lot 
of RAM,
we check what is going on there and see the node is busy with reclaim.
And almost every time the reason was - fragmented memory and high order 
allocations.
Mostly of 2nd and 3rd (which is still considered not costly) order.

Recent related issues we faced were about FUSE dev pipe:
d6d931adce11 ("fuse: use kvmalloc to allocate array of pipe_buffer structs.")

and about bnx driver + mtu 9000 which for each packet required page of 2nd order
(and it even failed sometimes, though it was not the root cause):
 kswapd0: page allocation failure: order:2, mode:0x4020
 Call Trace:
 dump_stack+0x19/0x1b
 warn_alloc_failed+0x110/0x180
 __alloc_pages_nodemask+0x7bf/0xc60
 alloc_pages_current+0x98/0x110
 kmalloc_order+0x18/0x40
 kmalloc_order_trace+0x26/0xa0
 __kmalloc+0x279/0x290
 bnx2x_frag_alloc.isra.61+0x2a/0x40 [bnx2x]
 bnx2x_rx_int+0x227/0x17c0 [bnx2x]
 bnx2x_poll+0x1dd/0x260 [bnx2x]
 net_rx_action+0x179/0x390
 __do_softirq+0x10f/0x2aa
 call_softirq+0x1c/0x30
 do_softirq+0x65/0xa0
 irq_exit+0x105/0x110
 do_IRQ+0x56/0xe0
 common_interrupt+0x6d/0x6d

And as both places were called very often - the system latency was high.

This warning can be also used to catch allocation of 4th order and higher which 
may
easily fail. Those places which are ready to get allocation errors and have
fallbacks are marked with __GFP_NOWARN.

>> Q: Why warning? Use tracepoints!
>> A: Well, this is a matter of magic defaults.
>> Yes, you can use tracepoints to catch large allocations, but you need to do 
>> this
>> on purpose and regularly and this is to be done by every developer which is
>> quite unreal.
>> On the other hand if you develop something and get a warning, you'll have to
>> think about the reason and either succeed with reworking the code to use
>> smaller allocation sizes (and thus decrease allocation latency!) or just use
>> kvmalloc() if you don't really need physically continuos chunk or come to the
>> conclusion you definitely need physically continuos memory and shut up the
>> warning.
>
> Well, not really. For one thing, there are systems to panic on warning
> and you really do not want to blow up just because somebody is doing a
> large order allocation.

Well, on one hand - yes, i agree with you. That's why i don't suggest to enable
the warning by default right now - until all (most) of large allocations are 
marked
properly.
But after it's done and there are no (almost) unmarked high order allocations -
why not? This will reveal new cases of high order allocations soon.
i think people who run systems with "kernel.panic_on_warn" enabled do care
about reporting issues.

>> Q: Why compile time config option?
>> A: In order not to decrease the performance even a bit in case someone does 
>> not
>> want to hunt for large allocations.
>> In an ideal life i'd prefer this check/warning is enabled by default and may 
>> be
>> even without a config option so it works on every node. Once we find and 
>> rework
>> or mark all large allocations that would be good by default. Until that 
>> though
>> it will be noisy.
>
> So who is going to enable this option?

At the beginning - people who want to debug kernel and verify their fallbacks on
memory allocations failures in the code or just speed up their code on nodes
with fragmented memory - for 2nd and 3rd orders.

mm performance issues are tough, you know, and this is just another way to
gain more performance. It won't avoid the necessity of digging mm for sure,
but might decrease 

[RFC PATCH 0/1] mm: add a warning about high order allocations

2018-12-25 Thread Konstantin Khorenko
Q: Why do we need to bother at all?
A: If a node is highly loaded and its memory is significantly fragmented
(unfortunately almost any node with serious load has highly fragmented memory)
then any high order memory allocation can trigger massive memory shrink and
result in quite a big allocation latency. And the node becomes less responsive
and users don't like it.
The ultimate solution here is to get rid of large allocations, but we need an
instrument to detect them.

Q: Why warning? Use tracepoints!
A: Well, this is a matter of magic defaults.
Yes, you can use tracepoints to catch large allocations, but you need to do this
on purpose and regularly and this is to be done by every developer which is
quite unreal.
On the other hand if you develop something and get a warning, you'll have to
think about the reason and either succeed with reworking the code to use
smaller allocation sizes (and thus decrease allocation latency!) or just use
kvmalloc() if you don't really need physically continuos chunk or come to the
conclusion you definitely need physically continuos memory and shut up the
warning.

Q: Why compile time config option?
A: In order not to decrease the performance even a bit in case someone does not
want to hunt for large allocations.
In an ideal life i'd prefer this check/warning is enabled by default and may be
even without a config option so it works on every node. Once we find and rework
or mark all large allocations that would be good by default. Until that though
it will be noisy.

Another option is to rework the patch via static keys (having the warning
disabled by default surely). That makes it possible to turn on the feature
without recompiling the kernel - during testing period for example.

If you prefer this way, i would be happy to rework the patch via static keys.

Konstantin Khorenko (1):
  mm/page_alloc: add warning about high order allocations

 kernel/sysctl.c | 15 +++
 mm/Kconfig  | 18 ++
 mm/page_alloc.c | 25 +
 3 files changed, 58 insertions(+)

-- 
2.15.1



[PATCH 1/1] mm/page_alloc: add a warning about high order allocations

2018-12-25 Thread Konstantin Khorenko
Adds sysctl "vm.warn_high_order". If set it will warn about
allocations with order >= vm.warn_high_order.
Prints only 32 warnings at most and skips all __GFP_NOWARN allocations.

The code is under config option, disabled by default.
If enabled, default vm.warn_high_order is 3 (PAGE_ALLOC_COSTLY_ORDER).

Sysctl max value is set to 100 which should be enough to exceed maximum
order (normally MAX_ORDER == 11).

Signed-off-by: Konstantin Khorenko 
---
 kernel/sysctl.c | 15 +++
 mm/Kconfig  | 18 ++
 mm/page_alloc.c | 25 +
 3 files changed, 58 insertions(+)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5fc724e4e454..28a8ebfa7a1d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,10 @@ extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 #endif
 
+#ifdef CONFIG_WARN_HIGH_ORDER
+extern int warn_order;
+#endif
+
 #ifdef CONFIG_PROC_SYSCTL
 
 /**
@@ -1675,6 +1679,17 @@ static struct ctl_table vm_table[] = {
.extra1 = (void *)_rnd_compat_bits_min,
.extra2 = (void *)_rnd_compat_bits_max,
},
+#endif
+#ifdef CONFIG_WARN_HIGH_ORDER
+   {
+   .procname   = "warn_high_order",
+   .data   = _order,
+   .maxlen = sizeof(warn_order),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = ,
+   .extra2 = _hundred,
+   },
 #endif
{ }
 };
diff --git a/mm/Kconfig b/mm/Kconfig
index d85e39da47ae..4e2d613d52f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -336,6 +336,24 @@ config MEMORY_FAILURE
  even when some of its memory has uncorrected errors. This requires
  special hardware support and typically ECC memory.
 
+config WARN_HIGH_ORDER
+   bool "Enable complains about high order memory allocations"
+   depends on !LOCKDEP
+   default n
+   help
+ Enables warnings on high order memory allocations. This allows to
+ determine users of large memory chunks and rework them to decrease
+ allocation latency. Note, some debug options make kernel structures
+ fat.
+
+config WARN_HIGH_ORDER_LEVEL
+   int "Define page order level considered as too high"
+   depends on WARN_HIGH_ORDER
+   default 3
+   help
+ Defines page order starting which the system to complain about.
+ Default is current PAGE_ALLOC_COSTLY_ORDER.
+
 config HWPOISON_INJECT
tristate "HWPoison pages injector"
depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..258892adb861 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4341,6 +4341,30 @@ static inline void finalise_ac(gfp_t gfp_mask, struct 
alloc_context *ac)
ac->high_zoneidx, ac->nodemask);
 }
 
+#ifdef CONFIG_WARN_HIGH_ORDER
+int warn_order = CONFIG_WARN_HIGH_ORDER_LEVEL;
+
+/*
+ * Complain if we allocate a high order page unless there is a __GFP_NOWARN
+ * flag provided.
+ *
+ * Shuts up after 32 complains.
+ */
+static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
+{
+   static atomic_t warn_count = ATOMIC_INIT(32);
+
+   if (order >= warn_order && !(gfp_mask & __GFP_NOWARN))
+   WARN(atomic_dec_if_positive(_count) >= 0,
+"order %d >= %d, gfp 0x%x\n",
+order, warn_order, gfp_mask);
+}
+#else
+static __always_inline void warn_high_order(int order, gfp_t gfp_mask)
+{
+}
+#endif
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4361,6 +4385,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
order, int preferred_nid,
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
+   warn_high_order(order, gfp_mask);
 
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
-- 
2.15.1



Re: [PATCH] fbcon: use kvmalloc() for scrollback buffer

2018-12-21 Thread Konstantin Khorenko
Hi Bartlomiej,

On 12/20/2018 07:21 PM, Bartlomiej Zolnierkiewicz wrote:
> On 11/26/2018 11:02 AM, Konstantin Khorenko wrote:
>> Scrollback frame buffer is rather big - 32K,
>> so it requires 3rd order page, so let's use kvmalloc() instead of
>> ordinary kmalloc() for it.
>
> Is it actually safe to use non-contiguous memory for softback_buf?

Well, that's why we need a review. :)

i've asked myself same question while fixing this,
i've dig sources a bit and did not find places when softback_buf is provided 
for DMA,
all other places seems to work with virtual addresses, so there should be no 
problem.

Even more i saw a function which mentions that softback might be non-contigious:

/* As we might be inside of softback, we may work with non-contiguous buffer,
that's why we have to use a separate routine. */
static void fbcon_invert_region(struct vc_data *vc, u16 * p, int cnt)

So i think it's safe to use kvmalloc() here.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

>> Signed-off-by: Konstantin Khorenko 
>> ---
>>  drivers/video/fbdev/core/fbcon.c | 6 +++---
>>  1 file changed, 3 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/video/fbdev/core/fbcon.c 
>> b/drivers/video/fbdev/core/fbcon.c
>> index 8958ccc8b1ac..2b1a34d3f5e2 100644
>> --- a/drivers/video/fbdev/core/fbcon.c
>> +++ b/drivers/video/fbdev/core/fbcon.c
>> @@ -992,7 +992,7 @@ static const char *fbcon_startup(void)
>>  if (!softback_buf) {
>>  softback_buf =
>>  (unsigned long)
>> -kmalloc(fbcon_softback_size,
>> +kvmalloc(fbcon_softback_size,
>>  GFP_KERNEL);
>>  if (!softback_buf) {
>>  fbcon_softback_size = 0;
>> @@ -1001,7 +1001,7 @@ static const char *fbcon_startup(void)
>>  }
>>  } else {
>>  if (softback_buf) {
>> -kfree((void *) softback_buf);
>> +kvfree((void *) softback_buf);
>>  softback_buf = 0;
>>  softback_top = 0;
>>  }
>> @@ -3665,7 +3665,7 @@ static void fbcon_exit(void)
>>  }
>>  #endif
>>
>> -kfree((void *)softback_buf);
>> +kvfree((void *)softback_buf);
>>  softback_buf = 0UL;
>>
>>  for_each_registered_fb(i) {
>
> Best regards,
> --
> Bartlomiej Zolnierkiewicz
> Samsung R Institute Poland
> Samsung Electronics


Re: [PATCH 0/2] fs/lock: show locks info owned by dead/invisible processes

2018-08-09 Thread Konstantin Khorenko

On 08/09/2018 10:16 AM, Murphy Zhou wrote:

Hi,

Looks like this missed v4.18 ?


Hi Murphy,

yes, Jeff planned to push those patches into 4.19 and they are in "linux-next" 
now,
but not in 4.18 "master" currently.

On 06/14/2018 01:41 PM, Jeff Layton wrote:
> These look fine to me. I'll plan to pick them up for v4.19 unless anyone
> has objections.

linux-next:
1cf8e5de4055 ("fs/lock: show locks taken by processes from another pidns")
826d7bc9f013 ("fs/lock: skip lock owner pid translation in case we are in 
init_pid_ns")

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


On Fri, Jun 8, 2018 at 10:27 PM, Konstantin Khorenko
 wrote:

The behavior has been changed after 9d5b86ac13c5 ("fs/locks: Remove fl_nspid
and use fs-specific l_pid for remote locks")
and now /proc/$PID/fdinfo/$FD does not show the info about the lock
* if the flock owner process is dead and its pid has been already freed
or
* if the lock owner is not visible in current pidns.

CRIU uses this interface to store locks info during dump and thus can break
on v4.13 and newer.

So let's show info about locks anyway in described cases (like it was before
9d5b86ac13c5), but show pid number saved in file_lock struct if we are in
init_pid_ns (patch 1) or just zero otherwise (patch 2) like we do with SID.

Reproducer:
process A   process A1  process A2
fork()->
exit()  open()
flock()
fork()->
exit()  sleep()

Before the patch:

(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE ${PID_A1} b6:f8a61:529946 0 EOF

===
# cat flock1.c

#include 
#include 
#include 
#include 
#include 
#include 
#include 

int main(void)
{
int fd;
int err;
pid_t child_pid;

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid) {
exit(0);
}

fd = open("/tmp/a", O_CREAT | O_RDWR);
if (fd == -1)
perror("Failed to open the file");

err = flock(fd, LOCK_EX);
if (err == -1)
perror("flock failed");

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid)
exit(0);

sleep(1);

return 0;
}

Konstantin Khorenko (2):
  fs/lock: skip lock owner pid translation in case we are in init_pid_ns
  fs/lock: show locks taken by processes from another pidns

 fs/locks.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

--
2.15.1


.



Re: [PATCH 0/2] fs/lock: show locks info owned by dead/invisible processes

2018-08-09 Thread Konstantin Khorenko

On 08/09/2018 10:16 AM, Murphy Zhou wrote:

Hi,

Looks like this missed v4.18 ?


Hi Murphy,

yes, Jeff planned to push those patches into 4.19 and they are in "linux-next" 
now,
but not in 4.18 "master" currently.

On 06/14/2018 01:41 PM, Jeff Layton wrote:
> These look fine to me. I'll plan to pick them up for v4.19 unless anyone
> has objections.

linux-next:
1cf8e5de4055 ("fs/lock: show locks taken by processes from another pidns")
826d7bc9f013 ("fs/lock: skip lock owner pid translation in case we are in 
init_pid_ns")

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


On Fri, Jun 8, 2018 at 10:27 PM, Konstantin Khorenko
 wrote:

The behavior has been changed after 9d5b86ac13c5 ("fs/locks: Remove fl_nspid
and use fs-specific l_pid for remote locks")
and now /proc/$PID/fdinfo/$FD does not show the info about the lock
* if the flock owner process is dead and its pid has been already freed
or
* if the lock owner is not visible in current pidns.

CRIU uses this interface to store locks info during dump and thus can break
on v4.13 and newer.

So let's show info about locks anyway in described cases (like it was before
9d5b86ac13c5), but show pid number saved in file_lock struct if we are in
init_pid_ns (patch 1) or just zero otherwise (patch 2) like we do with SID.

Reproducer:
process A   process A1  process A2
fork()->
exit()  open()
flock()
fork()->
exit()  sleep()

Before the patch:

(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE ${PID_A1} b6:f8a61:529946 0 EOF

===
# cat flock1.c

#include 
#include 
#include 
#include 
#include 
#include 
#include 

int main(void)
{
int fd;
int err;
pid_t child_pid;

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid) {
exit(0);
}

fd = open("/tmp/a", O_CREAT | O_RDWR);
if (fd == -1)
perror("Failed to open the file");

err = flock(fd, LOCK_EX);
if (err == -1)
perror("flock failed");

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid)
exit(0);

sleep(1);

return 0;
}

Konstantin Khorenko (2):
  fs/lock: skip lock owner pid translation in case we are in init_pid_ns
  fs/lock: show locks taken by processes from another pidns

 fs/locks.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

--
2.15.1


.



[PATCH 1/2] fs/lock: skip lock owner pid translation in case we are in init_pid_ns

2018-06-08 Thread Konstantin Khorenko
If the flock owner process is dead and its pid has been already freed,
pid translation won't work, but we still want to show flock owner pid
number when expecting /proc/$PID/fdinfo/$FD in init pidns.

Reproducer:
process A   process A1  process A2
fork()->
exit()  open()
flock()
fork()->
exit()  sleep()

Before the patch:

(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE ${PID_A1} b6:f8a61:529946 0 EOF

Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for 
remote locks")
Signed-off-by: Konstantin Khorenko 
---
 fs/locks.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/locks.c b/fs/locks.c
index 05e211be8684..bfee5b7f2862 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2072,6 +2072,13 @@ static pid_t locks_translate_pid(struct file_lock *fl, 
struct pid_namespace *ns)
return -1;
if (IS_REMOTELCK(fl))
return fl->fl_pid;
+   /*
+* If the flock owner process is dead and its pid has been already
+* freed, the translation below won't work, but we still want to show
+* flock owner pid number in init pidns.
+*/
+   if (ns == _pid_ns)
+   return (pid_t)fl->fl_pid;
 
rcu_read_lock();
pid = find_pid_ns(fl->fl_pid, _pid_ns);
-- 
2.15.1



[PATCH 1/2] fs/lock: skip lock owner pid translation in case we are in init_pid_ns

2018-06-08 Thread Konstantin Khorenko
If the flock owner process is dead and its pid has been already freed,
pid translation won't work, but we still want to show flock owner pid
number when expecting /proc/$PID/fdinfo/$FD in init pidns.

Reproducer:
process A   process A1  process A2
fork()->
exit()  open()
flock()
fork()->
exit()  sleep()

Before the patch:

(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE ${PID_A1} b6:f8a61:529946 0 EOF

Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for 
remote locks")
Signed-off-by: Konstantin Khorenko 
---
 fs/locks.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/locks.c b/fs/locks.c
index 05e211be8684..bfee5b7f2862 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2072,6 +2072,13 @@ static pid_t locks_translate_pid(struct file_lock *fl, 
struct pid_namespace *ns)
return -1;
if (IS_REMOTELCK(fl))
return fl->fl_pid;
+   /*
+* If the flock owner process is dead and its pid has been already
+* freed, the translation below won't work, but we still want to show
+* flock owner pid number in init pidns.
+*/
+   if (ns == _pid_ns)
+   return (pid_t)fl->fl_pid;
 
rcu_read_lock();
pid = find_pid_ns(fl->fl_pid, _pid_ns);
-- 
2.15.1



[PATCH 2/2] fs/lock: show locks taken by processes from another pidns

2018-06-08 Thread Konstantin Khorenko
Currently if we face a lock taken by a process invisible in the current
pidns we skip the lock completely, but this

1) makes the output not that nice
(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

2) makes it more difficult to debug issues with leaked flocks
   if you get error on lock, but don't see any locks in /proc/$id/fdinfo/$file

Let's show information about such locks again as previously, but
show zero in the owner pid field.

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE 0 b6:f8a61:529946 0 EOF

Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for 
remote locks")
Signed-off-by: Konstantin Khorenko 
---
 fs/locks.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index bfee5b7f2862..e533623e2e99 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2633,12 +2633,10 @@ static void lock_get_status(struct seq_file *f, struct 
file_lock *fl,
 
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
-* If there isn't a fl_pid don't display who is waiting on
-* the lock if we are called from locks_show, or if we are
-* called from __show_fd_info - skip lock entirely
+* If lock owner is dead (and pid is freed) or not visible in current
+* pidns, zero is shown as a pid value. Check lock info from
+* init_pid_ns to get saved lock pid value.
 */
-   if (fl_pid == 0)
-   return;
 
if (fl->fl_file != NULL)
inode = locks_inode(fl->fl_file);
-- 
2.15.1



[PATCH 2/2] fs/lock: show locks taken by processes from another pidns

2018-06-08 Thread Konstantin Khorenko
Currently if we face a lock taken by a process invisible in the current
pidns we skip the lock completely, but this

1) makes the output not that nice
(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

2) makes it more difficult to debug issues with leaked flocks
   if you get error on lock, but don't see any locks in /proc/$id/fdinfo/$file

Let's show information about such locks again as previously, but
show zero in the owner pid field.

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE 0 b6:f8a61:529946 0 EOF

Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for 
remote locks")
Signed-off-by: Konstantin Khorenko 
---
 fs/locks.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index bfee5b7f2862..e533623e2e99 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2633,12 +2633,10 @@ static void lock_get_status(struct seq_file *f, struct 
file_lock *fl,
 
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
-* If there isn't a fl_pid don't display who is waiting on
-* the lock if we are called from locks_show, or if we are
-* called from __show_fd_info - skip lock entirely
+* If lock owner is dead (and pid is freed) or not visible in current
+* pidns, zero is shown as a pid value. Check lock info from
+* init_pid_ns to get saved lock pid value.
 */
-   if (fl_pid == 0)
-   return;
 
if (fl->fl_file != NULL)
inode = locks_inode(fl->fl_file);
-- 
2.15.1



[PATCH 0/2] fs/lock: show locks info owned by dead/invisible processes

2018-06-08 Thread Konstantin Khorenko
The behavior has been changed after 9d5b86ac13c5 ("fs/locks: Remove fl_nspid
and use fs-specific l_pid for remote locks")
and now /proc/$PID/fdinfo/$FD does not show the info about the lock
* if the flock owner process is dead and its pid has been already freed
or
* if the lock owner is not visible in current pidns.

CRIU uses this interface to store locks info during dump and thus can break
on v4.13 and newer.

So let's show info about locks anyway in described cases (like it was before
9d5b86ac13c5), but show pid number saved in file_lock struct if we are in
init_pid_ns (patch 1) or just zero otherwise (patch 2) like we do with SID.

Reproducer:
process A   process A1  process A2
fork()->
exit()  open()
flock()
fork()->
exit()  sleep()

Before the patch:

(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE ${PID_A1} b6:f8a61:529946 0 EOF

===
# cat flock1.c

#include 
#include 
#include 
#include 
#include 
#include 
#include 

int main(void)
{
int fd;
int err;
pid_t child_pid;

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid) {
exit(0);
}

fd = open("/tmp/a", O_CREAT | O_RDWR);
if (fd == -1)
perror("Failed to open the file");

err = flock(fd, LOCK_EX);
if (err == -1)
perror("flock failed");

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid)
    exit(0);

sleep(1);

return 0;
}

Konstantin Khorenko (2):
  fs/lock: skip lock owner pid translation in case we are in init_pid_ns
  fs/lock: show locks taken by processes from another pidns

 fs/locks.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

-- 
2.15.1



[PATCH 0/2] fs/lock: show locks info owned by dead/invisible processes

2018-06-08 Thread Konstantin Khorenko
The behavior has been changed after 9d5b86ac13c5 ("fs/locks: Remove fl_nspid
and use fs-specific l_pid for remote locks")
and now /proc/$PID/fdinfo/$FD does not show the info about the lock
* if the flock owner process is dead and its pid has been already freed
or
* if the lock owner is not visible in current pidns.

CRIU uses this interface to store locks info during dump and thus can break
on v4.13 and newer.

So let's show info about locks anyway in described cases (like it was before
9d5b86ac13c5), but show pid number saved in file_lock struct if we are in
init_pid_ns (patch 1) or just zero otherwise (patch 2) like we do with SID.

Reproducer:
process A   process A1  process A2
fork()->
exit()  open()
flock()
fork()->
exit()  sleep()

Before the patch:

(root@vz7)/: cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 257
lock:   (root@vz7)/:

After the patch:
===
(root@vz7)/:cat /proc/${PID_A2}/fdinfo/3
pos:4
flags:  0212
mnt_id: 295
lock:   1: FLOCK  ADVISORY  WRITE ${PID_A1} b6:f8a61:529946 0 EOF

===
# cat flock1.c

#include 
#include 
#include 
#include 
#include 
#include 
#include 

int main(void)
{
int fd;
int err;
pid_t child_pid;

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid) {
exit(0);
}

fd = open("/tmp/a", O_CREAT | O_RDWR);
if (fd == -1)
perror("Failed to open the file");

err = flock(fd, LOCK_EX);
if (err == -1)
perror("flock failed");

child_pid = fork();
if (child_pid == -1)
perror("fork failed");
if (child_pid)
    exit(0);

sleep(1);

return 0;
}

Konstantin Khorenko (2):
  fs/lock: skip lock owner pid translation in case we are in init_pid_ns
  fs/lock: show locks taken by processes from another pidns

 fs/locks.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

-- 
2.15.1



[PATCH] [NETFILTER] ipt_SAME: add compat conversion functions

2007-11-13 Thread Konstantin Khorenko
[NETFILTER]: ipt_SAME: add compat conversion functions

ipt_SAME should have the compat function cause its entry structure 
(ipt_same_info)
contains a pointer between data filled/checked in both kernel and userspace.

Signed-off-by: Konstantin Khorenko <[EMAIL PROTECTED]>

---
Thank you,
    Konstantin Khorenko

SWsoft Virtuozzo/OpenVZ Linux kernel team

--- ./net/ipv4/netfilter/ipt_SAME.c.SAME2007-11-06 13:55:16.0 
+0300
+++ ./net/ipv4/netfilter/ipt_SAME.c 2007-11-09 16:50:38.0 +0300
@@ -152,6 +152,47 @@ same_target(struct sk_buff *skb,
return nf_nat_setup_info(ct, , hooknum);
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_ipt_same_info
+{
+   unsigned char info;
+   u_int32_t rangesize;
+   u_int32_t ipnum;
+   compat_uptr_t iparray;
+
+   /* hangs off end. */
+   struct nf_nat_range range[IPT_SAME_MAX_RANGE];
+};
+
+static void compat_from_user(void *dst, void *src)
+{
+   const struct compat_ipt_same_info *cl = src;
+   struct ipt_same_info l = {
+   .info   = cl->info,
+   .rangesize  = cl->rangesize,
+   .ipnum  = 0,
+   .iparray= NULL,
+   };
+
+   memcpy(l.range, cl->range, sizeof(l.range));
+   memcpy(dst, , sizeof(l));
+}
+
+static int compat_to_user(void __user *dst, void *src)
+{
+   const struct ipt_same_info *l = src;
+   struct compat_ipt_same_info cl = {
+   .info   = l->info,
+   .rangesize  = l->rangesize,
+   .ipnum  = 0,
+   .iparray= (compat_uptr_t)NULL,
+   };
+
+   memcpy(cl.range, l->range, sizeof(cl.range));
+   return copy_to_user(dst, , sizeof(cl)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
 static struct xt_target same_reg __read_mostly = {
.name   = "SAME",
.family = AF_INET,
@@ -161,6 +202,11 @@ static struct xt_target same_reg __read_
.hooks  = (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING),
.checkentry = same_check,
.destroy= same_destroy,
+#ifdef CONFIG_COMPAT
+   .compatsize = sizeof(struct compat_ipt_same_info),
+   .compat_from_user = compat_from_user,
+   .compat_to_user = compat_to_user,
+#endif
.me = THIS_MODULE,
 };
 



[PATCH] [NETFILTER] ipt_SAME: add compat conversion functions

2007-11-13 Thread Konstantin Khorenko
[NETFILTER]: ipt_SAME: add compat conversion functions

ipt_SAME should have the compat function cause its entry structure 
(ipt_same_info)
contains a pointer between data filled/checked in both kernel and userspace.

Signed-off-by: Konstantin Khorenko [EMAIL PROTECTED]

---
Thank you,
Konstantin Khorenko

SWsoft Virtuozzo/OpenVZ Linux kernel team

--- ./net/ipv4/netfilter/ipt_SAME.c.SAME2007-11-06 13:55:16.0 
+0300
+++ ./net/ipv4/netfilter/ipt_SAME.c 2007-11-09 16:50:38.0 +0300
@@ -152,6 +152,47 @@ same_target(struct sk_buff *skb,
return nf_nat_setup_info(ct, newrange, hooknum);
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_ipt_same_info
+{
+   unsigned char info;
+   u_int32_t rangesize;
+   u_int32_t ipnum;
+   compat_uptr_t iparray;
+
+   /* hangs off end. */
+   struct nf_nat_range range[IPT_SAME_MAX_RANGE];
+};
+
+static void compat_from_user(void *dst, void *src)
+{
+   const struct compat_ipt_same_info *cl = src;
+   struct ipt_same_info l = {
+   .info   = cl-info,
+   .rangesize  = cl-rangesize,
+   .ipnum  = 0,
+   .iparray= NULL,
+   };
+
+   memcpy(l.range, cl-range, sizeof(l.range));
+   memcpy(dst, l, sizeof(l));
+}
+
+static int compat_to_user(void __user *dst, void *src)
+{
+   const struct ipt_same_info *l = src;
+   struct compat_ipt_same_info cl = {
+   .info   = l-info,
+   .rangesize  = l-rangesize,
+   .ipnum  = 0,
+   .iparray= (compat_uptr_t)NULL,
+   };
+
+   memcpy(cl.range, l-range, sizeof(cl.range));
+   return copy_to_user(dst, cl, sizeof(cl)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
 static struct xt_target same_reg __read_mostly = {
.name   = SAME,
.family = AF_INET,
@@ -161,6 +202,11 @@ static struct xt_target same_reg __read_
.hooks  = (1  NF_IP_PRE_ROUTING | 1  NF_IP_POST_ROUTING),
.checkentry = same_check,
.destroy= same_destroy,
+#ifdef CONFIG_COMPAT
+   .compatsize = sizeof(struct compat_ipt_same_info),
+   .compat_from_user = compat_from_user,
+   .compat_to_user = compat_to_user,
+#endif
.me = THIS_MODULE,
 };