[PATCH v3] media: siano: Fix coherent memory allocation failure on arm64

2018-03-03 Thread tomoki . sekiyama
From: Tomoki Sekiyama <tomoki.sekiy...@gmail.com>

On some architectures such as arm64, siano chip based TV-tuner
USB devices are not recognized correctly due to coherent memory
allocation failure with the following error:

[  663.556135] usbcore: deregistering interface driver smsusb
[  683.624809] smsusb:smsusb_probe: board id=18, interface number 0
[  683.633530] smsusb:smsusb_init_device: smscore_register_device(...) failed, rc -12
[  683.641501] smsusb:smsusb_probe: Device initialized with return code -12
[  683.652978] smsusb: probe of 1-1:1.0 failed with error -12

This is caused by dma_alloc_coherent(NULL, ...) returning NULL in
smscoreapi.c.

To fix this error, allocate the buffer memory for the USB devices
via kmalloc() and let the USB core do the DMA mapping and free.
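
For context, a minimal sketch of the mechanism this relies on (illustrative
only, not part of the patch; the function and parameter names are made up):
when an URB's transfer_buffer comes from kmalloc()/kzalloc() and
URB_NO_TRANSFER_DMA_MAP is left unset, the USB core DMA-maps the buffer in
usb_submit_urb() and unmaps it on completion, so the driver no longer needs
dma_alloc_coherent() for it:

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/usb.h>

    static int submit_rx(struct usb_device *udev, unsigned int pipe,
                         size_t size, usb_complete_t done, void *ctx)
    {
            struct urb *urb = usb_alloc_urb(0, GFP_KERNEL);
            void *buf = kzalloc(size, GFP_KERNEL);  /* plain kernel memory */

            if (!urb || !buf) {
                    usb_free_urb(urb);
                    kfree(buf);
                    return -ENOMEM;
            }
            usb_fill_bulk_urb(urb, udev, pipe, buf, size, done, ctx);
            /* URB_NO_TRANSFER_DMA_MAP deliberately not set: the USB core
             * maps/unmaps buf around the transfer; the completion handler
             * is expected to free buf and the urb. */
            return usb_submit_urb(urb, GFP_KERNEL);
    }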

v3: let the usb core do the DMA mapping and free
v2: non-usb `device' is also passed to dma_alloc_coherent()

Signed-off-by: Tomoki Sekiyama <tomoki.sekiy...@gmail.com>
---
 drivers/media/common/siano/smscoreapi.c | 33 ++---
 drivers/media/common/siano/smscoreapi.h |  2 ++
 drivers/media/usb/siano/smsusb.c        |  4 ++--
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/drivers/media/common/siano/smscoreapi.c b/drivers/media/common/siano/smscoreapi.c
index c5c827e11b64..b5dcc6d1fe90 100644
--- a/drivers/media/common/siano/smscoreapi.c
+++ b/drivers/media/common/siano/smscoreapi.c
@@ -631,7 +631,8 @@ smscore_buffer_t *smscore_createbuffer(u8 *buffer, void *common_buffer,
 
cb->p = buffer;
cb->offset_in_common = buffer - (u8 *) common_buffer;
-   cb->phys = common_buffer_phys + cb->offset_in_common;
+   if (common_buffer_phys)
+   cb->phys = common_buffer_phys + cb->offset_in_common;
 
return cb;
 }
@@ -690,17 +691,21 @@ int smscore_register_device(struct smsdevice_params_t *params,
 
/* alloc common buffer */
dev->common_buffer_size = params->buffer_size * params->num_buffers;
-   dev->common_buffer = dma_alloc_coherent(NULL, dev->common_buffer_size,
-   &dev->common_buffer_phys,
-   GFP_KERNEL | GFP_DMA);
-   if (!dev->common_buffer) {
+   if (params->usb_device)
+   buffer = kzalloc(dev->common_buffer_size, GFP_KERNEL);
+   else
+   buffer = dma_alloc_coherent(params->device,
+   dev->common_buffer_size,
+   &dev->common_buffer_phys,
+   GFP_KERNEL | GFP_DMA);
+   if (!buffer) {
smscore_unregister_device(dev);
return -ENOMEM;
}
+   dev->common_buffer = buffer;
 
/* prepare dma buffers */
-   for (buffer = dev->common_buffer;
-dev->num_buffers < params->num_buffers;
+   for (; dev->num_buffers < params->num_buffers;
 dev->num_buffers++, buffer += params->buffer_size) {
struct smscore_buffer_t *cb;
 
@@ -720,6 +725,7 @@ int smscore_register_device(struct smsdevice_params_t *params,
dev->board_id = SMS_BOARD_UNKNOWN;
dev->context = params->context;
dev->device = params->device;
+   dev->usb_device = params->usb_device;
dev->setmode_handler = params->setmode_handler;
dev->detectmode_handler = params->detectmode_handler;
dev->sendrequest_handler = params->sendrequest_handler;
@@ -1231,10 +1237,15 @@ void smscore_unregister_device(struct smscore_device_t *coredev)
 
pr_debug("freed %d buffers\n", num_buffers);
 
-   if (coredev->common_buffer)
-   dma_free_coherent(NULL, coredev->common_buffer_size,
-   coredev->common_buffer, coredev->common_buffer_phys);
-
+   if (coredev->common_buffer) {
+   if (coredev->usb_device)
+   kfree(coredev->common_buffer);
+   else
+   dma_free_coherent(coredev->device,
+ coredev->common_buffer_size,
+ coredev->common_buffer,
+ coredev->common_buffer_phys);
+   }
kfree(coredev->fw_buf);
 
list_del(&coredev->entry);
diff --git a/drivers/media/common/siano/smscoreapi.h b/drivers/media/common/siano/smscoreapi.h
index 4cc39e4a8318..134c69f7ea7b 100644
--- a/drivers/media/common/siano/smscoreapi.h
+++ b/drivers/media/common/siano/smscoreapi.h
@@ -134,6 +134,7 @@ struct smscore_buffer_t {
 
 struct smsdevice_params_t {
struct device   *device;
+   struct usb_device   *usb_device;
 
int buffer_size;
int num_buffers;

[PATCH v2] media: siano: Fix coherent memory allocation failure on arm64

2018-03-03 Thread tomoki . sekiyama
From: Tomoki Sekiyama <tomoki.sekiy...@gmail.com>

On some architectures such as arm64, siano chip based TV-tuner
USB devices are not recognized correctly due to coherent memory
allocation failure with the following error:

[  663.556135] usbcore: deregistering interface driver smsusb
[  683.624809] smsusb:smsusb_probe: board id=18, interface number 0
[  683.633530] smsusb:smsusb_init_device: smscore_register_device(...) failed, rc -12
[  683.641501] smsusb:smsusb_probe: Device initialized with return code -12
[  683.652978] smsusb: probe of 1-1:1.0 failed with error -12

This is caused by dma_alloc_coherent(NULL, ...) returning NULL in
smscoreapi.c.

To fix this error, usb_alloc_coherent() must be used for DMA
memory allocation for USB devices in such architectures.
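
For reference, a minimal usage sketch of the API pair this version switches to
(illustrative only, not the driver code; the helper names are made up):

    #include <linux/types.h>
    #include <linux/usb.h>

    /*
     * Coherent memory is allocated against the USB controller's device, so it
     * also works on architectures where dma_alloc_coherent(NULL, ...) returns
     * NULL.
     */
    static void *alloc_common(struct usb_device *udev, size_t size,
                              dma_addr_t *dma)
    {
            return usb_alloc_coherent(udev, size, GFP_KERNEL, dma);
    }

    static void free_common(struct usb_device *udev, size_t size,
                            void *buf, dma_addr_t dma)
    {
            usb_free_coherent(udev, size, buf, dma);
    }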

v2: non-usb `device' is also passed to dma_alloc_coherent()

Signed-off-by: Tomoki Sekiyama <tomoki.sekiy...@gmail.com>
---
 drivers/media/common/siano/smscoreapi.c | 36 -
 drivers/media/common/siano/smscoreapi.h |  2 ++
 drivers/media/usb/siano/smsusb.c        |  1 +
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/media/common/siano/smscoreapi.c b/drivers/media/common/siano/smscoreapi.c
index c5c827e11b64..34622b562963 100644
--- a/drivers/media/common/siano/smscoreapi.c
+++ b/drivers/media/common/siano/smscoreapi.c
@@ -690,17 +690,24 @@ int smscore_register_device(struct smsdevice_params_t *params,
 
/* alloc common buffer */
dev->common_buffer_size = params->buffer_size * params->num_buffers;
-   dev->common_buffer = dma_alloc_coherent(NULL, dev->common_buffer_size,
-   &dev->common_buffer_phys,
-   GFP_KERNEL | GFP_DMA);
-   if (!dev->common_buffer) {
+   if (params->usb_device)
+   buffer = usb_alloc_coherent(params->usb_device,
+   dev->common_buffer_size,
+   GFP_KERNEL | GFP_DMA,
+   &dev->common_buffer_phys);
+   else
+   buffer = dma_alloc_coherent(params->device,
+   dev->common_buffer_size,
+   &dev->common_buffer_phys,
+   GFP_KERNEL | GFP_DMA);
+   if (!buffer) {
smscore_unregister_device(dev);
return -ENOMEM;
}
+   dev->common_buffer = buffer;
 
/* prepare dma buffers */
-   for (buffer = dev->common_buffer;
-dev->num_buffers < params->num_buffers;
+   for (; dev->num_buffers < params->num_buffers;
 dev->num_buffers++, buffer += params->buffer_size) {
struct smscore_buffer_t *cb;
 
@@ -720,6 +727,7 @@ int smscore_register_device(struct smsdevice_params_t *params,
dev->board_id = SMS_BOARD_UNKNOWN;
dev->context = params->context;
dev->device = params->device;
+   dev->usb_device = params->usb_device;
dev->setmode_handler = params->setmode_handler;
dev->detectmode_handler = params->detectmode_handler;
dev->sendrequest_handler = params->sendrequest_handler;
@@ -1231,10 +1239,18 @@ void smscore_unregister_device(struct smscore_device_t *coredev)
 
pr_debug("freed %d buffers\n", num_buffers);
 
-   if (coredev->common_buffer)
-   dma_free_coherent(NULL, coredev->common_buffer_size,
-   coredev->common_buffer, coredev->common_buffer_phys);
-
+   if (coredev->common_buffer) {
+   if (coredev->usb_device)
+   usb_free_coherent(coredev->usb_device,
+ coredev->common_buffer_size,
+ coredev->common_buffer,
+ coredev->common_buffer_phys);
+   else
+   dma_free_coherent(coredev->device,
+ coredev->common_buffer_size,
+ coredev->common_buffer,
+ coredev->common_buffer_phys);
+   }
kfree(coredev->fw_buf);
 
list_del(&coredev->entry);
diff --git a/drivers/media/common/siano/smscoreapi.h b/drivers/media/common/siano/smscoreapi.h
index 4cc39e4a8318..134c69f7ea7b 100644
--- a/drivers/media/common/siano/smscoreapi.h
+++ b/drivers/media/common/siano/smscoreapi.h
@@ -134,6 +134,7 @@ struct smscore_buffer_t {
 
 struct smsdevice_params_t {
struct device   *device;
+   struct usb_device   *usb_device;
 
int buffer_size;
int num_buffers;
@@ -176,6 +177,7 @@ struct smscore_device_t {

[PATCH] media: siano: Fix coherent memory allocation failure on some arch

2018-01-04 Thread Tomoki Sekiyama
On some architectures like arm64, coherent memory allocation for
USB devices fails by following error:

[  663.556135] usbcore: deregistering interface driver smsusb
[  683.624809] smsusb:smsusb_probe: board id=18, interface number 0
[  683.633530] smsusb:smsusb_init_device: smscore_register_device(...) failed, rc -12
[  683.641501] smsusb:smsusb_probe: Device initialized with return code -12
[  683.652978] smsusb: probe of 1-1:1.0 failed with error -12

This is caused by dma_alloc_coherent(NULL, ...) returning NULL in
smscoreapi.c.

To fix this error, usb_alloc_coherent() must be used for DMA
memory allocation for USB devices in such architectures.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiy...@gmail.com>
---
 drivers/media/common/siano/smscoreapi.c | 34 +++--
 drivers/media/common/siano/smscoreapi.h |  2 ++
 drivers/media/usb/siano/smsusb.c        |  1 +
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/drivers/media/common/siano/smscoreapi.c b/drivers/media/common/siano/smscoreapi.c
index c5c827e..8d0e7a6 100644
--- a/drivers/media/common/siano/smscoreapi.c
+++ b/drivers/media/common/siano/smscoreapi.c
@@ -690,17 +690,23 @@ int smscore_register_device(struct smsdevice_params_t *params,
 
/* alloc common buffer */
dev->common_buffer_size = params->buffer_size * params->num_buffers;
-   dev->common_buffer = dma_alloc_coherent(NULL, dev->common_buffer_size,
-   &dev->common_buffer_phys,
-   GFP_KERNEL | GFP_DMA);
-   if (!dev->common_buffer) {
+   if (params->usb_device)
+   buffer = usb_alloc_coherent(params->usb_device,
+   dev->common_buffer_size,
+   GFP_KERNEL | GFP_DMA,
+   &dev->common_buffer_phys);
+   else
+   buffer = dma_alloc_coherent(NULL, dev->common_buffer_size,
+   &dev->common_buffer_phys,
+   GFP_KERNEL | GFP_DMA);
+   if (!buffer) {
smscore_unregister_device(dev);
return -ENOMEM;
}
+   dev->common_buffer = buffer;
 
/* prepare dma buffers */
-   for (buffer = dev->common_buffer;
-dev->num_buffers < params->num_buffers;
+   for (; dev->num_buffers < params->num_buffers;
 dev->num_buffers++, buffer += params->buffer_size) {
struct smscore_buffer_t *cb;
 
@@ -720,6 +726,7 @@ int smscore_register_device(struct smsdevice_params_t *params,
dev->board_id = SMS_BOARD_UNKNOWN;
dev->context = params->context;
dev->device = params->device;
+   dev->usb_device = params->usb_device;
dev->setmode_handler = params->setmode_handler;
dev->detectmode_handler = params->detectmode_handler;
dev->sendrequest_handler = params->sendrequest_handler;
@@ -1231,10 +1238,17 @@ void smscore_unregister_device(struct smscore_device_t *coredev)
 
pr_debug("freed %d buffers\n", num_buffers);
 
-   if (coredev->common_buffer)
-   dma_free_coherent(NULL, coredev->common_buffer_size,
-   coredev->common_buffer, coredev->common_buffer_phys);
-
+   if (coredev->common_buffer) {
+   if (coredev->usb_device)
+   usb_free_coherent(coredev->usb_device,
+ coredev->common_buffer_size,
+ coredev->common_buffer,
+ coredev->common_buffer_phys);
+   else
+   dma_free_coherent(NULL, coredev->common_buffer_size,
+ coredev->common_buffer,
+ coredev->common_buffer_phys);
+   }
kfree(coredev->fw_buf);
 
list_del(&coredev->entry);
diff --git a/drivers/media/common/siano/smscoreapi.h b/drivers/media/common/siano/smscoreapi.h
index 4cc39e4..134c69f 100644
--- a/drivers/media/common/siano/smscoreapi.h
+++ b/drivers/media/common/siano/smscoreapi.h
@@ -134,6 +134,7 @@ struct smscore_buffer_t {
 
 struct smsdevice_params_t {
struct device   *device;
+   struct usb_device   *usb_device;
 
int buffer_size;
int num_buffers;
@@ -176,6 +177,7 @@ struct smscore_device_t {
 
void *context;
struct device *device;
+   struct usb_device *usb_device;
 
char devpath[32];
unsigned long device_flags;
diff --git a/drivers/media/usb/siano/smsusb.c b/drivers/media/usb/siano/smsusb.c
index d07349c..7e8e803 100644

[PATCH 1/2 v5] perf sched: fix wrong conversion of task state

2016-08-01 Thread Tomoki Sekiyama
Currently sched_out_state() converts the prev_state u64 bitmask to a char
using the bitmask as an index, which may cause invalid memory access.
This fixes the issue by using the __ffs() returned value as an index.
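
To make the conversion concrete, here is a userspace re-implementation for
illustration only (the perf code itself uses __ffs() and ARRAY_SIZE(); ffs()
from <strings.h> equals __ffs() + 1 for non-zero values, and the string below
is the one this patch still uses):

    #include <assert.h>
    #include <strings.h>

    #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"

    static char state_char(unsigned int prev_state)
    {
            const char str[] = TASK_STATE_TO_CHAR_STR;
            unsigned int bit = prev_state ? ffs(prev_state) : 0;

            return bit < sizeof(str) - 1 ? str[bit] : '?';
    }

    int main(void)
    {
            assert(state_char(0x0) == 'R');     /* TASK_RUNNING */
            assert(state_char(0x1) == 'S');     /* TASK_INTERRUPTIBLE */
            assert(state_char(0x2) == 'D');     /* TASK_UNINTERRUPTIBLE */
            assert(state_char(0x1000) == '?');  /* large bitmask -> out of range */
            /* the old str[prev_state] would have read str[0x1000] here */
            return 0;
    }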

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Fixes: cdce9d738b91e ("perf sched: Add sched latency profiling")
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
Acked-by: David Ahern <dsah...@gmail.com>
---
 tools/perf/builtin-sched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 0dfe8df..ce9bef6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -897,9 +897,10 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 
 static char sched_out_state(u64 prev_state)
 {
-   const char *str = TASK_STATE_TO_CHAR_STR;
+   const char str[] = TASK_STATE_TO_CHAR_STR;
+   unsigned int bit = prev_state ? __ffs(prev_state) + 1 : 0;
 
-   return str[prev_state];
+   return bit < ARRAY_SIZE(str) - 1 ? str[bit] : '?';
 }
 
 static int
-- 
2.7.4



[PATCH 2/2 v5] perf sched: adapt TASK_STATE_TO_CHAR_STR to the latest kernel

2016-08-01 Thread Tomoki Sekiyama
Update the TASK_STATE_TO_CHAR_STR macro to the one from sched.h in the
latest kernel.

Related kernel commits:
 - commit ad86622b478e ("wait: swap EXIT_ZOMBIE and EXIT_DEAD to hide
   EXIT_TRACE from user-space"):
   'Z' and 'X' are swapped

 - commit 80ed87c8a9ca ("sched/wait: Introduce TASK_NOLOAD and TASK_IDLE"):
   Introduces new state 'N'

 - commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks"):
   Introduces new state 'n'

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
---
Changes from v4:
  Removed 'Fixes:' tag and added related commits in the commit message

 tools/perf/builtin-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ce9bef6..5776263 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -70,7 +70,7 @@ struct sched_atom {
struct task_desc*wakee;
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 enum thread_state {
THREAD_SLEEPING = 0,
-- 
2.7.4



[PATCH 1/2 v4] perf sched: fix wrong conversion of task state

2016-07-31 Thread Tomoki Sekiyama
Currently sched_out_state() converts the prev_state u64 bitmask to a char
using the bitmask as an index, which may cause invalid memory access.
This fixes the issue by using the __ffs() returned value as an index.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Fixes: cdce9d738b91e ("perf sched: Add sched latency profiling")
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
---
Changes from v3:
  Added Fixes:
  Split the patch into 2 pieces.

 tools/perf/builtin-sched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 0dfe8df..ce9bef6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -897,9 +897,10 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 
 static char sched_out_state(u64 prev_state)
 {
-   const char *str = TASK_STATE_TO_CHAR_STR;
+   const char str[] = TASK_STATE_TO_CHAR_STR;
+   unsigned int bit = prev_state ? __ffs(prev_state) + 1 : 0;
 
-   return str[prev_state];
+   return bit < ARRAY_SIZE(str) - 1 ? str[bit] : '?';
 }
 
 static int
-- 
2.7.4



[PATCH 2/2 v4] perf sched: adapt TASK_STATE_TO_CHAR_STR to the latest kernel

2016-07-31 Thread Tomoki Sekiyama
Update the TASK_STATE_TO_CHAR_STR macro to the one from sched.h in the latest
kernel, where 'N' and 'n' are introduced, and 'X' and 'Z' are swapped.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Fixes: cdce9d738b91e ("perf sched: Add sched latency profiling")
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
---
Changes from v3:
  Added Fixes:
  Split patch into 2 pieces.

 tools/perf/builtin-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ce9bef6..5776263 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -70,7 +70,7 @@ struct sched_atom {
struct task_desc*wakee;
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 enum thread_state {
THREAD_SLEEPING = 0,
-- 
2.7.4



RE: [PATCH v2] perf sched: fix wrong conversion of task state

2016-07-28 Thread Tomoki Sekiyama
Hi Hiramatsu-san,

On 2016/7/28 8:43, Masami Hiramatsu wrote:
> Hmm, this includes several fixes and enhancements.
> 1. Use first bit of the state instead of state itself (critical bug to
>    avoid crash?)
> 2. Check the range of the array and return '?' if out (minor bug, it can
>    access data area)
> 3. Fix TASK_STATE_TO_CHAR_STR to swap X and Z.
> 4. Add new 'N+' to TASK_STATE_TO_CHAR_STR. (how about 'n'?)
> 5. Treat a preempted task as THREAD_WAIT_CPU.
>
> so IMHO, it is better to split this patch into atleast 2, #1 and #2
> (critical bugfix),
> #3, #4, and #5 (minor update).

This time I will fix only the invalid array access and
adapting to the current kernel TASK_STATE, and leave the
preempted task handling for later follow-up.

 [..]
>> @@ -897,9 +898,10 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
>> +return bit < sizeof(str) - 1 ? str[bit] : '?';
>
> You'd better use ARRAY_SIZE(str) instead of sizeof() for array here.

OK, will change this to use ARRAY_SIZE on the next update.
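
For completeness: the two forms are equivalent for a char array, ARRAY_SIZE()
just states the intent more clearly (a simplified sketch of the macro, not the
exact tools/ definition):

    #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

    /* For the 13-character state string used later in this series,
     * const char str[] = "RSDTtXZxKWPNn":
     * sizeof(str) == 14 (13 state characters + trailing NUL) and
     * sizeof(str[0]) == 1, so ARRAY_SIZE(str) == 14 and both
     * "sizeof(str) - 1" and "ARRAY_SIZE(str) - 1" evaluate to 13. */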

Thanks,
Tomoki Sekiyama



[PATCH v3] perf sched: fix wrong conversion of task state

2016-07-28 Thread Tomoki Sekiyama
sched_out_state() converts the prev_state u64 bitmask to a char in
a wrong way, which may cause invalid memory access.
TASK_STATE_TO_CHAR_STR should also be updated to match the current
kernel's sched.h.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
---
 tools/perf/builtin-sched.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 0dfe8df..5776263 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -70,7 +70,7 @@ struct sched_atom {
struct task_desc*wakee;
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 enum thread_state {
THREAD_SLEEPING = 0,
@@ -897,9 +897,10 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 
 static char sched_out_state(u64 prev_state)
 {
-   const char *str = TASK_STATE_TO_CHAR_STR;
+   const char str[] = TASK_STATE_TO_CHAR_STR;
+   unsigned int bit = prev_state ? __ffs(prev_state) + 1 : 0;
 
-   return str[prev_state];
+   return bit < ARRAY_SIZE(str) - 1 ? str[bit] : '?';
 }
 
 static int
-- 
2.7.4



RE: [PATCH] perf sched: fix wrong conversion of task state

2016-07-27 Thread Tomoki Sekiyama
On 2016/7/28 1:50, David Ahern wrote:
>>>> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
>>>> index 0dfe8df..eb2f7f4 100644
>>>> --- a/tools/perf/builtin-sched.c
>>>> +++ b/tools/perf/builtin-sched.c
>>>> @@ -71,6 +71,7 @@ struct sched_atom {
>>>>  };
>>>>
>>>>  #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
>>>> +#define TASK_STATE_MASK 0x7ff
>>>
>>> The mask should not be needed and looking at top of tree there are 2 new
>>> states (N and n) that need to be added.
>>
>> I couldn't find the state for 'n'. Where can I find it?
>
> Check Linus' tree -- top of tree; commit 7dc603c9028ea.

Thanks, I was failing to git pull by some network reason..
Will add 'n' too.

>> I have rechecked the code and found that __trace_sched_switch_state() uses
>> TASK_STATE_MAX for preempted tasks.
>
> since TASK_STATE_MAX is kernel version dependent perf should not rely on
> it.

Hmm, that is true (actually 'n' is added recently).
But that means we cannot handle preemption correctly as far as
sched:sched_switch
event uses TASK_STATE_MAX to mark preempted tasks.

Should we stop using TASK_STATE_MAX for preempted tasks in ftrace and
use (1 << 63) or something that doesn't change on kernel version instead?


This causes 'perf sched latency' to miss some context switch information.
For example, running 2 'yes' commands (which infinitely call write(2)) on the
same cpu by the following steps results in an incomplete report:

$ taskset -c 0 yes > /dev/null &
$ taskset -c 0 yes > /dev/null &
# perf sched record sleep 1
# perf sched latency -p -C 0

like:

--
  Task  |   Runtime ms  | Switches | Average delay ms |
 --
  yes:14187 |499.705 ms |   17 | avg:   12.981 ms |
  yes:14188 |500.350 ms |   14 | avg:   12.023 ms |
  gnome-terminal-:12722 |  0.285 ms |3 | avg:0.025 ms |
...

where avg delay * switches for the yes commands does not cover the expected
total delay of 500 ms, although each delays the other's execution.
(12.981 ms * 17 = 220.677 ms << 500 ms)

With the patch, perf sched latency shows:
Now 12.838 ms * 39 = 500.682 ms as expected.

--
  Task  |   Runtime ms  | Switches | Average delay ms |
 --
  yes:14187 |499.705 ms |   39 | avg:   12.838 ms |
  yes:14188 |500.350 ms |   40 | avg:   12.506 ms |
  gnome-terminal-:12722 |  0.285 ms |3 | avg:0.025 ms |
...


Thanks,
Tomoki Sekiyama




[PATCH v2] perf sched: fix wrong conversion of task state

2016-07-27 Thread Tomoki Sekiyama
sched_out_state() converts the prev_state u64 bitmask to a char in
a wrong way, which may cause wrong results of 'perf sched latency'.
This patch fixes the conversion.
Also, preempted tasks must be treated as being in the
THREAD_WAIT_CPU state.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
---
 tools/perf/builtin-sched.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 0dfe8df..8651c36 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -70,7 +70,8 @@ struct sched_atom {
struct task_desc*wakee;
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
+/* TASK_STATE_MAX means the task is preempted(R+). Use '+' for it here. */
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN+"
 
 enum thread_state {
THREAD_SLEEPING = 0,
@@ -897,9 +898,10 @@ static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
 
 static char sched_out_state(u64 prev_state)
 {
-   const char *str = TASK_STATE_TO_CHAR_STR;
+   const char str[] = TASK_STATE_TO_CHAR_STR;
+   unsigned int bit = prev_state ? __ffs(prev_state) + 1 : 0;
 
-   return str[prev_state];
+   return bit < sizeof(str) - 1 ? str[bit] : '?';
 }
 
 static int
@@ -915,7 +917,7 @@ add_sched_out_event(struct work_atoms *atoms,
 
atom->sched_out_time = timestamp;
 
-   if (run_state == 'R') {
+   if (run_state == 'R' || run_state == '+') {
atom->state = THREAD_WAIT_CPU;
atom->wake_up_time = atom->sched_out_time;
}
-- 
2.7.4



[PATCH] perf sched: fix wrong conversion of task state

2016-07-27 Thread Tomoki Sekiyama
sched_out_state() converts the prev_state u64 bitmask to a char in
a wrong way, which may cause wrong results of 'perf sched latency'.
This patch fixes the conversion.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama...@hitachi.com>
Cc: Jiri Olsa <jo...@kernel.org>
Cc: David Ahern <dsah...@gmail.com>
Cc: Namhyung Kim <namhy...@kernel.org>
Cc: Peter Zijlstra <a.p.zijls...@chello.nl>
Cc: Masami Hiramatsu <mhira...@kernel.org>
---
 tools/perf/builtin-sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 0dfe8df..eb2f7f4 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -71,6 +71,7 @@ struct sched_atom {
 };
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
+#define TASK_STATE_MASK 0x7ff
 
 enum thread_state {
THREAD_SLEEPING = 0,
@@ -899,7 +900,7 @@ static char sched_out_state(u64 prev_state)
 {
const char *str = TASK_STATE_TO_CHAR_STR;
 
-   return str[prev_state];
+   return str[ffs(prev_state & TASK_STATE_MASK)];
 }
 
 static int
-- 
2.7.4



[PATCH] drivers/tty/hvc: don't free hvc_console_setup after init

2014-05-02 Thread Tomoki Sekiyama
When 'console=hvc0' is specified in the kernel parameters of an x86 KVM guest,
the hvc console is set up within a kthread. However, that causes a SEGV
and the boot fails when the driver is built into the kernel,
because hvc_console_setup() is currently annotated with '__init'. This
patch removes '__init' so the guest boots successfully with 'console=hvc0'.
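
For background, a sketch of why the annotation matters (illustrative only, not
the hvc code; the names below are made up): '__init' puts the function in
.init.text, which the kernel frees after boot via free_initmem(). That is fine
for a console registered during early boot, but hvc registers its console
later, from a kthread, once the backend shows up -- at that point a '__init'
->setup callback would point into already-freed memory. Keeping the callback
resident avoids the fault:

    #include <linux/console.h>
    #include <linux/errno.h>

    #define MY_MAX_CONSOLES 8          /* illustrative limit */

    /* Deliberately not '__init': may be called after .init.text is freed. */
    static int my_console_setup(struct console *co, char *options)
    {
            if (co->index < 0 || co->index >= MY_MAX_CONSOLES)
                    return -ENODEV;
            return 0;
    }

    static struct console my_console = {
            .name  = "mycon",
            .setup = my_console_setup,
            .flags = CON_PRINTBUFFER,
            .index = -1,
    };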

Signed-off-by: Tomoki Sekiyama 
---
 drivers/tty/hvc/hvc_console.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index 94f9e3a..0ff7fda 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -190,7 +190,7 @@ static struct tty_driver *hvc_console_device(struct console *c, int *index)
return hvc_driver;
 }
 
-static int __init hvc_console_setup(struct console *co, char *options)
+static int hvc_console_setup(struct console *co, char *options)
 {  
if (co->index < 0 || co->index >= MAX_NR_HVC_CONSOLES)
return -ENODEV;



Re: [PATCH v2 1/2] elevator: Fix a race in elevator switching and md device initialization

2013-10-09 Thread Tomoki Sekiyama
Hi all,


Is this patchset going to be merged into 3.12?

Thanks,
-- 
Tomoki

On 9/23/13 16:14 , "Tejun Heo"  wrote:

>Hello,
>
>On Mon, Sep 23, 2013 at 08:11:55PM +0000, Tomoki Sekiyama wrote:
>> >Hmm... why aren't we just changing elevator_init() to grab sysfs_lock
>> >where necessary?
>> 
>> The locking cannot be moved into elevator_init() because it is called
>> from elevator_switch() path, where the request_queue's sysfs_lock is
>> already taken.
>> 
>> > It'd be more consistent with elevator_exit() that way.
>> 
>> What elevator_exit() locks is elevator_queue's sysfs_lock, not
>> request_queue's sysfs_lock. What we need here is request_queue's
>> sysfs_lock.
>
>Ah, okay.
>
> Reviewed-by: Tejun Heo 
>
>Thanks.
>
>-- 
>tejun



Re: [PATCH v2 1/2] elevator: Fix a race in elevator switching and md device initialization

2013-09-23 Thread Tomoki Sekiyama
Hi Tejun,

Thank you for the review.

On 9/22/13 13:04 , "Tejun Heo"  wrote:

>On Fri, Aug 30, 2013 at 06:47:07PM -0400, Tomoki Sekiyama wrote:
>> @@ -739,9 +739,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
>>  
>>  q->sg_reserved_size = INT_MAX;
>>  
>> +/* Protect q->elevator from elevator_change */
>> +mutex_lock(&q->sysfs_lock);
>> +
>>  /* init elevator */
>> -if (elevator_init(q, NULL))
>> +if (elevator_init(q, NULL)) {
>> +mutex_unlock(&q->sysfs_lock);
>>  return NULL;
>> +}
>> +
>> +mutex_unlock(&q->sysfs_lock);
>> +
>>  return q;
>>  }
>>  EXPORT_SYMBOL(blk_init_allocated_queue);
>> diff --git a/block/elevator.c b/block/elevator.c
>> index 668394d..02d4390 100644
>> --- a/block/elevator.c
>> +++ b/block/elevator.c
>> @@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)
>>  struct elevator_type *e = NULL;
>>  int err;
>>  
>> +/*
>> + * q->sysfs_lock must be held to provide mutual exclusion between
>> + * elevator_switch() and here.
>> + */
>> +lockdep_assert_held(&q->sysfs_lock);
>> +
>>  if (unlikely(q->elevator))
>>  return 0;
>
>Hmm... why aren't we just changing elevator_init() to grab sysfs_lock
>where necessary?  

The locking cannot be moved into elevator_init() because it is called
from elevator_switch() path, where the request_queue's sysfs_lock is
already taken.

> It'd be more consistent with elevator_exit() that way.

What elevator_exit() locks is elevator_queue's sysfs_lock, not
request_queue's sysfs_lock. What we need here is request_queue's
sysfs_lock.
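
(For reference, elevator_exit() in this era looks roughly like the sketch below -- paraphrased from memory rather than quoted from a particular tree -- which shows that the mutex it takes belongs to the elevator_queue, not to the request_queue:)

	void elevator_exit(struct elevator_queue *e)
	{
		mutex_lock(&e->sysfs_lock);	/* elevator_queue's own lock */
		if (e->type->ops.elevator_exit_fn)
			e->type->ops.elevator_exit_fn(e);
		mutex_unlock(&e->sysfs_lock);

		kobject_put(&e->kobj);
	}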

>Thanks.
>
>-- 
>Tejun


Thanks,
Tomoki Sekiyama




Re: [PATCH v2 1/2] elevator: Fix a race in elevator switching and md device initialization

2013-09-06 Thread Tomoki Sekiyama
Ping: any comments for this series?


On 8/30/13 18:47 , "Tomoki Sekiyama"  wrote:

>The soft lockup below happens at the boot time of the system using dm
>multipath and the udev rules to switch scheduler.
>
>[  356.127001] BUG: soft lockup - CPU#3 stuck for 22s! [sh:483]
>[  356.127001] RIP: 0010:[<ffffffff81072a7d>]  [<ffffffff81072a7d>] lock_timer_base.isra.35+0x1d/0x50
>...
>[  356.127001] Call Trace:
>[  356.127001]  [<ffffffff81073810>] try_to_del_timer_sync+0x20/0x70
>[  356.127001]  [<ffffffff8118b08a>] ? kmem_cache_alloc_node_trace+0x20a/0x230
>[  356.127001]  [<ffffffff810738b2>] del_timer_sync+0x52/0x60
>[  356.127001]  [<ffffffff812ece22>] cfq_exit_queue+0x32/0xf0
>[  356.127001]  [<ffffffff812c98df>] elevator_exit+0x2f/0x50
>[  356.127001]  [<ffffffff812c9f21>] elevator_change+0xf1/0x1c0
>[  356.127001]  [<ffffffff812caa50>] elv_iosched_store+0x20/0x50
>[  356.127001]  [<ffffffff812d1d09>] queue_attr_store+0x59/0xb0
>[  356.127001]  [<ffffffff812143f6>] sysfs_write_file+0xc6/0x140
>[  356.127001]  [<ffffffff811a326d>] vfs_write+0xbd/0x1e0
>[  356.127001]  [<ffffffff811a3ca9>] SyS_write+0x49/0xa0
>[  356.127001]  [<ffffffff8164e899>] system_call_fastpath+0x16/0x1b
>
>This is caused by a race between md device initialization by multipathd
>and
>shell script to switch the scheduler using sysfs.
>
> - multipathd:
>   SyS_ioctl -> do_vfs_ioctl -> dm_ctl_ioctl -> ctl_ioctl -> table_load
>   -> dm_setup_md_queue -> blk_init_allocated_queue -> elevator_init
>q->elevator = elevator_alloc(q, e); // not yet initialized
>
> - sh -c 'echo deadline > /sys/$DEVPATH/queue/scheduler':
>   elevator_switch (in the call trace above)
>struct elevator_queue *old = q->elevator;
>q->elevator = elevator_alloc(q, new_e);
>elevator_exit(old); // lockup! (*)
>
> - multipathd: (cont.)
>err = e->ops.elevator_init_fn(q);   // init fails; q->elevator is modified
>
>(*) When del_timer_sync() is called, lock_timer_base() will loop infinitely
>while timer->base == NULL. In this case, as the timer will never be
>initialized, it results in lockup.
>
>This patch introduces acquisition of q->sysfs_lock around elevator_init()
>into blk_init_allocated_queue(), to provide mutual exclusion between
>initialization of the q->scheduler and switching of the scheduler.
>
>This should fix this bugzilla:
>https://bugzilla.redhat.com/show_bug.cgi?id=902012
>
>Signed-off-by: Tomoki Sekiyama 
>---
> block/blk-core.c |   10 +-
> block/elevator.c |6 ++
> 2 files changed, 15 insertions(+), 1 deletion(-)
>
>diff --git a/block/blk-core.c b/block/blk-core.c
>index 93a18d1..2f6275f 100644
>--- a/block/blk-core.c
>+++ b/block/blk-core.c
>@@ -739,9 +739,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
> 
>   q->sg_reserved_size = INT_MAX;
> 
>+  /* Protect q->elevator from elevator_change */
>+  mutex_lock(&q->sysfs_lock);
>+
>   /* init elevator */
>-  if (elevator_init(q, NULL))
>+  if (elevator_init(q, NULL)) {
>+  mutex_unlock(&q->sysfs_lock);
>   return NULL;
>+  }
>+
>+  mutex_unlock(&q->sysfs_lock);
>+
>   return q;
> }
> EXPORT_SYMBOL(blk_init_allocated_queue);
>diff --git a/block/elevator.c b/block/elevator.c
>index 668394d..02d4390 100644
>--- a/block/elevator.c
>+++ b/block/elevator.c
>@@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)
>   struct elevator_type *e = NULL;
>   int err;
> 
>+  /*
>+   * q->sysfs_lock must be held to provide mutual exclusion between
>+   * elevator_switch() and here.
>+   */
>+  lockdep_assert_held(&q->sysfs_lock);
>+
>   if (unlikely(q->elevator))
>   return 0;
> 
>



[PATCH v2 2/2] elevator: acquire q->sysfs_lock in elevator_change()

2013-08-30 Thread Tomoki Sekiyama
Add locking of q->sysfs_lock into elevator_change() (an exported function)
to ensure it is held to protect q->elevator from elevator_init(), even if
elevator_change() is called from non-sysfs paths.
sysfs path (elv_iosched_store) uses __elevator_change(), non-locking
version, as the lock is already taken by elv_iosched_store().

Signed-off-by: Tomoki Sekiyama 
---
 block/elevator.c |   16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 02d4390..6d765f7 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -965,7 +965,7 @@ fail_init:
 /*
  * Switch this queue to the given IO scheduler.
  */
-int elevator_change(struct request_queue *q, const char *name)
+static int __elevator_change(struct request_queue *q, const char *name)
 {
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
@@ -987,6 +987,18 @@ int elevator_change(struct request_queue *q, const char *name)
 
return elevator_switch(q, e);
 }
+
+int elevator_change(struct request_queue *q, const char *name)
+{
+   int ret;
+
+   /* Protect q->elevator from elevator_init() */
+   mutex_lock(&q->sysfs_lock);
+   ret = __elevator_change(q, name);
+   mutex_unlock(&q->sysfs_lock);
+
+   return ret;
+}
 EXPORT_SYMBOL(elevator_change);
 
 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
@@ -997,7 +1009,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
if (!q->elevator)
return count;
 
-   ret = elevator_change(q, name);
+   ret = __elevator_change(q, name);
if (!ret)
return count;
 



[PATCH v2 1/2] elevator: Fix a race in elevator switching and md device initialization

2013-08-30 Thread Tomoki Sekiyama
The soft lockup below happens at the boot time of the system using dm
multipath and the udev rules to switch scheduler.

[  356.127001] BUG: soft lockup - CPU#3 stuck for 22s! [sh:483]
[  356.127001] RIP: 0010:[<ffffffff81072a7d>]  [<ffffffff81072a7d>] lock_timer_base.isra.35+0x1d/0x50
...
[  356.127001] Call Trace:
[  356.127001]  [<ffffffff81073810>] try_to_del_timer_sync+0x20/0x70
[  356.127001]  [<ffffffff8118b08a>] ? kmem_cache_alloc_node_trace+0x20a/0x230
[  356.127001]  [<ffffffff810738b2>] del_timer_sync+0x52/0x60
[  356.127001]  [<ffffffff812ece22>] cfq_exit_queue+0x32/0xf0
[  356.127001]  [<ffffffff812c98df>] elevator_exit+0x2f/0x50
[  356.127001]  [<ffffffff812c9f21>] elevator_change+0xf1/0x1c0
[  356.127001]  [<ffffffff812caa50>] elv_iosched_store+0x20/0x50
[  356.127001]  [<ffffffff812d1d09>] queue_attr_store+0x59/0xb0
[  356.127001]  [<ffffffff812143f6>] sysfs_write_file+0xc6/0x140
[  356.127001]  [<ffffffff811a326d>] vfs_write+0xbd/0x1e0
[  356.127001]  [<ffffffff811a3ca9>] SyS_write+0x49/0xa0
[  356.127001]  [<ffffffff8164e899>] system_call_fastpath+0x16/0x1b

This is caused by a race between md device initialization by multipathd and
shell script to switch the scheduler using sysfs.

 - multipathd:
   SyS_ioctl -> do_vfs_ioctl -> dm_ctl_ioctl -> ctl_ioctl -> table_load
   -> dm_setup_md_queue -> blk_init_allocated_queue -> elevator_init
q->elevator = elevator_alloc(q, e); // not yet initialized

 - sh -c 'echo deadline > /sys/$DEVPATH/queue/scheduler':
   elevator_switch (in the call trace above)
struct elevator_queue *old = q->elevator;
q->elevator = elevator_alloc(q, new_e);
elevator_exit(old); // lockup! (*)

 - multipathd: (cont.)
err = e->ops.elevator_init_fn(q);   // init fails; q->elevator is modified

(*) When del_timer_sync() is called, lock_timer_base() will loop infinitely
while timer->base == NULL. In this case, as the timer will never be initialized,
it results in lockup.

This patch introduces acquisition of q->sysfs_lock around elevator_init()
into blk_init_allocated_queue(), to provide mutual exclusion between
initialization of the q->scheduler and switching of the scheduler.

This should fix this bugzilla:
https://bugzilla.redhat.com/show_bug.cgi?id=902012

Signed-off-by: Tomoki Sekiyama 
---
 block/blk-core.c |   10 +-
 block/elevator.c |6 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 93a18d1..2f6275f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -739,9 +739,17 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 
q->sg_reserved_size = INT_MAX;
 
+   /* Protect q->elevator from elevator_change */
+   mutex_lock(&q->sysfs_lock);
+
/* init elevator */
-   if (elevator_init(q, NULL))
+   if (elevator_init(q, NULL)) {
+   mutex_unlock(&q->sysfs_lock);
return NULL;
+   }
+
+   mutex_unlock(&q->sysfs_lock);
+
return q;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
diff --git a/block/elevator.c b/block/elevator.c
index 668394d..02d4390 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -186,6 +186,12 @@ int elevator_init(struct request_queue *q, char *name)
struct elevator_type *e = NULL;
int err;
 
+   /*
+* q->sysfs_lock must be held to provide mutual exclusion between
+* elevator_switch() and here.
+*/
+   lockdep_assert_held(&q->sysfs_lock);
+
if (unlikely(q->elevator))
return 0;
 



Re: [PATCH] elevator: Fix a race in elevator switching and md device initialization

2013-08-29 Thread Tomoki Sekiyama
On 8/29/13 16:29 , "Vivek Goyal"  wrote:

>On Mon, Aug 26, 2013 at 09:45:15AM -0400, Tomoki Sekiyama wrote:
>> The soft lockup below happens at the boot time of the system using dm
>> multipath and automated elevator switching udev rules.
>> 
>> [  356.127001] BUG: soft lockup - CPU#3 stuck for 22s! [sh:483]
>> [  356.127001] RIP: 0010:[<ffffffff81072a7d>]  [<ffffffff81072a7d>] lock_timer_base.isra.35+0x1d/0x50
>> ...
>> [  356.127001] Call Trace:
>> [  356.127001]  [<ffffffff81073810>] try_to_del_timer_sync+0x20/0x70
>> [  356.127001]  [<ffffffff8118b08a>] ? kmem_cache_alloc_node_trace+0x20a/0x230
>> [  356.127001]  [<ffffffff810738b2>] del_timer_sync+0x52/0x60
>> [  356.127001]  [<ffffffff812ece22>] cfq_exit_queue+0x32/0xf0
>> [  356.127001]  [<ffffffff812c98df>] elevator_exit+0x2f/0x50
>> [  356.127001]  [<ffffffff812c9f21>] elevator_change+0xf1/0x1c0
>> [  356.127001]  [<ffffffff812caa50>] elv_iosched_store+0x20/0x50
>> [  356.127001]  [<ffffffff812d1d09>] queue_attr_store+0x59/0xb0
>> [  356.127001]  [<ffffffff812143f6>] sysfs_write_file+0xc6/0x140
>> [  356.127001]  [<ffffffff811a326d>] vfs_write+0xbd/0x1e0
>> [  356.127001]  [<ffffffff811a3ca9>] SyS_write+0x49/0xa0
>> [  356.127001]  [<ffffffff8164e899>] system_call_fastpath+0x16/0x1b
>> 
>> This is caused by a race between md device initialization and sysfs knob
>> to switch the scheduler.
>
>I think we can also improve changelog a bit. So IIUC, softlockup
>happens because one thread called del_timer_sync() on a timer which
>was not even initilized. Timer initialization should have happened
>in cfq_init_queue() using init_timer(). But before init_timer()
>could be called, elevator switch path called del_timer_sync().
>
>del_timer_sync() in turn calls lock_timer_base() which will loop
>infinitely if timer->base == NULL. And because we have not called
>init_timer() yet, I am assuming timer->base is null?
>
>Is this right analysis? If yes, then this patch should most likely
>fix following bz.
>
>https://bugzilla.redhat.com/show_bug.cgi?id=902012

I think your analysis is correct. If del_timer_sync() is called right after
cfqd is allocated (with __GFP_ZERO), timer->base == NULL. Otherwise it may
hit NULL pointer.

The other reason for a NULL timer->base is that the timer is migrating in
__mod_timer(), but then it must be set back to non-NULL in a short time.

Maybe __mod_timer should use some illegal pointer value (like LIST_POISON1)
instead of NULL to represent that the timer is migrating.
Actually, when I changed __mod_timer to timer_set_base(timer, 0xdeadbeaf),
made lock_timer_base wait while base == 0xdeadbeaf, and
added BUG_ON(!timer->base) to lock_timer_base, my system hit the BUG.
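
(For context, a simplified, paraphrased sketch of lock_timer_base() in this era
-- not quoted from a particular tree -- showing why del_timer_sync() spins
forever on a zeroed, never-initialized timer:)

	static struct tvec_base *lock_timer_base(struct timer_list *timer,
						 unsigned long *flags)
	{
		struct tvec_base *base;

		for (;;) {
			struct tvec_base *prelock_base = timer->base;
			base = tbase_get_base(prelock_base);
			if (likely(base != NULL)) {
				spin_lock_irqsave(&base->lock, *flags);
				if (likely(prelock_base == timer->base))
					return base;
				/* the timer has migrated to another CPU */
				spin_unlock_irqrestore(&base->lock, *flags);
			}
			/* base == NULL: migrating, or never init_timer()'d --
			 * in the latter case this never terminates */
			cpu_relax();
		}
	}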

>I had concluded that some how timer->base is NULL but could not understand
>how come timer base is NULL when we have called init_timer() on it.
>
>Thanks
>Vivek

Thanks,
Tomoki



Re: [PATCH] elevator: Fix a race in elevator switching and md device initialization

2013-08-29 Thread Tomoki Sekiyama
On 8/29/13 14:43 , "Vivek Goyal"  wrote:
>On Thu, Aug 29, 2013 at 02:33:10PM -0400, Vivek Goyal wrote:
>> On Mon, Aug 26, 2013 at 09:45:15AM -0400, Tomoki Sekiyama wrote:
>> > The soft lockup below happes at the boot time of the system using dm
>> > multipath and automated elevator switching udev rules.
>> > 
>> > [  356.127001] BUG: soft lockup - CPU#3 stuck for 22s! [sh:483]
>> > [  356.127001] RIP: 0010:[<ffffffff81072a7d>]  [<ffffffff81072a7d>] lock_timer_base.isra.35+0x1d/0x50
>> > ...
>> > [  356.127001] Call Trace:
>> > [  356.127001]  [<ffffffff81073810>] try_to_del_timer_sync+0x20/0x70
>> > [  356.127001]  [<ffffffff8118b08a>] ? kmem_cache_alloc_node_trace+0x20a/0x230
>> > [  356.127001]  [<ffffffff810738b2>] del_timer_sync+0x52/0x60
>> > [  356.127001]  [<ffffffff812ece22>] cfq_exit_queue+0x32/0xf0
>> > [  356.127001]  [<ffffffff812c98df>] elevator_exit+0x2f/0x50
>> > [  356.127001]  [<ffffffff812c9f21>] elevator_change+0xf1/0x1c0
>> > [  356.127001]  [<ffffffff812caa50>] elv_iosched_store+0x20/0x50
>> > [  356.127001]  [<ffffffff812d1d09>] queue_attr_store+0x59/0xb0
>> > [  356.127001]  [<ffffffff812143f6>] sysfs_write_file+0xc6/0x140
>> > [  356.127001]  [<ffffffff811a326d>] vfs_write+0xbd/0x1e0
>> > [  356.127001]  [<ffffffff811a3ca9>] SyS_write+0x49/0xa0
>> > [  356.127001]  [<ffffffff8164e899>] system_call_fastpath+0x16/0x1b
>> > 
>> 
>> Tokomi, 
>> 
>> As you noticed, there is a fedora bug open with similar signature. May
>> be this patch will fix that issue also.
>> 
>> https://bugzilla.redhat.com/show_bug.cgi?id=902012
>> 
>> 
>> > This is caused by a race between md device initialization and sysfs
>>knob
>> > to switch the scheduler.
>> > 
>> > * multipathd:
>> >  SyS_ioctl -> do_vfs_ioctl -> dm_ctl_ioctl -> ctl_ioctl ->  table_load
>> >   -> dm_setup_md_queue -> blk_init_allocated_queue -> elevator_init:
>> > 
>> > q->elevator = elevator_alloc(q, e); // not yet initialized
>> > 
>> > * sh -c 'echo deadline > /sys/$DEVPATH/queue/scheduler'
>> >  SyS_write -> vfs_write -> sysfs_write_file -> queue_attr_store
>> >  ( mutex_lock(>sysfs_lock) here. )
>> >   -> elv_iosched_store -> elevator_change:
>> > 
>> >   elevator_exit(old); // try to de-init uninitialized elevator and hang up
>> > 
>
>If problem in this case is that we are trying to exit() the elevator
>which has not been properly initialized, then we should not attach
>the elevator to the queue yet.
>
>In cfq_init_queue(), can we move following code towards the end of
>function.
>
>spin_lock_irq(q->queue_lock);
>q->elevator = eq;
>spin_unlock_irq(q->queue_lock);
>
>So till elevator is initialized, we will not attach it to queue and
>elevator_switch() will return as it will not find a valid elevator
>on the queue.
>
>
>elevator_change() {
>   if (!q->elevator)
>return -ENXIO;
>}
>
>Thanks
>Vivek

I think that also works, though I prefer introducing explicit locking,
as you said, so that the code won't break again in the future.

Thanks,
Tomoki



Re: [PATCH] elevator: Fix a race in elevator switching and md device initialization

2013-08-29 Thread Tomoki Sekiyama
Hi vivek,

Thanks for your comments.

On 8/29/13 14:33 , "Vivek Goyal"  wrote:

>On Mon, Aug 26, 2013 at 09:45:15AM -0400, Tomoki Sekiyama wrote:
>> The soft lockup below happes at the boot time of the system using dm
>> multipath and automated elevator switching udev rules.
>> 
>> [  356.127001] BUG: soft lockup - CPU#3 stuck for 22s! [sh:483]
>> [  356.127001] RIP: 0010:[<ffffffff81072a7d>]  [<ffffffff81072a7d>] lock_timer_base.isra.35+0x1d/0x50
>> ...
>> [  356.127001] Call Trace:
>> [  356.127001]  [<ffffffff81073810>] try_to_del_timer_sync+0x20/0x70
>> [  356.127001]  [<ffffffff8118b08a>] ? kmem_cache_alloc_node_trace+0x20a/0x230
>> [  356.127001]  [<ffffffff810738b2>] del_timer_sync+0x52/0x60
>> [  356.127001]  [<ffffffff812ece22>] cfq_exit_queue+0x32/0xf0
>> [  356.127001]  [<ffffffff812c98df>] elevator_exit+0x2f/0x50
>> [  356.127001]  [<ffffffff812c9f21>] elevator_change+0xf1/0x1c0
>> [  356.127001]  [<ffffffff812caa50>] elv_iosched_store+0x20/0x50
>> [  356.127001]  [<ffffffff812d1d09>] queue_attr_store+0x59/0xb0
>> [  356.127001]  [<ffffffff812143f6>] sysfs_write_file+0xc6/0x140
>> [  356.127001]  [<ffffffff811a326d>] vfs_write+0xbd/0x1e0
>> [  356.127001]  [<ffffffff811a3ca9>] SyS_write+0x49/0xa0
>> [  356.127001]  [<ffffffff8164e899>] system_call_fastpath+0x16/0x1b
>> 
>
>Tokomi, 
>
>As you noticed, there is a fedora bug open with similar signature. May
>be this patch will fix that issue also.
>
>https://bugzilla.redhat.com/show_bug.cgi?id=902012
>
>
>> This is caused by a race between md device initialization and sysfs knob
>> to switch the scheduler.
>> 
>> * multipathd:
>>  SyS_ioctl -> do_vfs_ioctl -> dm_ctl_ioctl -> ctl_ioctl ->  table_load
>>   -> dm_setup_md_queue -> blk_init_allocated_queue -> elevator_init:
>>
>> q->elevator = elevator_alloc(q, e); // not yet initialized
>> 
>>
>>* sh -c 'echo deadline > /sys/$DEVPATH/queue/scheduler'
>>  SyS_write -> vfs_write -> sysfs_write_file -> queue_attr_store
>>  ( mutex_lock(>sysfs_lock) here. )
>>   -> elv_iosched_store -> elevator_change:
>> 
>>
>>   elevator_exit(old); // try to de-init uninitialized elevator and hang up
>> 
>>
>>This patch adds acquisition of q->sysfs_lock in
>>blk_init_allocated_queue().
>> This also adds the lock into elevator_change() to ensure locking from
>>the
>> other path, as it is exposed function (and queue_attr_store will uses
>> __elevator_change() now, the non-locking version of elevator_change()).
>
>I think introducing __elevator_change() is orthogonal to this problem.
>May be keep that in a separate patch.

OK, I will split it into 2 patches.


>>  block/blk-core.c |6 +-
>>  block/elevator.c |   16 ++--
>>  2 files changed, 19 insertions(+), 3 deletions(-)
>> 
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 93a18d1..2323ec3 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -739,9 +739,13 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
>>  
>>  q->sg_reserved_size = INT_MAX;
>>  
>> +/* Protect q->elevator from elevator_change */
>> +mutex_lock(&q->sysfs_lock);
>>  /* init elevator */
>>  if (elevator_init(q, NULL))
>> -return NULL;
>> +q = NULL;
>> +mutex_unlock(&q->sysfs_lock);
>> +
>
>So core of the problem is, what's the locking semantics to make sure
>that we are not trying to switch elevator while it is still initializing.
>IOW, should we allow multiple parallel calls of elevator_init_fn() on a
>queue and is it safe?
>
>I would argue that it is easier to read and maintain the code if we
>provide explicit locking around. So I like the idea of introducing
>some locking around elevator_init().
>
>Because we are racing against elevator switch path which takes
>q->sysfs_lock, it makes sense to provide mutual exclusion using
>q->sysfs_lock.
>
>What I don't know is that can we take mutex in queue init path. Generally
>drivers call it and do they expect that they can call this function
>while holding a spin lock.

As elevator_alloc() allocates memory with GFP_KERNEL, elevator_init() might
sleep. So it should be safe to use a mutex here.
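
(A minimal illustration, paraphrased rather than quoted from a particular tree:
elevator_alloc() starts out roughly as below, and a GFP_KERNEL allocation may
block, so callers of elevator_init() must already be in sleepable process
context, where taking q->sysfs_lock, a mutex, is also fine.)

	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
	if (unlikely(!eq))
		return NULL;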
 
>I am CCing Tejun also to the thread. He also might have some ideas here.
>
>Thanks
>Vivek

Thanks,
Tomoki Sekiyama




[PATCH] elevator: Fix a race in elevator switching and md device initialization

2013-08-26 Thread Tomoki Sekiyama
The soft lockup below happens at the boot time of the system using dm
multipath and automated elevator switching udev rules.

[  356.127001] BUG: soft lockup - CPU#3 stuck for 22s! [sh:483]
[  356.127001] RIP: 0010:[<ffffffff81072a7d>]  [<ffffffff81072a7d>] lock_timer_base.isra.35+0x1d/0x50
...
[  356.127001] Call Trace:
[  356.127001]  [<ffffffff81073810>] try_to_del_timer_sync+0x20/0x70
[  356.127001]  [<ffffffff8118b08a>] ? kmem_cache_alloc_node_trace+0x20a/0x230
[  356.127001]  [<ffffffff810738b2>] del_timer_sync+0x52/0x60
[  356.127001]  [<ffffffff812ece22>] cfq_exit_queue+0x32/0xf0
[  356.127001]  [<ffffffff812c98df>] elevator_exit+0x2f/0x50
[  356.127001]  [<ffffffff812c9f21>] elevator_change+0xf1/0x1c0
[  356.127001]  [<ffffffff812caa50>] elv_iosched_store+0x20/0x50
[  356.127001]  [<ffffffff812d1d09>] queue_attr_store+0x59/0xb0
[  356.127001]  [<ffffffff812143f6>] sysfs_write_file+0xc6/0x140
[  356.127001]  [<ffffffff811a326d>] vfs_write+0xbd/0x1e0
[  356.127001]  [<ffffffff811a3ca9>] SyS_write+0x49/0xa0
[  356.127001]  [<ffffffff8164e899>] system_call_fastpath+0x16/0x1b

This is caused by a race between md device initialization and sysfs knob
to switch the scheduler.

* multipathd:
 SyS_ioctl -> do_vfs_ioctl -> dm_ctl_ioctl -> ctl_ioctl ->  table_load
  -> dm_setup_md_queue -> blk_init_allocated_queue -> elevator_init:

q->elevator = elevator_alloc(q, e); // not yet initialized

* sh -c 'echo deadline > /sys/$DEVPATH/queue/scheduler'
 SyS_write -> vfs_write -> sysfs_write_file -> queue_attr_store
 ( mutex_lock(>sysfs_lock) here. )
  -> elv_iosched_store -> elevator_change:

  elevator_exit(old); // try to de-init uninitialized elevator and hang up

This patch adds acquisition of q->sysfs_lock in blk_init_allocated_queue().
This also adds the lock into elevator_change() to ensure locking from the
other path, as it is an exported function (and queue_attr_store now uses
__elevator_change(), the non-locking version of elevator_change()).

Signed-off-by: Tomoki Sekiyama 
---
 block/blk-core.c |6 +-
 block/elevator.c |   16 ++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 93a18d1..2323ec3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -739,9 +739,13 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 
q->sg_reserved_size = INT_MAX;
 
+   /* Protect q->elevator from elevator_change */
+   mutex_lock(&q->sysfs_lock);
/* init elevator */
if (elevator_init(q, NULL))
-   return NULL;
+   q = NULL;
+   mutex_unlock(&q->sysfs_lock);
+
return q;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
diff --git a/block/elevator.c b/block/elevator.c
index 668394d..5232565 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -959,7 +959,7 @@ fail_init:
 /*
  * Switch this queue to the given IO scheduler.
  */
-int elevator_change(struct request_queue *q, const char *name)
+static int __elevator_change(struct request_queue *q, const char *name)
 {
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
@@ -981,6 +981,18 @@ int elevator_change(struct request_queue *q, const char *name)
 
return elevator_switch(q, e);
 }
+
+int elevator_change(struct request_queue *q, const char *name)
+{
+   int ret;
+
+   /* Protect q->elevator from blk_init_allocated_queue() */
+   mutex_lock(&q->sysfs_lock);
+   ret = __elevator_change(q, name);
+   mutex_unlock(&q->sysfs_lock);
+
+   return ret;
+}
 EXPORT_SYMBOL(elevator_change);
 
 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
@@ -991,7 +1003,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
if (!q->elevator)
return count;
 
-   ret = elevator_change(q, name);
+   ret = __elevator_change(q, name);
if (!ret)
return count;
 



Re: [RFC PATCH] cfq-iosched: limit slice_idle when many busy queues are in idle window

2013-08-05 Thread Tomoki Sekiyama
On 8/1/13 17:04 , "Jens Axboe"  wrote:

>On 08/01/2013 02:28 PM, Tomoki Sekiyama wrote:
>> On 7/30/13 10:09 PM, Shaohua Li wrote:
>>> On Tue, Jul 30, 2013 at 03:30:33PM -0400, Tomoki Sekiyama wrote:
>>>> Hi,
>>>>
>>>> When some application launches several hundreds of processes that
>>>>issue
>>>> only a few small sync I/O requests, CFQ may cause heavy latencies
>>>> (10+ seconds at the worst case), although the request rate is low
>>>>enough for
>>>> the disk to handle it without waiting. This is because CFQ waits for
>>>> slice_idle (default:8ms) every time before processing each request,
>>>>until
>>>> their thinktimes are evaluated.
>>>>
>>>> This scenario can be reproduced using fio with parameters below:
>>>>   fio -filename=/tmp/test -rw=randread -size=5G -runtime=15
>>>>-name=file1 \
>>>>   -bs=4k -numjobs=500 -thinktime=100
>>>> In this case, 500 processes issue a random read request every second.
>>>
>>> For this workload CFQ should perfectly detect it's a seek queue and
>>>disable
>>> idle. I suppose the reason is CFQ hasn't enough data/time to disable
>>>idle yet,
>>> since your thinktime is long and runtime is short.
>> 
>> Right, CFQ will learn the patten, but it takes too long time to reach
>>stable
>> performance when a lot of I/O processes are launched.
>> 
>>> I thought the real problem here is cfq_init_cfqq() shouldn't set
>>>idle_window
>>> when initializing a queue. We should enable idle window after we
>>>detect the
>>> queue is worthy idle.
>> 
>> Do you think the patch below is appropriate? Or should we check whether
>> busy_idle_queues in my original patch is high enough and only then
>> disable default idle_window in cfq_init_cfqq()?
>> 
>>> Thanks,
>>> Shaohua
>> 
>> Thanks,
>> Tomoki Sekiyama
>> 
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index d5cd313..abbe28f 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -3514,11 +3514,8 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>>  
>>  cfq_mark_cfqq_prio_changed(cfqq);
>>  
>> -if (is_sync) {
>> -if (!cfq_class_idle(cfqq))
>> -cfq_mark_cfqq_idle_window(cfqq);
>> +if (is_sync)
>>  cfq_mark_cfqq_sync(cfqq);
>> -}
>>  cfqq->pid = pid;
>>  }
>
>I do agree in principle with this, but now you are going to have the
>reverse problem where idling workloads take longer to reach their
>natural steady state. It could probably be argued that they should
>converge quicker, however, in which case it's probably a good change.

Even with this change, an idling workload should soon be judged worth the
idle_window if its I/O rate is not too high and its think time is low enough.
When the I/O rate is high, it might be judged not worth idling for because
the thinktimes get overestimated (although I couldn't find a pattern that
actually lost performance because of that, as far as I tried).

How about fairness? Doesn't this put new processes at a disadvantage?
If the unfairness introduced by this change is unacceptable, it might help
to mitigate it by adding a condition like
 "the number of busy queues marked idle_window in the group == 0"
before marking idle_window by default, as sketched below.
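
(A rough sketch of that idea, not a posted patch; busy_idle_queues is a
hypothetical counter maintained elsewhere along the lines of the original RFC,
and the check is simplified to per-device rather than per-group. It would
replace the hunk in cfq_init_cfqq() quoted above:)

	if (is_sync) {
		/* default to idling only while no other busy queue is idling */
		if (!cfq_class_idle(cfqq) && cfqd->busy_idle_queues == 0)
			cfq_mark_cfqq_idle_window(cfqq);
		cfq_mark_cfqq_sync(cfqq);
	}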


Thanks,
Tomoki Sekiyama



Re: [RFC PATCH] cfq-iosched: limit slice_idle when many busy queues are in idle window

2013-08-01 Thread Tomoki Sekiyama
On 7/30/13 10:09 PM, Shaohua Li wrote:
> On Tue, Jul 30, 2013 at 03:30:33PM -0400, Tomoki Sekiyama wrote:
>> Hi,
>>
>> When some application launches several hundreds of processes that issue
>> only a few small sync I/O requests, CFQ may cause heavy latencies
>> (10+ seconds at the worst case), although the request rate is low enough for
>> the disk to handle it without waiting. This is because CFQ waits for
>> slice_idle (default:8ms) every time before processing each request, until
>> their thinktimes are evaluated.
>>
>> This scenario can be reproduced using fio with parameters below:
>>   fio -filename=/tmp/test -rw=randread -size=5G -runtime=15 -name=file1 \
>>   -bs=4k -numjobs=500 -thinktime=100
>> In this case, 500 processes issue a random read request every second.
> 
> For this workload CFQ should perfectly detect it's a seek queue and disable
> idle. I suppose the reason is CFQ hasn't enough data/time to disable idle yet,
> since your thinktime is long and runtime is short.

Right, CFQ will learn the pattern, but it takes too long to reach stable
performance when a lot of I/O processes are launched.

> I thought the real problem here is cfq_init_cfqq() shouldn't set idle_window
> when initializing a queue. We should enable idle window after we detect the
> queue is worthy idle.

Do you think the patch below is appropriate? Or should we check whether
busy_idle_queues in my original patch is high enough and only then
disable default idle_window in cfq_init_cfqq()?

> Thanks,
> Shaohua

Thanks,
Tomoki Sekiyama

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd313..abbe28f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3514,11 +3514,8 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct 
cfq_queue *cfqq,
 
cfq_mark_cfqq_prio_changed(cfqq);
 
-   if (is_sync) {
-   if (!cfq_class_idle(cfqq))
-   cfq_mark_cfqq_idle_window(cfqq);
+   if (is_sync)
cfq_mark_cfqq_sync(cfqq);
-   }
cfqq->pid = pid;
 }



[RFC PATCH] cfq-iosched: limit slice_idle when many busy queues are in idle window

2013-07-30 Thread Tomoki Sekiyama
Hi,

When some application launches several hundreds of processes that issue
only a few small sync I/O requests, CFQ may cause heavy latencies
(10+ seconds at the worst case), although the request rate is low enough for
the disk to handle it without waiting. This is because CFQ waits for
slice_idle (default:8ms) every time before processing each request, until
their thinktimes are evaluated.

This scenario can be reproduced using fio with parameters below:
  fio -filename=/tmp/test -rw=randread -size=5G -runtime=15 -name=file1 \
  -bs=4k -numjobs=500 -thinktime=100
In this case, 500 processes issue a random read request every second.

This problem can be avoided by setting slice_idle to 0, but that risks
hurting throughput performance on S-ATA disks.

This patch tries to reduce the effect of slice_idle automatically when a
lot of busy queues are waiting in the idle window.
It adds to cfq_data a counter (busy_idle_queues) of busy queues that are in
the idle window. If (busy_idle_queues * slice_idle) exceeds the slice
allocated to the group, the idle wait time is limited to
(group_slice / busy_idle_queues).
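
For illustration only (assumed numbers, not measurements): with the fio job
above there are roughly 500 busy queues still marked idle_window, so if the
group slice were, say, 300ms, the per-queue idle wait would be capped at
300ms / 500 = 0.6ms instead of the default 8ms.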

Without this patch, fio benchmark with parameters above to an ext4
partition on a S-ATA HDD results in:
 read : io=20140KB, bw=1258.5KB/s, iops=314 , runt= 16004msec
 clat (usec): min=4 , max=6494.9K, avg=541264.54, stdev=993834.12

With this patch:
  read : io=28040KB, bw=1750.1KB/s, iops=437 , runt= 16014msec
  clat (usec): min=4 , max=2837.2K, avg=110236.79, stdev=303351.72

Average latency is reduced by 80%, and max is also reduced by 56%.

Any comments are appreciated.

Signed-off-by: Tomoki Sekiyama 
---
 block/cfq-iosched.c |   36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd313..77ac27e80 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -329,6 +329,7 @@ struct cfq_data {
 
unsigned int busy_queues;
unsigned int busy_sync_queues;
+   unsigned int busy_idle_queues; /* busy but with idle window */
 
int rq_in_driver;
int rq_in_flight[2];
@@ -446,6 +447,20 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
+static inline void cfq_set_cfqq_idle_window(struct cfq_data *cfqd,
+   struct cfq_queue *cfqq, bool idle)
+{
+   if (idle) {
+   cfq_mark_cfqq_idle_window(cfqq);
+   if (cfq_cfqq_on_rr(cfqq))
+   cfqd->busy_idle_queues++;
+   } else {
+   cfq_clear_cfqq_idle_window(cfqq);
+   if (cfq_cfqq_on_rr(cfqq))
+   cfqd->busy_idle_queues--;
+   }
+}
+
 static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 {
return pd ? container_of(pd, struct cfq_group, pd) : NULL;
@@ -2164,6 +2179,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct 
cfq_queue *cfqq)
cfqd->busy_queues++;
if (cfq_cfqq_sync(cfqq))
cfqd->busy_sync_queues++;
+   if (cfq_cfqq_idle_window(cfqq))
+   cfqd->busy_idle_queues++;
 
cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -2192,6 +2209,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct 
cfq_queue *cfqq)
cfqd->busy_queues--;
if (cfq_cfqq_sync(cfqq))
cfqd->busy_sync_queues--;
+   if (cfq_cfqq_idle_window(cfqq))
+   cfqd->busy_idle_queues--;
 }
 
 /*
@@ -2761,6 +2780,16 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
else
sl = cfqd->cfq_slice_idle;
 
+   /*
+* If there are too many queues with idle window, slice idle can cause
+* unacceptable latency, so we reduce slice idle here.
+*/
+   if (cfqd->busy_idle_queues) {
+   unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
+   unsigned long limit = group_slice / cfqd->busy_idle_queues;
+   sl = min(sl, limit);
+   }
+
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfqg_stats_set_start_idle_time(cfqq->cfqg);
cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
@@ -3091,7 +3120,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data 
*cfqd)
(cfq_cfqq_slice_new(cfqq) ||
(cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
cfq_clear_cfqq_deep(cfqq);
-   cfq_clear_cfqq_idle_window(cfqq);
+   cfq_set_cfqq_idle_window(cfqd, cfqq, false);
}
 
if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
@@ -3742,10 +3771,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct 
cfq_queue *cfqq,
 
if (old_idle != enable_idle) {
cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
-   if (enable_idle)
-   cfq_mark_cfqq_idle_window(cfqq);
-   else

Re: Re: [RFC v2 PATCH 04/21] x86: Avoid RCU warnings on slave CPUs

2012-09-28 Thread Tomoki Sekiyama
Hi Paul,

Thank you for your comments, and sorry for my late reply.

On 2012/09/21 2:34, Paul E. McKenney wrote:

> On Thu, Sep 06, 2012 at 08:27:40PM +0900, Tomoki Sekiyama wrote:
>> Initialize rcu related variables to avoid warnings about RCU usage while
>> slave CPUs is running specified functions. Also notify RCU subsystem before
>> the slave CPU is entered into idle state.
> 
> Hello, Tomoki,
> 
> A few questions and comments interspersed below.
>> <snip>
>> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
>> index e8cfe377..45dfc1d 100644
>> --- a/arch/x86/kernel/smpboot.c
>> +++ b/arch/x86/kernel/smpboot.c
>> @@ -382,6 +382,8 @@ notrace static void __cpuinit start_slave_cpu(void 
>> *unused)
>>  f = per_cpu(slave_cpu_func, cpu);
>>  per_cpu(slave_cpu_func, cpu).func = NULL;
>>
>> +rcu_note_context_switch(cpu);
>> +
> 
> Why not use rcu_idle_enter() and rcu_idle_exit()?  These would tell
> RCU to ignore the slave CPU for the duration of its idle period.
> The way you have it, if a slave CPU stayed idle for too long, you
> would get RCU CPU stall warnings, and possibly system hangs as well. 

That's true, rcu_idle_enter() and rcu_idle_exit() should be used when
the slave cpu is idle. Thanks.
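
For example, something along these lines (a rough sketch only; the actual
wait loop in start_slave_cpu() may be structured differently, and the wait
mechanism shown is an assumption):

	/* slave CPU waiting for the next function to run */
	rcu_idle_enter();
	while (!per_cpu(slave_cpu_func, cpu).func)
		native_safe_halt();	/* assumed idle/wait mechanism */
	rcu_idle_exit();

	f = per_cpu(slave_cpu_func, cpu);
	per_cpu(slave_cpu_func, cpu).func = NULL;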

> Or is this being called from some task that is not the idle task?
> If so, you instead want the new rcu_user_enter() and rcu_user_exit()
> that are hopefully on their way into 3.7.  Or maybe better, use a real
> idle task, so that idle_task(smp_processor_id()) returns true and RCU
> stops complaining.  ;-)
>
> Note that CPUs that RCU believes to be idle are not permitted to contain
> RCU read-side critical sections, which in turn means no entering the
> scheduler, no sleeping, and so on.  There is an RCU_NONIDLE() macro
> to tell RCU to pay attention to the CPU only for the duration of the
> statement passed to RCU_NONIDLE, and there are also an _rcuidle variant
> of the tracing statement to allow tracing from idle. 

This was because KVM is called as `func', which contains RCU read-side
critical sections and calls rcu_virt_note_context_switch() (that is,
rcu_note_context_switch(cpu)) before entering the guest.
Maybe it should be replaced by rcu_user_enter() and rcu_user_exit() in the
future.

>> --- a/kernel/rcutree.c
>> +++ b/kernel/rcutree.c
>> @@ -2589,6 +2589,9 @@ static int __cpuinit rcu_cpu_notify(struc 
>> tnotifier_block *self,
>>  switch (action) {
>>  case CPU_UP_PREPARE:
>>  case CPU_UP_PREPARE_FROZEN:
>> +#ifdef CONFIG_SLAVE_CPU
>> +case CPU_SLAVE_UP_PREPARE:
>> +#endif
> 
> Why do you need #ifdef here?  Why not define CPU_SLAVE_UP_PREPARE
> unconditionally?  Then if CONFIG_SLAVE_CPU=n, rcu_cpu_notify() would
> never be invoked with CPU_SLAVE_UP_PREPARE, so no problems. 

Agreed. That will make the code simpler.
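
E.g. (sketch only; the actual name and value of the notifier action are
whatever the slave-CPU series defines):

	/* in the header defining slave-CPU notifier actions, even for CONFIG_SLAVE_CPU=n */
	#define CPU_SLAVE_UP_PREPARE	0x000b	/* hypothetical value */

	/* kernel/rcutree.c: no #ifdef needed around the case label any more */
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
	case CPU_SLAVE_UP_PREPARE:	/* never delivered when CONFIG_SLAVE_CPU=n */
		/* ... same preparation as for a regular CPU_UP_PREPARE ... */
		break;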

Thank you again,
-- 
Tomoki Sekiyama 
Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory



[tip:x86/mm] x86: Distinguish TLB shootdown interrupts from other functions call interrupts

2012-09-28 Thread tip-bot for Tomoki Sekiyama
Commit-ID:  fd0f5869724ff6195c6e7f12f8287c66a132e0ba
Gitweb: http://git.kernel.org/tip/fd0f5869724ff6195c6e7f12f8287c66a132e0ba
Author: Tomoki Sekiyama 
AuthorDate: Wed, 26 Sep 2012 11:11:28 +0900
Committer:  H. Peter Anvin 
CommitDate: Thu, 27 Sep 2012 22:52:34 -0700

x86: Distinguish TLB shootdown interrupts from other functions call interrupts

As TLB shootdown requests to other CPU cores are now using function call
interrupts, TLB shootdowns entry in /proc/interrupts is always shown as 0.

This behavior change was introduced by commit 52aec3308db8 ("x86/tlb:
replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR").

This patch reverts TLB shootdowns entry in /proc/interrupts to count TLB
shootdowns separately from the other function call interrupts.

Signed-off-by: Tomoki Sekiyama 
Link: http://lkml.kernel.org/r/20120926021128.22212.20440.stgit@hpxw
Acked-by: Alex Shi 
Signed-off-by: H. Peter Anvin 
---
 arch/x86/include/asm/hardirq.h |4 
 arch/x86/kernel/irq.c  |4 ++--
 arch/x86/mm/tlb.c  |2 ++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index d3895db..81f04ce 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,6 +18,10 @@ typedef struct {
 #ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
+   /*
+* irq_tlb_count is double-counted in irq_call_count, so it must be
+* subtracted from irq_call_count when displaying irq_call_count
+*/
unsigned int irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1f5f1d5..355b13f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "  Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
-   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+   irq_stats(j)->irq_tlb_count);
seq_printf(p, "  Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
-   sum += irq_stats(cpu)->irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a085c56..0777f04 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -98,6 +98,8 @@ static void flush_tlb_func(void *info)
 {
struct flush_tlb_info *f = info;
 
+   inc_irq_stat(irq_tlb_count);
+
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
 


Re: [PATCH] x86: Distinguish TLB shootdown interrupts from other functions call interrupts

2012-09-25 Thread Tomoki Sekiyama
Hi Alex,

On 2012/09/25 11:57, Alex Shi wrote:
> On 09/24/2012 09:37 AM, Alex Shi wrote:
>
>> On 09/20/2012 04:50 PM, Tomoki Sekiyama wrote:
>>
>>> unsigned int irq_resched_count;
>>> unsigned int irq_call_count;
>>> +   /* irq_tlb_count is double-counted in irq_call_count, so it must be
>>> +  subtracted from irq_call_count when displaying irq_call_count */
>>> unsigned int irq_tlb_count;
>>
>> Review again this patch, above comments is not kernel compatible format.
>> Could you change it like standard comment format:
>>
>> /*
>>  * xxx
>>  * 
>>  */
>>
>
> the 3.6 kernel will closed soon. it will be great to has this patch in.
> So, could you like to refresh your patch with popular comments format? :)

Fixed patch is below.
Thank you for the review again.

--
As TLB shootdown requests to other CPU cores are now using function call
interrupts, TLB shootdowns entry in /proc/interrupts is always shown as 0.

This behavior change was introduced by commit 52aec3308db8 ("x86/tlb:
replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR").

This patch reverts TLB shootdowns entry in /proc/interrupts to count TLB
shootdowns separately from the other function call interrupts.

Signed-off-by: Tomoki Sekiyama 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Alex Shi 
---
 arch/x86/include/asm/hardirq.h |4 
 arch/x86/kernel/irq.c  |4 ++--
 arch/x86/mm/tlb.c  |2 ++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index d3895db..81f04ce 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,6 +18,10 @@ typedef struct {
 #ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
+   /*
+* irq_tlb_count is double-counted in irq_call_count, so it must be
+* subtracted from irq_call_count when displaying irq_call_count
+*/
unsigned int irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d44f782..e4595f1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "  Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
-   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+   irq_stats(j)->irq_tlb_count);
seq_printf(p, "  Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
-   sum += irq_stats(cpu)->irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 613cd83..2d6d8ed 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -98,6 +98,8 @@ static void flush_tlb_func(void *info)
 {
struct flush_tlb_info *f = info;
 
+   inc_irq_stat(irq_tlb_count);
+
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
 



Re: [PATCH] x86: Distinguish TLB shootdown interrupts from other functions call interrupts

2012-09-20 Thread Tomoki Sekiyama
Hi Alex,

thank you for the review.

>> sum += irq_stats(cpu)->irq_call_count;
>> -   sum += irq_stats(cpu)->irq_tlb_count;
>> +   /* irq_tlb_count is already added to irq_call_count */
>
>redundant comments here?

>> @@ -98,6 +98,9 @@ static void flush_tlb_func(void *info)
>>  {
>>  struct flush_tlb_info *f = info;
>>
>> +/* irq_call_cnt is also incremented; be subtracted on display */
>
>If is it better to move above explanation to irq_call_cnt definition place: 
>harirq.h?

Agreed.

In the patch below, I reduced the redundant comments.

--
As TLB shootdown requests to other CPU cores are now using function call
interrupts, TLB shootdowns entry in /proc/interrupts is always shown as 0.

This behavior change was introduced by commit 52aec3308db8 ("x86/tlb:
replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR").

This patch reverts TLB shootdowns entry in /proc/interrupts to count TLB
shootdowns separately from the other function call interrupts.

Signed-off-by: Tomoki Sekiyama 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Alex Shi 
---
 arch/x86/include/asm/hardirq.h |2 ++
 arch/x86/kernel/irq.c  |4 ++--
 arch/x86/mm/tlb.c  |2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index d3895db..e34b252 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,6 +18,8 @@ typedef struct {
 #ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
+   /* irq_tlb_count is double-counted in irq_call_count, so it must be
+  subtracted from irq_call_count when displaying irq_call_count */
unsigned int irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d44f782..e4595f1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "  Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
-   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+   irq_stats(j)->irq_tlb_count);
seq_printf(p, "  Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
-   sum += irq_stats(cpu)->irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 613cd83..2d6d8ed 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -98,6 +98,8 @@ static void flush_tlb_func(void *info)
 {
struct flush_tlb_info *f = info;
 
+   inc_irq_stat(irq_tlb_count);
+
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
 



[PATCH] x86: Distinguish TLB shootdown interrupts from other functions call interrupts

2012-09-19 Thread Tomoki Sekiyama
As TLB shootdown requests to other CPU cores are now done using function call
interrupts, TLB shootdowns entry in /proc/interrupts is always shown as 0.

This behavior change was introduced by commit 52aec3308db8 ("x86/tlb:
replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR").

This patch reverts TLB shootdowns entry in /proc/interrupts to count TLB
shootdowns separately from the other function call interrupts.

Signed-off-by: Tomoki Sekiyama 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: Alex Shi 
---
 arch/x86/include/asm/hardirq.h |2 +-
 arch/x86/kernel/irq.c  |5 +++--
 arch/x86/mm/tlb.c  |3 +++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index d3895db..af60ab5 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,7 +18,7 @@ typedef struct {
 #ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
-   unsigned int irq_tlb_count;
+   unsigned int irq_tlb_count; /* double-counted in irq_call_count */
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d44f782..6dfa8b1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "  Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
-   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+   seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+   irq_stats(j)->irq_tlb_count);
seq_printf(p, "  Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -147,7 +148,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
-   sum += irq_stats(cpu)->irq_tlb_count;
+   /* irq_tlb_count is already added to irq_call_count */
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 613cd83..0a054db 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -98,6 +98,9 @@ static void flush_tlb_func(void *info)
 {
struct flush_tlb_info *f = info;
 
+   /* irq_call_cnt is also incremented; be subtracted on display */
+   inc_irq_stat(irq_tlb_count);
+
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
 



Re: [RFC v2 PATCH 00/21] KVM: x86: CPU isolation and direct interrupts delivery to guests

2012-09-10 Thread Tomoki Sekiyama
Hi Jan,

On 2012/09/07 17:26, Jan Kiszka wrote:

> On 2012-09-06 13:27, Tomoki Sekiyama wrote:
>> This RFC patch series provides facility to dedicate CPUs to KVM guests
>> and enable the guests to handle interrupts from passed-through PCI devices
>> directly (without VM exit and relay by the host).
>>
>> With this feature, we can improve throughput and response time of the device
>> and the host's CPU usage by reducing the overhead of interrupt handling.
>> This is good for applications using a very high throughput / frequent
>> interrupt device (e.g. a 10GbE NIC).
>> Real-time applications also benefit from the CPU isolation feature, which
>> reduces interference from host kernel tasks and scheduling delay.
>>
>> The overview of this patch series is presented in CloudOpen 2012.
>> The slides are available at:
>> http://events.linuxfoundation.org/images/stories/pdf/lcna_co2012_sekiyama.pdf
> 
> One question regarding your benchmarks: If you measured against standard
> KVM, were the vCPU thread running on an isolcpus core of its own as
> well? If not, your numbers about the impact of these patches on maximum,
> maybe also average latencies are likely too good. Also, using a non-RT
> host and possibly a non-prioritized vCPU thread for the standard
> scenario looks like an unfair comparison.


In the standard KVM benchmark, the vCPU thread is pinned down to its own
CPU core. In addition, the vCPU thread and irq/*-kvm threads are both set
to the max priority with SCHED_RR policy.

As you said, an RT host may result in better max latencies, as shown below.
(But min/average latencies became worse; this might be an issue with our
 setup.)
 Min / Avg / Max latencies
Normal KVM
  RT-host (3.4.4-rt14)  15us / 21us / 117us
  non RT-host (3.5.0-rc6)    6us / 11us / 152us
KVM + Direct IRQ
  non RT-host (3.5.0-rc6 +patch) 1us /  2us /  14us

Thanks,
-- 
Tomoki Sekiyama 
Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory



[RFC v2 PATCH 05/21] KVM: Enable/Disable virtualization on slave CPUs are activated/dying

2012-09-06 Thread Tomoki Sekiyama
Enable virtualization when slave CPUs are activated, and disable it when
the CPUs are dying, using the slave CPU notifier call chain.

On x86, the TSC kHz must also be initialized by tsc_khz_changed when
new slave CPUs are activated.

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/kvm/x86.c  |   16 
 virt/kvm/kvm_main.c |   30 --
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 148ed66..7501cc4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -61,6 +61,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
@@ -4782,9 +4783,15 @@ static int kvmclock_cpu_notifier(struct notifier_block 
*nfb,
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_UP:
+#endif
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
break;
case CPU_DOWN_PREPARE:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_DYING:
+#endif
smp_call_function_single(cpu, tsc_bad, NULL, 1);
break;
}
@@ -4796,12 +4803,18 @@ static struct notifier_block 
kvmclock_cpu_notifier_block = {
.priority = -INT_MAX
 };
 
+static struct notifier_block kvmclock_slave_cpu_notifier_block = {
+   .notifier_call  = kvmclock_cpu_notifier,
+   .priority = -INT_MAX
+};
+
 static void kvm_timer_init(void)
 {
int cpu;
 
max_tsc_khz = tsc_khz;
register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+   register_slave_cpu_notifier(&kvmclock_slave_cpu_notifier_block);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 #ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
@@ -4818,6 +4831,8 @@ static void kvm_timer_init(void)
pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
for_each_online_cpu(cpu)
smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
+   for_each_slave_cpu(cpu)
+   smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
 }
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4943,6 +4958,7 @@ void kvm_arch_exit(void)
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+   unregister_slave_cpu_notifier(&kvmclock_slave_cpu_notifier_block);
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d617f69..dc86e9a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -54,6 +54,9 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_X86
+#include 
+#endif
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
@@ -2336,11 +2339,17 @@ static void hardware_disable(void *junk)
 
 static void hardware_disable_all_nolock(void)
 {
+   int cpu;
+
BUG_ON(!kvm_usage_count);
 
kvm_usage_count--;
-   if (!kvm_usage_count)
+   if (!kvm_usage_count) {
on_each_cpu(hardware_disable_nolock, NULL, 1);
+   for_each_slave_cpu(cpu)
+   smp_call_function_single(cpu, hardware_disable_nolock,
+NULL, 1);
+   }
 }
 
 static void hardware_disable_all(void)
@@ -2353,6 +2362,7 @@ static void hardware_disable_all(void)
 static int hardware_enable_all(void)
 {
int r = 0;
+   int cpu;
 
raw_spin_lock(&kvm_lock);
 
@@ -2360,6 +2370,9 @@ static int hardware_enable_all(void)
if (kvm_usage_count == 1) {
atomic_set(&hardware_enable_failed, 0);
on_each_cpu(hardware_enable_nolock, NULL, 1);
+   for_each_slave_cpu(cpu)
+   smp_call_function_single(cpu, hardware_enable_nolock,
+NULL, 1);
 
if (atomic_read(&hardware_enable_failed)) {
hardware_disable_all_nolock();
@@ -2383,11 +2396,17 @@ static int kvm_cpu_hotplug(struct notifier_block 
*notifier, unsigned long val,
val &= ~CPU_TASKS_FROZEN;
switch (val) {
case CPU_DYING:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_DYING:
+#endif
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
   cpu);
hardware_disable(NULL);
break;
case CPU_STARTING:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_UP:
+#endif
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
   cpu);
hardware_enable(NULL);
@@ -2605,6 +2624,10 @@ static struct notifier_block kvm_cpu_noti

[RFC v2 PATCH 07/21] KVM: handle page faults of slave guests on online CPUs

2012-09-06 Thread Tomoki Sekiyama
Page faults caused by a guest running on a slave CPU cannot be handled
on that CPU, because the slave CPU runs in the idle process context.

With this patch, a page fault that occurs on a slave CPU is recorded in
struct kvm_access_fault and reported to an online CPU, where it is
handled once the user process for the guest is resumed.
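
Condensed, the flow is roughly the following (a sketch only; cpu_slave()
and KVM_REQ_HANDLE_PF are introduced elsewhere in this series, and the
full version is in the diff below):

/* On the slave CPU (in kvm_mmu_page_fault): record the fault and defer. */
if (cpu_slave(smp_processor_id())) {
	vcpu->arch.page_fault.cr2        = cr2;
	vcpu->arch.page_fault.error_code = error_code;
	vcpu->arch.page_fault.insn       = insn;
	vcpu->arch.page_fault.insn_len   = insn_len;
	kvm_make_request(KVM_REQ_HANDLE_PF, vcpu);
	return -EFAULT;			/* hand over to the online CPU */
}

/* Later, on an online CPU (in vcpu_enter_guest_slave): replay the fault. */
if (r == -EFAULT && kvm_check_request(KVM_REQ_HANDLE_PF, vcpu))
	r = kvm_mmu_page_fault(vcpu, vcpu->arch.page_fault.cr2,
			       vcpu->arch.page_fault.error_code,
			       vcpu->arch.page_fault.insn,
			       vcpu->arch.page_fault.insn_len);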

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |   15 +++
 arch/x86/kvm/mmu.c  |   13 +
 arch/x86/kvm/x86.c  |   10 ++
 3 files changed, 38 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 72a0a64..8dc1a0a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -67,6 +67,11 @@
 
 #define UNMAPPED_GVA (~(gpa_t)0)
 
+#ifdef CONFIG_SLAVE_CPU
+/* Requests to handle VM exit on online cpu */
+#define KVM_REQ_HANDLE_PF  32
+#endif
+
 /* KVM Hugepage definitions for x86 */
 #define KVM_NR_PAGE_SIZES  3
 #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
@@ -413,6 +418,16 @@ struct kvm_vcpu_arch {
u8 nr;
} interrupt;
 
+#ifdef CONFIG_SLAVE_CPU
+   /* used for recording page fault on offline CPU */
+   struct kvm_access_fault {
+   gva_t cr2;
+   u32 error_code;
+   void *insn;
+   int insn_len;
+   } page_fault;
+#endif
+
int halt_request; /* real mode on Intel only */
 
int cpuid_nent;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7fbd0d2..eb1d397 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3946,6 +3946,19 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, 
u32 error_code,
int r, emulation_type = EMULTYPE_RETRY;
enum emulation_result er;
 
+#ifdef CONFIG_SLAVE_CPU
+   if (cpu_slave(smp_processor_id())) {
+   /* Page fault must be handled on user-process context. */
+   r = -EFAULT;
+   vcpu->arch.page_fault.cr2 = cr2;
+   vcpu->arch.page_fault.error_code = error_code;
+   vcpu->arch.page_fault.insn = insn;
+   vcpu->arch.page_fault.insn_len = insn_len;
+   kvm_make_request(KVM_REQ_HANDLE_PF, vcpu);
+   goto out;
+   }
+#endif
+
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
goto out;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 827b681..579c41c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5561,6 +5561,16 @@ static int vcpu_enter_guest_slave(struct kvm_vcpu *vcpu,
r = arg.ret;
*apf_pending = arg.apf_pending;
 
+   if (r == -EFAULT && kvm_check_request(KVM_REQ_HANDLE_PF, vcpu)) {
+   pr_debug("handling page fault request @%p\n",
+(void *)vcpu->arch.page_fault.cr2);
+   r = kvm_mmu_page_fault(vcpu,
+  vcpu->arch.page_fault.cr2,
+  vcpu->arch.page_fault.error_code,
+  vcpu->arch.page_fault.insn,
+  vcpu->arch.page_fault.insn_len);
+   }
+
return r;
 }
 




[RFC v2 PATCH 06/21] KVM: Add facility to run guests on slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Add a path that migrates execution of vcpu_enter_guest to a slave CPU
when vcpu->arch.slave_cpu is set.

After moving to the slave CPU, execution returns to the online CPU when
the guest exits for a reason that the slave CPU cannot handle by itself
(e.g. handling async page faults).

On migration, kvm_arch_vcpu_put_migrate() is used so that no IPI is
needed to clear the loaded VMCS from the old CPU; instead, it clears the
VMCS immediately.
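
The hand-off itself boils down to running a function on the slave CPU
and waiting for its result. A minimal sketch of that pattern, assuming
only slave_cpu_call_function() from patch 02 (the real code also
saves/restores vCPU state with kvm_arch_vcpu_put_migrate() and passes
more context, e.g. the user task and async-PF state):

#include <linux/completion.h>
#include <linux/cpu.h>

struct slave_run_arg {
	int (*fn)(void *data);
	void *data;
	int ret;
	struct completion done;
};

/* Runs on the slave CPU via slave_cpu_call_function(). */
static void slave_run_trampoline(void *p)
{
	struct slave_run_arg *arg = p;

	arg->ret = arg->fn(arg->data);
	complete(&arg->done);
}

/* Called from the vCPU thread on an online CPU. */
static int run_on_slave(int slave_cpu, int (*fn)(void *data), void *data)
{
	struct slave_run_arg arg = { .fn = fn, .data = data };

	init_completion(&arg.done);
	slave_cpu_call_function(slave_cpu, slave_run_trampoline, &arg);
	wait_for_completion(&arg.done);
	return arg.ret;
}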

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |9 ++
 arch/x86/kernel/smp.c   |2 
 arch/x86/kvm/vmx.c  |   10 ++
 arch/x86/kvm/x86.c  |  189 ++-
 arch/x86/kvm/x86.h  |9 ++
 include/linux/kvm_host.h|1 
 kernel/smp.c|1 
 virt/kvm/async_pf.c |9 +-
 virt/kvm/kvm_main.c |3 -
 9 files changed, 203 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09155d6..72a0a64 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -354,6 +354,14 @@ struct kvm_vcpu_arch {
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
 
+#ifdef CONFIG_SLAVE_CPU
+   /* slave cpu dedicated to this vcpu */
+   int slave_cpu;
+#endif
+
+   /* user process tied to each vcpu */
+   struct task_struct *task;
+
/*
 * Paging state of the vcpu
 *
@@ -617,6 +625,7 @@ struct kvm_x86_ops {
void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+   void (*vcpu_put_migrate)(struct kvm_vcpu *vcpu);
 
void (*set_guest_debug)(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 48d2b7d..a58dead 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -119,7 +119,7 @@ static bool smp_no_nmi_ipi = false;
  */
 static void native_smp_send_reschedule(int cpu)
 {
-   if (unlikely(cpu_is_offline(cpu))) {
+   if (unlikely(cpu_is_offline(cpu) && !cpu_slave(cpu))) {
WARN_ON(1);
return;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c00f03d..c5db714 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1557,6 +1557,13 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
}
 }
 
+static void vmx_vcpu_put_migrate(struct kvm_vcpu *vcpu)
+{
+   vmx_vcpu_put(vcpu);
+   __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+   vcpu->cpu = -1;
+}
+
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 {
ulong cr0;
@@ -5017,7 +5024,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu 
*vcpu)
return 0;
}
 
-   if (signal_pending(current))
+   if (signal_pending(vcpu->arch.task))
goto out;
if (need_resched())
schedule();
@@ -7263,6 +7270,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.prepare_guest_switch = vmx_save_host_state,
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
+   .vcpu_put_migrate = vmx_vcpu_put_migrate,
 
.set_guest_debug = set_guest_debug,
.get_msr = vmx_get_msr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7501cc4..827b681 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -62,6 +63,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
@@ -1655,6 +1657,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, 
u64 data)
if (unlikely(!sched_info_on()))
return 1;
 
+   if (vcpu_has_slave_cpu(vcpu))
+   break;
+
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
 
@@ -2348,6 +2353,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
vcpu->arch.last_host_tsc = native_read_tsc();
 }
 
+void kvm_arch_vcpu_put_migrate(struct kvm_vcpu *vcpu)
+{
+   kvm_x86_ops->vcpu_put_migrate(vcpu);
+   kvm_put_guest_fpu(vcpu);
+   vcpu->arch.last_host_tsc = native_read_tsc();
+}
+
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
 {
@@ -5255,7 +5267,46 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
-static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+enum vcpu_enter_guest_slave_retval {
+   EXIT_TO_USER = 0,
+   LOOP_ONLINE,/* vcpu_post_run 

[RFC v2 PATCH 03/21] x86: Support hrtimer on slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Add a facility to use hrtimers on slave CPUs.

To initialize hrtimers when slave CPUs are activated, and to shut them
down when slave CPUs are stopped, this patch adds a slave CPU notifier
chain, which calls registered callbacks when slave CPUs come up, are
dying, and have died.

The registered callbacks are called with CPU_SLAVE_UP when a slave CPU
becomes active. When the slave CPU is stopped, callbacks are called with
CPU_SLAVE_DYING on the slave CPU and with CPU_SLAVE_DEAD on an online
CPU.
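
For consumers, usage mirrors a regular CPU hotplug notifier. A minimal
sketch, assuming only the interfaces added by this patch
(my_percpu_init()/my_percpu_exit() are hypothetical callbacks):

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

/* Hypothetical per-CPU setup/teardown.  CPU_SLAVE_UP and CPU_SLAVE_DYING
 * are notified from the slave CPU itself, CPU_SLAVE_DEAD from an online
 * CPU. */
static void my_percpu_init(int cpu) { }
static void my_percpu_exit(int cpu) { }

static int my_slave_notifier(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_SLAVE_UP:
		my_percpu_init(cpu);
		break;
	case CPU_SLAVE_DYING:
		my_percpu_exit(cpu);
		break;
	case CPU_SLAVE_DEAD:
		/* final cleanup on an online CPU, if needed */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_slave_nb = {
	.notifier_call = my_slave_notifier,
};

static int __init my_register(void)
{
	return register_slave_cpu_notifier(&my_slave_nb);
}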

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/cpu.h |   11 ---
 arch/x86/kernel/smpboot.c  |   37 +
 include/linux/cpu.h|   22 ++
 kernel/hrtimer.c   |   14 ++
 4 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b7ace52..4564c8e 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -30,17 +30,6 @@ extern int arch_register_cpu(int num);
 extern void arch_unregister_cpu(int);
 #endif
 
-#ifdef CONFIG_SLAVE_CPU
-#define CPU_SLAVE_UP_PREPARE   0xff00
-#define CPU_SLAVE_UP   0xff01
-#define CPU_SLAVE_DEAD 0xff02
-
-extern int slave_cpu_up(unsigned int cpu);
-extern int slave_cpu_down(unsigned int cpu);
-extern void slave_cpu_call_function(unsigned int cpu,
-   void (*f)(void *), void *arg);
-#endif
-
 DECLARE_PER_CPU(int, cpu_state);
 
 int mwait_usable(const struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b9e1297..e8cfe377 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -127,6 +127,36 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
+static void __ref remove_cpu_from_maps(int cpu);
+
+
+#ifdef CONFIG_SLAVE_CPU
+/* Notify slave cpu up and down */
+static RAW_NOTIFIER_HEAD(slave_cpu_chain);
+
+int register_slave_cpu_notifier(struct notifier_block *nb)
+{
+   return raw_notifier_chain_register(&slave_cpu_chain, nb);
+}
+EXPORT_SYMBOL(register_slave_cpu_notifier);
+
+void unregister_slave_cpu_notifier(struct notifier_block *nb)
+{
+   raw_notifier_chain_unregister(&slave_cpu_chain, nb);
+}
+EXPORT_SYMBOL(unregister_slave_cpu_notifier);
+
+static int slave_cpu_notify(unsigned long val, int cpu)
+{
+   int ret;
+
+   ret = __raw_notifier_call_chain(&slave_cpu_chain, val,
+   (void *)(long)cpu, -1, NULL);
+
+   return notifier_to_errno(ret);
+}
+#endif
+
 /*
  * Report back to the Boot Processor.
  * Running on AP.
@@ -307,6 +337,7 @@ notrace static void __cpuinit start_slave_cpu(void *unused)
 * most necessary things.
 */
cpu_init();
+   x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin(0);
 
@@ -333,10 +364,14 @@ notrace static void __cpuinit start_slave_cpu(void 
*unused)
/* to prevent fake stack check failure */
boot_init_stack_canary();
 
+   x86_cpuinit.setup_percpu_clockev();
+   tick_nohz_idle_enter();
+
/* announce slave CPU started */
pr_info("Slave CPU %d is up\n", cpu);
__this_cpu_write(cpu_state, CPU_SLAVE_UP);
set_cpu_slave(cpu, true);
+   slave_cpu_notify(CPU_SLAVE_UP, cpu);
wmb();
 
/* wait for slave_cpu_call_function or slave_cpu_down */
@@ -363,6 +398,7 @@ notrace static void __cpuinit start_slave_cpu(void *unused)
local_irq_disable();
native_cpu_disable();
set_cpu_slave(cpu, false);
+   slave_cpu_notify(CPU_SLAVE_DYING, cpu);
native_play_dead();
 }
 #endif
@@ -995,6 +1031,7 @@ int slave_cpu_down(unsigned int cpu)
return -EBUSY;
}
 
+   slave_cpu_notify(CPU_SLAVE_DEAD, cpu);
return 0;
 }
 EXPORT_SYMBOL(slave_cpu_down);
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 8395ac9..f1aa3cc 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -221,4 +221,26 @@ static inline int disable_nonboot_cpus(void) { return 0; }
 static inline void enable_nonboot_cpus(void) {}
 #endif /* !CONFIG_PM_SLEEP_SMP */
 
+#ifdef CONFIG_SLAVE_CPU
+int register_slave_cpu_notifier(struct notifier_block *nb);
+void unregister_slave_cpu_notifier(struct notifier_block *nb);
+
+/* CPU notifier constants for slave processors */
+#define CPU_SLAVE_UP_PREPARE   0xff00
+#define CPU_SLAVE_UP   0xff01
+#define CPU_SLAVE_DEAD 0xff02
+#define CPU_SLAVE_DYING0xff03
+
+extern int slave_cpu_up(unsigned int cpu);
+extern int slave_cpu_down(unsigned int cpu);
+extern void slave_cpu_call_function(unsigned int cpu,
+   void (*f)(void *), void *arg);
+#else
+static inline int register_slave_cpu_notifier(struct notifier_block *nb)
+{
+   return 0;
+}
+static inline void unregiste

[RFC v2 PATCH 02/21] x86: Add a facility to use offlined CPUs as slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Add a facility for using offlined CPUs as slave CPUs. Slave CPUs are
specialized to exclusively run functions specified by online CPUs, and
they do not run user processes.

To use this feature, build the kernel with CONFIG_SLAVE_CPU=y.

A slave CPU is launched by calling slave_cpu_up() when the CPU is
offlined. Once launched, the slave CPU waits for IPIs in the idle thread
context. Users of the slave CPU can run a specific kernel function on it
by sending an IPI with slave_cpu_call_function().

When slave_cpu_down() is called, the slave CPU goes back to the offline
state.

The cpumask `cpu_slave_mask' is provided to track which CPUs are slaves.
In addition, `cpu_online_or_slave_mask' is provided for convenience in
APIC handling, etc.
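
A minimal usage sketch of these interfaces (CPU 3 is an arbitrary
example and must already be offline; error handling is reduced to the
essentials):

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void slave_work(void *data)
{
	pr_info("running on slave CPU %d\n", smp_processor_id());
}

static int __init slave_demo(void)
{
	int cpu = 3;			/* example: an offlined CPU */
	int err;

	err = slave_cpu_up(cpu);	/* start it as a slave */
	if (err)
		return err;

	slave_cpu_call_function(cpu, slave_work, NULL);

	return slave_cpu_down(cpu);	/* back to the offline state */
}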

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/Kconfig |   10 ++
 arch/x86/include/asm/cpu.h   |   11 ++
 arch/x86/kernel/cpu/common.c |5 +
 arch/x86/kernel/smpboot.c|  190 --
 include/linux/cpumask.h  |   26 ++
 kernel/cpu.c |   37 
 kernel/smp.c |8 +-
 7 files changed, 275 insertions(+), 12 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ec3a1a..106c958 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1678,6 +1678,16 @@ config HOTPLUG_CPU
automatically on SMP systems. )
  Say N if you want to disable CPU hotplug.
 
+config SLAVE_CPU
+   bool "Support for slave CPUs (EXPERIMENTAL)"
+   depends on EXPERIMENTAL && HOTPLUG_CPU
+   ---help---
+ Say Y here to allow use some of CPUs as slave processors.
+ Slave CPUs are controlled from another CPU and do some tasks
+ and cannot run user processes. Slave processors can be
+ specified through /sys/devices/system/cpu.
+ Say N if you want to disable slave CPU support.
+
 config COMPAT_VDSO
def_bool y
prompt "Compat VDSO support"
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4564c8e..b7ace52 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -30,6 +30,17 @@ extern int arch_register_cpu(int num);
 extern void arch_unregister_cpu(int);
 #endif
 
+#ifdef CONFIG_SLAVE_CPU
+#define CPU_SLAVE_UP_PREPARE   0xff00
+#define CPU_SLAVE_UP   0xff01
+#define CPU_SLAVE_DEAD 0xff02
+
+extern int slave_cpu_up(unsigned int cpu);
+extern int slave_cpu_down(unsigned int cpu);
+extern void slave_cpu_call_function(unsigned int cpu,
+   void (*f)(void *), void *arg);
+#endif
+
 DECLARE_PER_CPU(int, cpu_state);
 
 int mwait_usable(const struct cpuinfo_x86 *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a5fbc3c..ab7f9a7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -913,7 +913,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
}
 
/* Init Machine Check Exception if available. */
-   mcheck_cpu_init(c);
+#ifdef CONFIG_SLAVE_CPU
+   if (per_cpu(cpu_state, smp_processor_id()) != CPU_SLAVE_UP_PREPARE)
+#endif
+   mcheck_cpu_init(c);
 
select_idle_routine(c);
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c5a8c3..b9e1297 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,6 +53,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include "../kernel/smpboot.h"
 
 #include 
 #include 
@@ -128,7 +131,7 @@ atomic_t init_deasserted;
  * Report back to the Boot Processor.
  * Running on AP.
  */
-static void __cpuinit smp_callin(void)
+static void __cpuinit smp_callin(int notify_starting)
 {
int cpuid, phys_id;
unsigned long timeout;
@@ -220,7 +223,8 @@ static void __cpuinit smp_callin(void)
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
 
-   notify_cpu_starting(cpuid);
+   if (notify_starting)
+   notify_cpu_starting(cpuid);
 
/*
 * Allow the master to continue.
@@ -241,7 +245,7 @@ notrace static void __cpuinit start_secondary(void *unused)
cpu_init();
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
-   smp_callin();
+   smp_callin(1);
 
 #ifdef CONFIG_X86_32
/* switch away from the initial page table */
@@ -279,6 +283,90 @@ notrace static void __cpuinit start_secondary(void *unused)
cpu_idle();
 }
 
+#ifdef CONFIG_SLAVE_CPU
+
+struct slave_cpu_func_info {
+   void (*func)(void *);
+   void *arg;
+};
+static DEFINE_PER_CPU(struct slave_cpu_func_info, slave_cpu_func);
+
+/*
+ * Activate cpu as a slave processor.
+ * The cpu is used to run specified function using smp_call_function
+ * from online processors.
+ * Note that this doesn't mark the cpu online.
+ */
+notrace static void __cpuinit 

[RFC v2 PATCH 01/21] x86: Split memory hotplug function from cpu_up() as cpu_memory_up()

2012-09-06 Thread Tomoki Sekiyama
Split the memory hotplug part of cpu_up() into cpu_memory_up(), which
will be used to assign a memory area to off-lined CPUs in a following
patch of this series.
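
A sketch of the intended call order in the later patches (slave_cpu_up()
and prepare_slave_cpu() are introduced or hypothetical there; this patch
only factors out cpu_memory_up()):

#include <linux/cpu.h>

static int prepare_slave_cpu(unsigned int cpu)
{
	int err;

	/* Bring the CPU's node memory online first ... */
	err = cpu_memory_up(cpu);
	if (err)
		return err;

	/* ... then start the offlined CPU as a slave (later patch). */
	return slave_cpu_up(cpu);
}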

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 include/linux/cpu.h |9 +
 kernel/cpu.c|   46 +++---
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index ce7a074..8395ac9 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -146,6 +146,15 @@ void notify_cpu_starting(unsigned int cpu);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+extern int cpu_memory_up(unsigned int cpu);
+#else
+static inline int cpu_memory_up(unsigned int cpu)
+{
+   return 0;
+}
+#endif
+
 #else  /* CONFIG_SMP */
 
 #define cpu_notifier(fn, pri)  do { (void)(fn); } while (0)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14d3258..5df8f36 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -384,11 +384,6 @@ int __cpuinit cpu_up(unsigned int cpu)
 {
int err = 0;
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-   int nid;
-   pg_data_t   *pgdat;
-#endif
-
if (!cpu_possible(cpu)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
@@ -399,7 +394,32 @@ int __cpuinit cpu_up(unsigned int cpu)
return -EINVAL;
}
 
+   err = cpu_memory_up(cpu);
+   if (err)
+   return err;
+
+   cpu_maps_update_begin();
+
+   if (cpu_hotplug_disabled) {
+   err = -EBUSY;
+   goto out;
+   }
+
+   err = _cpu_up(cpu, 0);
+
+out:
+   cpu_maps_update_done();
+   return err;
+}
+EXPORT_SYMBOL_GPL(cpu_up);
+
 #ifdef CONFIG_MEMORY_HOTPLUG
+int __cpuinit cpu_memory_up(unsigned int cpu)
+{
+   int err;
+   int nid;
+   pg_data_t   *pgdat;
+
nid = cpu_to_node(cpu);
if (!node_online(nid)) {
err = mem_online_node(nid);
@@ -419,22 +439,10 @@ int __cpuinit cpu_up(unsigned int cpu)
build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
-#endif
 
-   cpu_maps_update_begin();
-
-   if (cpu_hotplug_disabled) {
-   err = -EBUSY;
-   goto out;
-   }
-
-   err = _cpu_up(cpu, 0);
-
-out:
-   cpu_maps_update_done();
-   return err;
+   return 0;
 }
-EXPORT_SYMBOL_GPL(cpu_up);
+#endif
 
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;




[RFC v2 PATCH 10/21] KVM: proxy slab operations for slave CPUs on online CPUs

2012-09-06 Thread Tomoki Sekiyama
Add fix-ups that proxy slab operations on online CPUs for the guest, in
order to avoid touching the slab on slave CPUs, where some slab
functions are not active.

Currently, the slab may be touched on slave CPUs in the following 3
cases. For each case, the fix-up below is introduced:

* kvm_mmu_commit_zap_page
With this patch, instead of committing zapped pages, the pages are added
to the invalid_mmu_pages list and a KVM_REQ_COMMIT_ZAP_PAGE request is
made. The pages are then freed on online CPUs after execution of the
vCPU thread is resumed.

* mmu_topup_memory_caches
Preallocate caches for mmu operations in vcpu_enter_guest_slave, which
is done by online CPUs before entering guests.

* kvm_async_pf_wakeup_all
If this function is called on a slave CPU, it makes a KVM_REQ_WAKEUP_APF
request instead (see the sketch below). The request is handled by
calling kvm_async_pf_wakeup_all() on an online CPU.
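
The same request-deferral shape is used in all three cases. A sketch for
the kvm_async_pf_wakeup_all case, assuming cpu_slave() and
KVM_REQ_WAKEUP_APF from this series (the real patch folds the check into
the existing function rather than adding a wrapper):

static int async_pf_wakeup_all_or_defer(struct kvm_vcpu *vcpu)
{
	/* On a slave CPU, only record the request; no slab is touched. */
	if (cpu_slave(raw_smp_processor_id())) {
		kvm_make_request(KVM_REQ_WAKEUP_APF, vcpu);
		return 0;
	}

	/* On an online CPU, the normal (slab-using) path is safe. */
	return kvm_async_pf_wakeup_all(vcpu);
}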

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |5 
 arch/x86/kvm/mmu.c  |   52 ---
 arch/x86/kvm/mmu.h  |4 +++
 arch/x86/kvm/x86.c  |   15 +++
 virt/kvm/async_pf.c |8 ++
 5 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af68ffb..5ce89f1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -70,6 +70,8 @@
 #ifdef CONFIG_SLAVE_CPU
 /* Requests to handle VM exit on online cpu */
 #define KVM_REQ_HANDLE_PF  32
+#define KVM_REQ_COMMIT_ZAP_PAGE33
+#define KVM_REQ_WAKEUP_APF 34
 #endif
 
 /* KVM Hugepage definitions for x86 */
@@ -542,6 +544,9 @@ struct kvm_arch {
 * Hash table of struct kvm_mmu_page.
 */
struct list_head active_mmu_pages;
+#ifdef CONFIG_SLAVE_CPU
+   struct list_head invalid_mmu_pages;
+#endif
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
int iommu_flags;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eb1d397..871483a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -612,6 +612,10 @@ static int mmu_topup_memory_cache(struct 
kvm_mmu_memory_cache *cache,
 
if (cache->nobjs >= min)
return 0;
+#ifdef CONFIG_SLAVE_CPU
+   if (cpu_slave(raw_smp_processor_id()))
+   return -ENOMEM;
+#endif
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
if (!obj)
@@ -655,7 +659,7 @@ static void mmu_free_memory_cache_page(struct 
kvm_mmu_memory_cache *mc)
free_page((unsigned long)mc->objects[--mc->nobjs]);
 }
 
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 {
int r;
 
@@ -1617,7 +1621,7 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, 
struct kvm_mmu_page *sp)
 
 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
struct list_head *invalid_list);
-static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
struct list_head *invalid_list);
 
 #define for_each_gfn_sp(kvm, sp, gfn, pos) \
@@ -1660,7 +1664,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 
ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
if (ret)
-   kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   kvm_mmu_commit_zap_page(vcpu->kvm, vcpu, &invalid_list);
 
return ret;
 }
@@ -1700,7 +1704,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t 
gfn)
flush = true;
}
 
-   kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   kvm_mmu_commit_zap_page(vcpu->kvm, vcpu, &invalid_list);
if (flush)
kvm_mmu_flush_tlb(vcpu);
 }
@@ -1787,7 +1791,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
-   kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+   kvm_mmu_commit_zap_page(vcpu->kvm, vcpu, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -2064,7 +2068,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
return ret;
 }
 
-static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
struct list_head *invalid_list)
 {
struct kvm_mmu_page *sp;
@@ -2078,6 +2082,16 @@ static void kvm_mmu_commit_zap_page(s

[RFC v2 PATCH 18/21] KVM: route assigned devices' MSI/MSI-X directly to guests on slave CPUs

2012-09-06 Thread Tomoki Sekiyama
When a PCI device is assigned to a guest running on slave CPUs, this
routes the device's MSI/MSI-X interrupts directly to the guest.

Because the guest uses interrupt vectors different from the host's,
vector remapping is required. This is safe because slave CPUs only
handle interrupts for the assigned guest.

The slave CPU may receive interrupts for the guest while the guest is
not running. In that case, the host IRQ handler is invoked and the
interrupt is transferred as a vIRQ.

If the guest receives a direct interrupt from the device, an EOI to the
physical APIC is required. To handle this, if the guest issues an EOI
while there are no in-service interrupts in the virtual APIC, a physical
EOI is issued.
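
Condensed, the EOI rule from the last paragraph looks like this (a
sketch only: guest_eoi() is a hypothetical name, apic_find_highest_isr()
is an internal lapic.c helper, and the real apic_set_eoi() change is
more involved):

static void guest_eoi(struct kvm_lapic *apic)
{
	/* Highest vector currently in service in the *virtual* APIC. */
	int vector = apic_find_highest_isr(apic);

	if (vector == -1) {
		/*
		 * Nothing in service virtually, so the interrupt must have
		 * been delivered directly by hardware: EOI the physical APIC.
		 */
		ack_APIC_irq();
		return;
	}

	/* Otherwise take the normal virtual-APIC EOI path. */
}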

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |   19 +
 arch/x86/kvm/irq.c  |  136 +++
 arch/x86/kvm/lapic.c|6 +-
 arch/x86/kvm/x86.c  |   12 +++
 virt/kvm/assigned-dev.c |8 ++
 5 files changed, 179 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 624e5ad..f43680e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1033,4 +1033,23 @@ void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
 
 int kvm_arch_vcpu_run_prevented(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_SLAVE_CPU
+void kvm_get_slave_cpu_mask(struct kvm *kvm, struct cpumask *mask);
+
+struct kvm_assigned_dev_kernel;
+extern void assign_slave_msi(struct kvm *kvm,
+struct kvm_assigned_dev_kernel *assigned_dev);
+extern void deassign_slave_msi(struct kvm *kvm,
+  struct kvm_assigned_dev_kernel *assigned_dev);
+extern void assign_slave_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev);
+extern void deassign_slave_msix(struct kvm *kvm,
+   struct kvm_assigned_dev_kernel *assigned_dev);
+#else
+#define assign_slave_msi(kvm, assigned_dev)
+#define deassign_slave_msi(kvm, assigned_dev)
+#define assign_slave_msix(kvm, assigned_dev)
+#define deassign_slave_msix(kvm, assigned_dev)
+#endif
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 7e06ba1..128431a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -22,6 +22,8 @@
 
 #include 
 #include 
+#include 
+#include 
 
 #include "irq.h"
 #include "i8254.h"
@@ -94,3 +96,137 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
 }
+
+
+#ifdef CONFIG_SLAVE_CPU
+
+static int kvm_lookup_msi_routing_entry(struct kvm *kvm, int irq)
+{
+   int vec = -1;
+   struct kvm_irq_routing_table *irq_rt;
+   struct kvm_kernel_irq_routing_entry *e;
+   struct hlist_node *n;
+
+   rcu_read_lock();
+   irq_rt = rcu_dereference(kvm->irq_routing);
+   if (irq < irq_rt->nr_rt_entries)
+   hlist_for_each_entry(e, n, &irq_rt->map[irq], link)
+   if (e->type == KVM_IRQ_ROUTING_MSI)
+   vec = (e->msi.data & MSI_DATA_VECTOR_MASK)
+   >> MSI_DATA_VECTOR_SHIFT;
+   rcu_read_unlock();
+
+   return vec;
+}
+
+void assign_slave_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+   int irq = assigned_dev->guest_irq;
+   int host_irq = assigned_dev->host_irq;
+   struct irq_data *data = irq_get_irq_data(host_irq);
+   int vec = kvm_lookup_msi_routing_entry(kvm, irq);
+   cpumask_var_t slave_mask;
+   char buffer[16];
+
+   BUG_ON(!data);
+
+   if (!zalloc_cpumask_var(&slave_mask, GFP_KERNEL)) {
+   pr_err("assign slave MSI failed: no memory\n");
+   return;
+   }
+   kvm_get_slave_cpu_mask(kvm, slave_mask);
+
+   bitmap_scnprintf(buffer, sizeof(buffer), cpu_slave_mask->bits, 32);
+   pr_info("assigned_device slave msi: irq:%d host:%d vec:%d mask:%s\n",
+   irq, host_irq, vec, buffer);
+
+   remap_slave_vector_irq(host_irq, vec, slave_mask);
+   data->chip->irq_set_affinity(data, slave_mask, 1);
+
+   free_cpumask_var(slave_mask);
+}
+
+void deassign_slave_msi(struct kvm *kvm,
+   struct kvm_assigned_dev_kernel *assigned_dev)
+{
+   int host_irq = assigned_dev->host_irq;
+   cpumask_var_t slave_mask;
+   char buffer[16];
+
+   if (!zalloc_cpumask_var(&slave_mask, GFP_KERNEL)) {
+   pr_err("deassign slave MSI failed: no memory\n");
+   return;
+   }
+   kvm_get_slave_cpu_mask(kvm, slave_mask);
+
+   bitmap_scnprintf(buffer, sizeof(buffer), cpu_slave_mask->

[RFC v2 PATCH 21/21] x86: request TLB flush to slave CPU using NMI

2012-09-06 Thread Tomoki Sekiyama
For slave CPUs, it is inappropriate to request a TLB flush using an IPI,
because the IPI may be delivered to a KVM guest when the slave CPU is
running the guest with direct interrupt routing.

Instead, a TLB flush request is registered in a per-CPU bitmask and an
NMI is sent to interrupt execution of the guest. The NMI handler then
checks the pending requests and handles them.

This implementation has scalability issues and is just a proof of
concept.
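
The requesting side is cut off in this archive; reconstructed as a
sketch (an assumption, not the patch's exact code), using the
flush_tlb_info fields, nr_slave_tlbf, fti_list and fti_list_lock that
this patch adds, with the NMI raised via apic->send_IPI_mask():

static void request_slave_tlb_flush(struct flush_tlb_info *info,
				    const struct cpumask *slaves)
{
	int cpu;

	cpumask_copy(info->mask, slaves);
	for_each_cpu(cpu, slaves)
		atomic_inc(&per_cpu(nr_slave_tlbf, cpu));

	/* Publish the request where the slave NMI handler can find it. */
	write_lock(&fti_list_lock);
	list_add(&info->list, &fti_list);
	write_unlock(&fti_list_lock);

	apic->send_IPI_mask(slaves, NMI_VECTOR);	/* kick the slaves */

	/* Each targeted slave clears its bit in flush_tlb_func(). */
	while (!cpumask_empty(info->mask))
		cpu_relax();

	write_lock(&fti_list_lock);
	list_del(&info->list);
	write_unlock(&fti_list_lock);

	for_each_cpu(cpu, slaves)
		atomic_dec(&per_cpu(nr_slave_tlbf, cpu));
}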

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/tlbflush.h |5 ++
 arch/x86/kernel/smpboot.c   |3 +
 arch/x86/kvm/x86.c  |5 ++
 arch/x86/mm/tlb.c   |   94 +++
 4 files changed, 106 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 74a4433..bcd637b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -168,6 +168,11 @@ static inline void reset_lazy_tlbstate(void)
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
 }
 
+#ifdef CONFIG_SLAVE_CPU
+DECLARE_PER_CPU(bool, slave_idle);
+void handle_slave_tlb_flush(unsigned int cpu);
+#endif /* SLAVE_CPU */
+
 #endif /* SMP */
 
 #ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ba7c99b..9854087 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -395,7 +395,10 @@ notrace static void __cpuinit start_slave_cpu(void *unused)
rcu_note_context_switch(cpu);
 
if (!f.func) {
+   __this_cpu_write(slave_idle, 1);
+   handle_slave_tlb_flush(cpu);
native_safe_halt();
+   __this_cpu_write(slave_idle, 0);
continue;
}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9d92581..d3ee570 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -65,6 +65,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
@@ -5529,6 +5530,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct 
task_struct *task)
 
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
+   handle_slave_tlb_flush(vcpu->cpu);
+
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
 
@@ -5631,6 +5634,8 @@ static void __vcpu_enter_guest_slave(void *_arg)
 
r = vcpu_enter_guest(vcpu, arg->task);
 
+   handle_slave_tlb_flush(cpu);
+
if (r <= 0)
break;
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 613cd83..54f1c1b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -35,6 +36,10 @@ struct flush_tlb_info {
struct mm_struct *flush_mm;
unsigned long flush_start;
unsigned long flush_end;
+#ifdef CONFIG_SLAVE_CPU
+   cpumask_var_t mask;
+   struct list_head list;
+#endif
 };
 
 /*
@@ -97,6 +102,7 @@ EXPORT_SYMBOL_GPL(leave_mm);
 static void flush_tlb_func(void *info)
 {
struct flush_tlb_info *f = info;
+   int cpu = smp_processor_id();
 
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
@@ -115,9 +121,94 @@ static void flush_tlb_func(void *info)
}
}
} else
-   leave_mm(smp_processor_id());
+   leave_mm(cpu);
+
+#ifdef CONFIG_SLAVE_CPU
+   if (cpu_slave(cpu))
+   cpumask_test_and_clear_cpu(cpu, f->mask);
+#endif
+}
+
+#ifdef CONFIG_SLAVE_CPU
+static DEFINE_PER_CPU(atomic_t, nr_slave_tlbf);
+DEFINE_PER_CPU(bool, slave_idle);
+static LIST_HEAD(fti_list);
+static DEFINE_RWLOCK(fti_list_lock);
+
+static int slave_tlb_flush_nmi(unsigned int val, struct pt_regs *regs)
+{
+   int cpu = smp_processor_id();
+
+   if (!cpu_slave(cpu) || !atomic_read(&__get_cpu_var(nr_slave_tlbf)))
+   return NMI_DONE;
+   if (this_cpu_read(slave_idle))
+   handle_slave_tlb_flush(cpu);
+   return NMI_HANDLED;
+}
+
+static int __cpuinit register_slave_tlb_flush_nmi(void)
+{
+   register_nmi_handler(NMI_LOCAL, slave_tlb_flush_nmi,
+NMI_FLAG_FIRST, "slave_tlb_flush");
+   return 0;
+}
+late_initcall(register_slave_tlb_flush_nmi);
+
+void handle_slave_tlb_flush(unsigned int cpu)
+{
+   struct flush_tlb_info *info;
 
+   if (!cpu_slave(cpu) ||
+   !atomic_read(&__get_cpu_var(nr_slave_tlbf)))
+   return;
+
+   read_lock(&fti_list_lock);
+   list_for_each_entry(info, &fti_list, list) {
+   if (cpumask_test_cpu(cpu, info->mask)) {
+   flush_tlb_func(info);
+

[RFC v2 PATCH 19/21] KVM: Enable direct EOI for directly routed interrupts to guests

2012-09-06 Thread Tomoki Sekiyama
Enable direct guest access to the x2APIC EOI MSR to accelerate guests.
This speeds up the handling of interrupts delivered directly to the
guest from passed-through PCI devices. When a virtual IRQ is injected,
this feature is disabled so that the following EOI is routed to the
virtual APIC; it is enabled again after every virtual IRQ has been
handled.
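
A sketch of the toggling rule described above, assuming the
set_direct_interrupt() hook added earlier in this series (the function
names here are illustrative; in the patch the disable happens at the
injection sites in vmx.c and the re-enable in apic_set_eoi()):

/* Before injecting a virtual IRQ: make the guest's next EOI trap to KVM. */
static void before_virq_injection(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->set_direct_interrupt(vcpu, false);
}

/* Once every virtual IRQ has been handled: allow direct EOI again. */
static void after_virq_handled(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->set_direct_interrupt(vcpu, true);
}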

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/kvm/vmx.c |   69 ++--
 1 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 39a4cb4..f93e08c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -636,6 +636,10 @@ static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
+#ifdef CONFIG_SLAVE_CPU
+static unsigned long *vmx_msr_bitmap_slave_legacy;
+static unsigned long *vmx_msr_bitmap_slave_longmode;
+#endif
 
 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -912,6 +916,11 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12,
u32 reason, unsigned long qualification);
 
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only);
+#ifdef CONFIG_SLAVE_CPU
+static void vmx_disable_intercept_for_msr_slave(u32 msr, bool longmode_only);
+#endif
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
int i;
@@ -1716,13 +1725,28 @@ static void vmx_set_direct_interrupt(struct kvm_vcpu 
*vcpu, bool enabled)
 #ifdef CONFIG_SLAVE_CPU
void *msr_bitmap;
 
-   if (enabled)
+   if (enabled) {
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_EXT_INTR_MASK);
-   else
+
+   if (cpu_has_vmx_msr_bitmap()) {
+   msr_bitmap = is_long_mode(vcpu) ?
+   vmx_msr_bitmap_slave_longmode :
+   vmx_msr_bitmap_slave_legacy;
+   vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+   }
+   } else {
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
  PIN_BASED_EXT_INTR_MASK);
 
+   if (cpu_has_vmx_msr_bitmap()) {
+   msr_bitmap = is_long_mode(vcpu) ?
+   vmx_msr_bitmap_longmode :
+   vmx_msr_bitmap_legacy;
+   vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+   }
+   }
+
trace_kvm_set_direct_interrupt(vcpu, enabled);
 #endif
 }
@@ -3771,6 +3795,16 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool 
longmode_only)
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
 }
 
+#ifdef CONFIG_SLAVE_CPU
+static void vmx_disable_intercept_for_msr_slave(u32 msr, bool longmode_only)
+{
+   if (!longmode_only)
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_slave_legacy,
+  msr);
+   __vmx_disable_intercept_for_msr(vmx_msr_bitmap_slave_longmode, msr);
+}
+#endif
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -7474,6 +7508,22 @@ static int __init vmx_init(void)
goto out2;
 
 
+#ifdef CONFIG_SLAVE_CPU
+   vmx_msr_bitmap_slave_legacy =
+   (unsigned long *)__get_free_page(GFP_KERNEL);
+   if (!vmx_msr_bitmap_slave_legacy) {
+   r = -ENOMEM;
+   goto out1s;
+   }
+
+   vmx_msr_bitmap_slave_longmode =
+   (unsigned long *)__get_free_page(GFP_KERNEL);
+   if (!vmx_msr_bitmap_slave_longmode) {
+   r = -ENOMEM;
+   goto out2s;
+   }
+#endif
+
/*
 * Allow direct access to the PC debug port (it is often used for I/O
 * delays, but the vmexits simply slow things down).
@@ -7500,6 +7550,15 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
+#ifdef CONFIG_SLAVE_CPU
+   memcpy(vmx_msr_bitmap_slave_legacy,
+  vmx_msr_bitmap_legacy, PAGE_SIZE);
+   memcpy(vmx_msr_bitmap_slave_longmode,
+  vmx_msr_bitmap_longmode, PAGE_SIZE);
+   vmx_disable_intercept_for_msr_slave(
+   APIC_BASE_MSR + (APIC_EOI >> 4), false);
+#endif
+
if (enable_ept) {
kvm_mmu_set_mask_ptes(0ull,
(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
@@ -7513,6 +7572,12 @@ static int __init vmx_init(void)
return 0;
 
 out3:
+#ifdef CONFIG_SLAVE_CPU
+   free_page((unsigned long)vmx_msr_bitmap_slave_longmode);
+out2s:
+

[RFC v2 PATCH 16/21] KVM: vmx: Add definitions PIN_BASED_PREEMPTION_TIMER

2012-09-06 Thread Tomoki Sekiyama
Add definitions needed to use PIN_BASED_PREEMPTION_TIMER.

When PIN_BASED_PREEMPTION_TIMER is enabled, the guest will exit with
reason=EXIT_REASON_PREEMPTION_TIMER when the counter specified in
VMX_PREEMPTION_TIMER_VALUE reaches 0.
This patch also adds a dummy handler for EXIT_REASON_PREEMPTION_TIMER,
which simply resumes VM execution.

These definitions are currently intended only to be used to avoid
entering the guest on a slave CPU when vmx_prevent_run(vcpu, 1) is
called.
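
A sketch of the intended use (the actual vmx_prevent_run() arrives in a
later patch, so whether it uses exactly this sequence is an assumption):
arming the preemption timer with a zero count makes the very next VM
entry exit immediately with EXIT_REASON_PREEMPTION_TIMER.

static void vmx_force_immediate_exit(bool enable)
{
	if (enable) {
		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
			      PIN_BASED_PREEMPTION_TIMER);
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
	} else {
		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
				PIN_BASED_PREEMPTION_TIMER);
	}
}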

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/vmx.h |3 +++
 arch/x86/kvm/trace.h   |1 +
 arch/x86/kvm/vmx.c |7 +++
 3 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 74fcb96..6899aaa 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,6 +66,7 @@
 #define PIN_BASED_EXT_INTR_MASK 0x0001
 #define PIN_BASED_NMI_EXITING   0x0008
 #define PIN_BASED_VIRTUAL_NMIS  0x0020
+#define PIN_BASED_PREEMPTION_TIMER  0x0040
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x0002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE0x0200
@@ -196,6 +197,7 @@ enum vmcs_field {
GUEST_INTERRUPTIBILITY_INFO = 0x4824,
GUEST_ACTIVITY_STATE= 0X4826,
GUEST_SYSENTER_CS   = 0x482A,
+   VMX_PREEMPTION_TIMER_VALUE  = 0x482E,
HOST_IA32_SYSENTER_CS   = 0x4c00,
CR0_GUEST_HOST_MASK = 0x6000,
CR4_GUEST_HOST_MASK = 0x6002,
@@ -280,6 +282,7 @@ enum vmcs_field {
 #define EXIT_REASON_APIC_ACCESS 44
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_PREEMPTION_TIMER   52
 #define EXIT_REASON_WBINVD 54
 #define EXIT_REASON_XSETBV 55
 #define EXIT_REASON_INVPCID58
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6081be7..fc350f3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -218,6 +218,7 @@ TRACE_EVENT(kvm_apic,
{ EXIT_REASON_APIC_ACCESS,  "APIC_ACCESS" }, \
{ EXIT_REASON_EPT_VIOLATION,"EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG,"EPT_MISCONFIG" }, \
+   { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
{ EXIT_REASON_WBINVD,   "WBINVD" }
 
 #define SVM_EXIT_REASONS \
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6dc59c8..2130cbd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4456,6 +4456,12 @@ static int handle_external_interrupt(struct kvm_vcpu 
*vcpu)
return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+   /* Nothing */
+   return 1;
+}
+
 static int handle_triple_fault(struct kvm_vcpu *vcpu)
 {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
@@ -5768,6 +5774,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu 
*vcpu) = {
[EXIT_REASON_VMON]= handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+   [EXIT_REASON_PREEMPTION_TIMER]= handle_preemption_timer,
[EXIT_REASON_WBINVD]  = handle_wbinvd,
[EXIT_REASON_XSETBV]  = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,




[RFC v2 PATCH 04/21] x86: Avoid RCU warnings on slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Initialize RCU-related variables to avoid warnings about RCU usage while
slave CPUs are running specified functions. Also notify the RCU
subsystem before the slave CPU enters the idle state.

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/kernel/smpboot.c |4 
 kernel/rcutree.c  |   14 ++
 2 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e8cfe377..45dfc1d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -382,6 +382,8 @@ notrace static void __cpuinit start_slave_cpu(void *unused)
f = per_cpu(slave_cpu_func, cpu);
per_cpu(slave_cpu_func, cpu).func = NULL;
 
+   rcu_note_context_switch(cpu);
+
if (!f.func) {
native_safe_halt();
continue;
@@ -1005,6 +1007,8 @@ int __cpuinit slave_cpu_up(unsigned int cpu)
if (IS_ERR(idle))
return PTR_ERR(idle);
 
+   slave_cpu_notify(CPU_SLAVE_UP_PREPARE, cpu);
+
ret = __native_cpu_up(cpu, idle, 1);
 
cpu_maps_update_done();
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e54..31a7c8c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -2589,6 +2589,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block 
*self,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_UP_PREPARE:
+#endif
rcu_prepare_cpu(cpu);
rcu_prepare_kthreads(cpu);
break;
@@ -2603,6 +2606,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block 
*self,
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_DYING:
+#endif
/*
 * The whole machine is "stopped" except this CPU, so we can
 * touch any data without introducing corruption. We send the
@@ -2616,6 +2622,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block 
*self,
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_DEAD:
+#endif
for_each_rcu_flavor(rsp)
rcu_cleanup_dead_cpu(cpu, rsp);
break;
@@ -2797,6 +2806,10 @@ static void __init rcu_init_geometry(void)
rcu_num_nodes -= n;
 }
 
+static struct notifier_block __cpuinitdata rcu_slave_nb = {
+   .notifier_call = rcu_cpu_notify,
+};
+
 void __init rcu_init(void)
 {
int cpu;
@@ -2814,6 +2827,7 @@ void __init rcu_init(void)
 * or the scheduler are operational.
 */
cpu_notifier(rcu_cpu_notify, 0);
+   register_slave_cpu_notifier(&rcu_slave_nb);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
check_cpu_stall_init();




[RFC v2 PATCH 11/21] KVM: no exiting from guest when slave CPU halted

2012-09-06 Thread Tomoki Sekiyama
Avoid exiting from a guest on a slave CPU even when the HLT instruction
is executed. Since the slave CPU is dedicated to a vCPU, exiting on HLT
is not required, and avoiding the VM exit improves the guest's
performance.

This is a partial revert of

10166744b80a ("KVM: VMX: remove yield_on_hlt")

Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/kvm/vmx.c |   25 -
 1 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d99bee6..03a2d02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1698,9 +1698,29 @@ static void skip_emulated_instruction(struct kvm_vcpu 
*vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static inline void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_SLAVE_CPU
+   /* Ensure that we clear the HLT state in the VMCS.  We don't need to
+* explicitly skip the instruction because if the HLT state is set,
+* then the instruction is already executing and RIP has already been
+* advanced. */
+   if (vcpu->arch.slave_cpu >= 0 &&
+   vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+   vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+#endif
+}
+
 static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, bool slave)
 {
-   /* Nothing */
+   /* Don't intercept the guest's halt on slave CPU */
+   if (slave) {
+   vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+   CPU_BASED_HLT_EXITING);
+   } else {
+   vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_HLT_EXITING);
+   }
 }
 
 /*
@@ -1755,6 +1775,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, 
unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+   vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -4125,6 +4146,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+   vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4156,6 +4178,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+   vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)




[RFC v2 PATCH 12/21] x86/apic: Enable external interrupt routing to slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Enable the APIC to handle interrupts on slave CPUs, and enable interrupt
routing to slave CPUs by setting IRQ affinity.

Since slave CPUs that run a KVM guest handle external interrupts
directly in the vCPU, the guest's vector/IRQ mapping differs from the
host's. That requires each interrupt to be routed to either online CPUs
or slave CPUs, but not both.

In this patch, if the specified affinity contains online CPUs, the
affinity is applied to the online CPUs only. If every specified CPU is a
slave, the IRQ is routed to slave CPUs.
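
The routing policy above reduces to a mask restriction like the
following sketch (cpu_slave_mask comes from the CONFIG_SLAVE_CPU
patches; the real changes apply this inside the vector-allocation and
affinity paths shown in the diff below):

#include <linux/cpumask.h>

static void restrict_irq_affinity(const struct cpumask *requested,
				  struct cpumask *effective)
{
	/* Prefer online CPUs if the requested mask contains any ... */
	cpumask_and(effective, requested, cpu_online_mask);

	/* ... and fall back to slave CPUs only when it does not. */
	if (cpumask_empty(effective))
		cpumask_and(effective, requested, cpu_slave_mask);
}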

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/apic.h   |6 ++---
 arch/x86/kernel/apic/io_apic.c|   43 -
 arch/x86/kernel/apic/x2apic_cluster.c |8 +++---
 drivers/iommu/intel_irq_remapping.c   |   30 +++
 kernel/irq/manage.c   |4 ++-
 kernel/irq/migration.c|2 +-
 kernel/irq/proc.c |2 +-
 7 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f342612..d37ae5c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -535,7 +535,7 @@ extern void generic_bigsmp_probe(void);
 static inline const struct cpumask *default_target_cpus(void)
 {
 #ifdef CONFIG_SMP
-   return cpu_online_mask;
+   return cpu_online_or_slave_mask;
 #else
return cpumask_of(0);
 #endif
@@ -543,7 +543,7 @@ static inline const struct cpumask 
*default_target_cpus(void)
 
 static inline const struct cpumask *online_target_cpus(void)
 {
-   return cpu_online_mask;
+   return cpu_online_or_slave_mask;
 }
 
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -602,7 +602,7 @@ flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 {
unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
 cpumask_bits(andmask)[0] &
-cpumask_bits(cpu_online_mask)[0] &
+cpumask_bits(cpu_online_or_slave_mask)[0] &
 APIC_ALL_CPUS;
 
if (likely(cpu_mask)) {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c265593..0cd2682 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1125,7 +1125,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const 
struct cpumask *mask)
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
cpumask_clear(cfg->old_domain);
-   cpu = cpumask_first_and(mask, cpu_online_mask);
+   cpu = cpumask_first_and(mask, cpu_online_or_slave_mask);
while (cpu < nr_cpu_ids) {
int new_cpu, vector, offset;
 
@@ -1158,14 +1158,14 @@ next:
if (unlikely(current_vector == vector)) {
cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
cpumask_andnot(tmp_mask, mask, cfg->old_domain);
-   cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+   cpu = cpumask_first_and(tmp_mask, 
cpu_online_or_slave_mask);
continue;
}
 
if (test_bit(vector, used_vectors))
goto next;
 
-   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_or_slave_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
/* Found one! */
@@ -1175,7 +1175,7 @@ next:
cfg->move_in_progress = 1;
cpumask_copy(cfg->old_domain, cfg->domain);
}
-   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_or_slave_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
cpumask_copy(cfg->domain, tmp_mask);
@@ -1204,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg 
*cfg)
BUG_ON(!cfg->vector);
 
vector = cfg->vector;
-   for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+   for_each_cpu_and(cpu, cfg->domain, cpu_online_or_slave_mask)
per_cpu(vector_irq, cpu)[vector] = -1;
 
cfg->vector = 0;
@@ -1212,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg 
*cfg)
 
if (likely(!cfg->move_in_progress))
return;
-   for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+   for_each_cpu_and(cpu, cfg->old_domain, cpu_online_or_slave_

[RFC v2 PATCH 14/21] KVM: Directly handle interrupts by guests without VM EXIT on slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Make interrupts on slave CPUs be handled by guests without a VM exit.
This reduces the host CPU usage needed to transfer interrupts of
assigned PCI devices from the host to guests. It also removes the cost
of the VM exit and quickens the guests' response to the interrupts.

When a slave CPU is dedicated to a vCPU, exiting on external interrupts
is disabled. Unfortunately, exits can only be enabled or disabled for
external interrupts as a whole (except NMIs); they cannot be switched
per IRQ or vector. Thus, to avoid IPIs from online CPUs being delivered
to guests, this patch modifies kvm_vcpu_kick() to use an NMI for guests
on slave CPUs.
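
Condensed, the kick rule is the following (a sketch, not the full
kvm_vcpu_kick(); vcpu_has_slave_cpu() is added by this series and the
NMI is sent with a plain APIC IPI):

static void kick_vcpu_on_cpu(struct kvm_vcpu *vcpu, int cpu)
{
	if (vcpu_has_slave_cpu(vcpu))
		/* External-interrupt exiting is off: only an NMI gets through. */
		apic->send_IPI_mask(cpumask_of(cpu), NMI_VECTOR);
	else
		smp_send_reschedule(cpu);
}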

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/lapic.c|5 +
 arch/x86/kvm/vmx.c  |   19 ++
 arch/x86/kvm/x86.c  |   41 +++
 include/linux/kvm_host.h|1 +
 virt/kvm/kvm_main.c |5 +++--
 6 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5ce89f1..65242a6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -725,6 +725,7 @@ struct kvm_x86_ops {
   struct x86_instruction_info *info,
   enum x86_intercept_stage stage);
 
+   void (*set_direct_interrupt)(struct kvm_vcpu *vcpu, bool enabled);
void (*set_slave_mode)(struct kvm_vcpu *vcpu, bool slave);
 };
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ce87878..73f57f3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -601,6 +601,9 @@ static int apic_set_eoi(struct kvm_lapic *apic)
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+   if (vcpu_has_slave_cpu(apic->vcpu) &&
+   kvm_x86_ops->set_direct_interrupt)
+   kvm_x86_ops->set_direct_interrupt(apic->vcpu, 1);
return vector;
 }
 
@@ -1569,6 +1572,8 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 
data)
u64 addr = data & ~KVM_MSR_ENABLED;
if (!IS_ALIGNED(addr, 4))
return 1;
+   if (vcpu_has_slave_cpu(vcpu))
+   return 1;
 
vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 03a2d02..605abea 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1711,6 +1711,16 @@ static inline void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 #endif
 }
 
+static void vmx_set_direct_interrupt(struct kvm_vcpu *vcpu, bool enabled)
+{
+   if (enabled)
+   vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+   PIN_BASED_EXT_INTR_MASK);
+   else
+   vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_EXT_INTR_MASK);
+}
+
 static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, bool slave)
 {
/* Don't intercept the guest's halt on slave CPU */
@@ -1721,6 +1731,8 @@ static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, 
bool slave)
vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
  CPU_BASED_HLT_EXITING);
}
+
+   vmx_set_direct_interrupt(vcpu, slave);
 }
 
 /*
@@ -1776,6 +1788,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, 
unsigned nr,
 
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmx_clear_hlt(vcpu);
+   if (vcpu_has_slave_cpu(vcpu))
+   vmx_set_direct_interrupt(vcpu, 0);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -4147,6 +4161,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
vmx_clear_hlt(vcpu);
+   if (vcpu_has_slave_cpu(vcpu))
+   vmx_set_direct_interrupt(vcpu, 0);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4179,6 +4195,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
vmx_clear_hlt(vcpu);
+   if (vcpu_has_slave_cpu(vcpu))
+   vmx_set_direct_interrupt(vcpu, 0);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -7374,6 +7392,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
.check_intercept = vmx_check_intercept,
 
+   .set_direct_interrupt = vmx_set_direct_interrupt,
.set_slave_mode = vmx_set_slave_mode,
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a6b2521..b7d28df 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -63,6 +63,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define MA

[RFC v2 PATCH 20/21] KVM: Pass-through local APIC timer of on slave CPUs to guest VM

2012-09-06 Thread Tomoki Sekiyama
Give the guest direct control of the local APIC timer on slave CPUs.
The timer interrupt does not cause a VM exit when direct interrupt
delivery is enabled, so to handle the timer correctly, this makes the
guest occupy the local APIC timer.

If the host supports x2APIC, this exposes TMICT and TMCCT to the guest
in order to allow the guest to start the timer and to read the timer
count without a VM exit. Otherwise, the host sets the APIC registers to
the specified values on the guest's behalf. LVTT is not passed through,
to avoid modifying the timer interrupt vector.

Currently, guest timer interrupt vector remapping is not supported, and
the guest must use the same vector as the host.
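
For the x2APIC case, the mechanism presumably amounts to opening the
timer-count MSRs in the slave-mode MSR bitmap while keeping LVTT
intercepted. A sketch under that assumption
(vmx_disable_intercept_for_msr_slave() is from the direct-EOI patch in
this series; APIC_BASE_MSR, APIC_TMICT, APIC_TMCCT and APIC_LVTT come
from <asm/apicdef.h>):

static void expose_lapic_timer_to_guest(void)
{
	/* Guest may program the initial count and read the current count. */
	vmx_disable_intercept_for_msr_slave(
			APIC_BASE_MSR + (APIC_TMICT >> 4), false);
	vmx_disable_intercept_for_msr_slave(
			APIC_BASE_MSR + (APIC_TMCCT >> 4), false);

	/* APIC_LVTT stays intercepted so the vector cannot be changed. */
}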

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/apic.h |4 +++
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kernel/apic/apic.c |   11 ++
 arch/x86/kernel/smpboot.c   |   30 ++
 arch/x86/kvm/lapic.c|   45 +++
 arch/x86/kvm/lapic.h|2 ++
 arch/x86/kvm/vmx.c  |6 +
 arch/x86/kvm/x86.c  |3 +++
 include/linux/cpu.h |5 
 kernel/hrtimer.c|2 +-
 10 files changed, 108 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d37ae5c..66e1155 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -44,6 +44,8 @@ static inline void generic_apic_probe(void)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
+struct clock_event_device;
+
 extern unsigned int apic_verbosity;
 extern int local_apic_timer_c2_ok;
 
@@ -245,6 +247,8 @@ extern void init_apic_mappings(void);
 void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
+extern void override_local_apic_timer(int cpu,
+   void (*handler)(struct clock_event_device *));
 extern int APIC_init_uniprocessor(void);
 extern int apic_force_enable(unsigned long addr);
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f43680e..a95bb62 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1035,6 +1035,7 @@ int kvm_arch_vcpu_run_prevented(struct kvm_vcpu *vcpu);
 
 #ifdef CONFIG_SLAVE_CPU
 void kvm_get_slave_cpu_mask(struct kvm *kvm, struct cpumask *mask);
+struct kvm_vcpu *get_slave_vcpu(int cpu);
 
 struct kvm_assigned_dev_kernel;
 extern void assign_slave_msi(struct kvm *kvm,
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 24deb30..90ed84a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -901,6 +901,17 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs 
*regs)
set_irq_regs(old_regs);
 }
 
+void override_local_apic_timer(int cpu,
+  void (*handler)(struct clock_event_device *))
+{
+   unsigned long flags;
+
+   local_irq_save(flags);
+   per_cpu(lapic_events, cpu).event_handler = handler;
+   local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(override_local_apic_timer);
+
 int setup_profiling_timer(unsigned int multiplier)
 {
return -EINVAL;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 45dfc1d..ba7c99b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -133,6 +133,7 @@ static void __ref remove_cpu_from_maps(int cpu);
 #ifdef CONFIG_SLAVE_CPU
 /* Notify slave cpu up and down */
 static RAW_NOTIFIER_HEAD(slave_cpu_chain);
+struct notifier_block *slave_timer_nb;
 
 int register_slave_cpu_notifier(struct notifier_block *nb)
 {
@@ -140,6 +141,13 @@ int register_slave_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(register_slave_cpu_notifier);
 
+int register_slave_cpu_timer_notifier(struct notifier_block *nb)
+{
+   slave_timer_nb = nb;
+   return register_slave_cpu_notifier(nb);
+}
+EXPORT_SYMBOL(register_slave_cpu_timer_notifier);
+
 void unregister_slave_cpu_notifier(struct notifier_block *nb)
 {
raw_notifier_chain_unregister(&slave_cpu_chain, nb);
@@ -155,6 +163,8 @@ static int slave_cpu_notify(unsigned long val, int cpu)
 
return notifier_to_errno(ret);
 }
+
+static void slave_cpu_disable_timer(int cpu);
 #endif
 
 /*
@@ -1013,10 +1023,30 @@ int __cpuinit slave_cpu_up(unsigned int cpu)
 
cpu_maps_update_done();
 
+   /* Timer may be used only in starting the slave CPU */
+   slave_cpu_disable_timer(cpu);
+
return ret;
 }
 EXPORT_SYMBOL(slave_cpu_up);
 
+static void __slave_cpu_disable_timer(void *hcpu)
+{
+   int cpu = (long)hcpu;
+
+   pr_info("Disabling timer on slave cpu %d\n", cpu);
+   BUG_ON(!slave_timer_nb);
+   slave_timer_nb->notifier_call(slave_timer_nb, CPU_SLAVE_DYING, hcpu);
+}
+
+static void slave_cpu_disable_timer(int cpu)
+{
+   void *h

[RFC v2 PATCH 17/21] KVM: add kvm_arch_vcpu_prevent_run to prevent VM ENTER when NMI is received

2012-09-06 Thread Tomoki Sekiyama
Since NMIs cannot be disabled around VM entry, there is a race between
receiving an NMI to kick a guest and entering the guest on a slave CPU. If
the NMI is received just before entering the VM, the guest is still entered
after the NMI handler returns and the effect of the NMI is lost.

This patch adds kvm_arch_vcpu_prevent_run(), which causes a VM exit right
after VM entry. The NMI handler uses this to ensure that execution of the
guest is cancelled after the NMI.
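
For illustration, the idea can be sketched as below (a hypothetical helper,
not the actual x86.c hunk, which is not shown in this excerpt; the field and
callback names are taken from the diff that follows):

static void example_prevent_vcpu_run(struct kvm_vcpu *vcpu)
{
        /* Mark the vCPU so vmx_vcpu_run() knows the entry must be aborted. */
        vcpu->arch.prevent_run = true;

        /*
         * If the vCPU is already inside the VM-entry window, arm the
         * zero-value preemption timer so the imminent VMRESUME exits at once.
         */
        if (vcpu->arch.prevent_needed)
                kvm_x86_ops->prevent_run(vcpu, 1);
}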

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |6 ++
 arch/x86/kvm/vmx.c  |   42 ++-
 arch/x86/kvm/x86.c  |   31 +
 3 files changed, 78 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 65242a6..624e5ad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -429,6 +429,9 @@ struct kvm_vcpu_arch {
void *insn;
int insn_len;
} page_fault;
+
+   bool prevent_run;
+   bool prevent_needed;
 #endif
 
int halt_request; /* real mode on Intel only */
@@ -681,6 +684,7 @@ struct kvm_x86_ops {
 
void (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu);
+   void (*prevent_run)(struct kvm_vcpu *vcpu, int prevent);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -1027,4 +1031,6 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, 
u64 *data);
 void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
 void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
 
+int kvm_arch_vcpu_run_prevented(struct kvm_vcpu *vcpu);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2130cbd..39a4cb4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1713,6 +1713,9 @@ static inline void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 
 static void vmx_set_direct_interrupt(struct kvm_vcpu *vcpu, bool enabled)
 {
+#ifdef CONFIG_SLAVE_CPU
+   void *msr_bitmap;
+
if (enabled)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_EXT_INTR_MASK);
@@ -1721,6 +1724,7 @@ static void vmx_set_direct_interrupt(struct kvm_vcpu 
*vcpu, bool enabled)
  PIN_BASED_EXT_INTR_MASK);
 
trace_kvm_set_direct_interrupt(vcpu, enabled);
+#endif
 }
 
 static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, bool slave)
@@ -4458,7 +4462,7 @@ static int handle_external_interrupt(struct kvm_vcpu 
*vcpu)
 
 static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 {
-   /* Nothing */
+   kvm_arch_vcpu_run_prevented(vcpu);
return 1;
 }
 
@@ -6052,6 +6056,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
 
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+#ifdef CONFIG_SLAVE_CPU
+   if (vcpu->arch.prevent_run)
+   return kvm_arch_vcpu_run_prevented(vcpu);
+#endif
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
@@ -6059,6 +6067,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
 
if (unlikely(vmx->fail)) {
+#ifdef CONFIG_SLAVE_CPU
+   if (vcpu->arch.prevent_run)
+   return kvm_arch_vcpu_run_prevented(vcpu);
+#endif
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
@@ -6275,6 +6287,21 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
 }
 
+/*
+ * Make VMRESUME fail using preemption timer with timer value = 0.
+ * On processors that do not support the preemption timer, VMRESUME will
+ * fail with an internal error.
+ */
+static void vmx_prevent_run(struct kvm_vcpu *vcpu, int prevent)
+{
+   if (prevent)
+   vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_PREEMPTION_TIMER);
+   else
+   vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+   PIN_BASED_PREEMPTION_TIMER);
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #define Q "q"
@@ -6326,6 +6353,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
atomic_switch_perf_msrs(vmx);
 
+#ifdef CONFIG_SLAVE_CPU
+   barrier();  /* Avoid vmcs modification by NMI before here */
+   vcpu->arch.prevent_needed = 1;
+   if (vcpu->arch.prevent_run)
+   

[RFC v2 PATCH 13/21] x86/apic: IRQ vector remapping on slave for slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Add a facility to use IRQ vectors on slave CPUs that are different from the
vectors used by online CPUs.

When an alternative vector for an IRQ is registered by
remap_slave_vector_irq() and the IRQ affinity is set only to slave CPUs,
the device is configured to use the alternative vector.

The current patch only supports MSI on Intel CPUs with the IOMMU IRQ
remapper.

This is intended to be used for routing interrupts directly to a KVM guest
running on slave CPUs, where external interrupts do not cause VM exits.
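
A hypothetical caller sketch (not part of this patch; the real users are
expected to come with the MSI/MSI-X routing patch of this series), showing
how the new helpers are meant to be combined with the existing affinity
interface:

static int example_route_msi_to_slave(int irq, u8 guest_vector, int slave_cpu)
{
        cpumask_var_t mask;
        int ret;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;
        cpumask_clear(mask);
        cpumask_set_cpu(slave_cpu, mask);

        /* Register the guest's vector for this IRQ on the slave CPU... */
        remap_slave_vector_irq(irq, guest_vector, mask);
        /*
         * ...then point the IRQ at the slave CPU only, so that
         * msi_set_affinity() programs the remapped vector into the MSI message.
         */
        ret = irq_set_affinity(irq, mask);

        free_cpumask_var(mask);
        return ret;
}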

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/irq.h  |   15 
 arch/x86/kernel/apic/io_apic.c  |   68 ++-
 drivers/iommu/intel_irq_remapping.c |2 +
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ba870bb..84756f7 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -41,4 +41,19 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
 
 extern void init_ISA_irqs(void);
 
+#ifdef CONFIG_SLAVE_CPU
+extern void remap_slave_vector_irq(int irq, int vector,
+  const struct cpumask *mask);
+extern void revert_slave_vector_irq(int irq, const struct cpumask *mask);
+extern u8 get_remapped_slave_vector(u8 vector, unsigned int irq,
+   const struct cpumask *mask);
+#else
+static inline u8 get_remapped_slave_vector(u8 vector, unsigned int irq,
+  const struct cpumask *mask)
+{
+   return vector;
+}
+#endif
+
+
 #endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0cd2682..167b001 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1266,6 +1266,69 @@ void __setup_vector_irq(int cpu)
raw_spin_unlock(&vector_lock);
 }
 
+#ifdef CONFIG_SLAVE_CPU
+
+/* vector table remapped on slave cpus, indexed by IRQ */
+static DEFINE_PER_CPU(u8[NR_IRQS], slave_vector_remap_tbl) = {
+   [0 ... NR_IRQS - 1] = 0,
+};
+
+void remap_slave_vector_irq(int irq, int vector, const struct cpumask *mask)
+{
+   int cpu;
+   unsigned long flags;
+
+   raw_spin_lock_irqsave(&vector_lock, flags);
+   for_each_cpu(cpu, mask) {
+   BUG_ON(!cpu_slave(cpu));
+   per_cpu(slave_vector_remap_tbl, cpu)[irq] = vector;
+   per_cpu(vector_irq, cpu)[vector] = irq;
+   }
+   raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+EXPORT_SYMBOL_GPL(remap_slave_vector_irq);
+
+void revert_slave_vector_irq(int irq, const struct cpumask *mask)
+{
+   int cpu;
+   u8 vector;
+   unsigned long flags;
+
+   raw_spin_lock_irqsave(&vector_lock, flags);
+   for_each_cpu(cpu, mask) {
+   BUG_ON(!cpu_slave(cpu));
+   vector = per_cpu(slave_vector_remap_tbl, cpu)[irq];
+   if (vector) {
+   per_cpu(vector_irq, cpu)[vector] = -1;
+   per_cpu(slave_vector_remap_tbl, cpu)[irq] = 0;
+   }
+   }
+   raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+EXPORT_SYMBOL_GPL(revert_slave_vector_irq);
+
+/* If all target CPUs are slaves, return the remapped vector */
+u8 get_remapped_slave_vector(u8 vector, unsigned int irq,
+const struct cpumask *mask)
+{
+   u8 slave_vector;
+
+   if (vector < FIRST_EXTERNAL_VECTOR ||
+   cpumask_intersects(mask, cpu_online_mask))
+   return vector;
+
+   slave_vector = per_cpu(slave_vector_remap_tbl,
+  cpumask_first(mask))[irq];
+   if (slave_vector >= FIRST_EXTERNAL_VECTOR)
+   vector = slave_vector;
+
+   pr_info("slave vector remap: irq: %d => vector: %d\n", irq, vector);
+
+   return vector;
+}
+
+#endif
+
 static struct irq_chip ioapic_chip;
 
 #ifdef CONFIG_X86_32
@@ -3133,6 +3196,7 @@ static int
 msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
struct irq_cfg *cfg = data->chip_data;
+   int vector = cfg->vector;
struct msi_msg msg;
unsigned int dest;
 
@@ -3141,8 +3205,10 @@ msi_set_affinity(struct irq_data *data, const struct 
cpumask *mask, bool force)
 
__get_cached_msi_msg(data->msi_desc, &msg);
 
+   vector = get_remapped_slave_vector(vector, data->irq, mask);
+
msg.data &= ~MSI_DATA_VECTOR_MASK;
-   msg.data |= MSI_DATA_VECTOR(cfg->vector);
+   msg.data |= MSI_DATA_VECTOR(vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c
index df38334..471d23f 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -970,7 +970,7 @@ intel_ioap

[RFC v2 PATCH 15/21] KVM: add tracepoint on enabling/disabling direct interrupt delivery

2012-09-06 Thread Tomoki Sekiyama
Add trace event "kvm_set_direct_interrupt" to trace enabling/disabling of
direct interrupt delivery on slave CPUs. At the event, the guest RIP and
whether the feature is enabled are logged.

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/kvm/trace.h |   18 ++
 arch/x86/kvm/vmx.c   |2 ++
 arch/x86/kvm/x86.c   |1 +
 3 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a71faf7..6081be7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -551,6 +551,24 @@ TRACE_EVENT(kvm_pv_eoi,
TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
 );
 
+TRACE_EVENT(kvm_set_direct_interrupt,
+   TP_PROTO(struct kvm_vcpu *vcpu, bool enabled),
+   TP_ARGS(vcpu, enabled),
+
+   TP_STRUCT__entry(
+   __field(unsigned long,  guest_rip   )
+   __field(bool,   enabled )
+   ),
+
+   TP_fast_assign(
+   __entry->guest_rip  = kvm_rip_read(vcpu);
+   __entry->enabled= enabled;
+   ),
+
+   TP_printk("rip 0x%lx enabled %d",
+__entry->guest_rip, __entry->enabled)
+);
+
 /*
  * Tracepoint for nested VMRUN
  */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 605abea..6dc59c8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1719,6 +1719,8 @@ static void vmx_set_direct_interrupt(struct kvm_vcpu 
*vcpu, bool enabled)
else
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
  PIN_BASED_EXT_INTR_MASK);
+
+   trace_kvm_set_direct_interrupt(vcpu, enabled);
 }
 
 static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, bool slave)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b7d28df..1449187 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6936,3 +6936,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_set_direct_interrupt);




[RFC v2 PATCH 09/21] KVM: Go back to online CPU on VM exit by external interrupt

2012-09-06 Thread Tomoki Sekiyama
If a slave CPU receives an interrupt while running a guest, the current
implementation must go back to an online CPU once to handle the interrupt.

This behavior will be replaced by a later patch, which introduces a
mechanism for the guest to handle interrupts directly.

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/vmx.c  |1 +
 arch/x86/kvm/x86.c  |6 ++
 3 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0ea04c9..af68ffb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
int sipi_vector;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+   bool interrupted;
 
 #ifdef CONFIG_SLAVE_CPU
/* slave cpu dedicated to this vcpu */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bbfa01..d99bee6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4408,6 +4408,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 
 static int handle_external_interrupt(struct kvm_vcpu *vcpu)
 {
+   vcpu->arch.interrupted = true;
++vcpu->stat.irq_exits;
return 1;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b62f59c..db0be81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5566,6 +5566,12 @@ static void __vcpu_enter_guest_slave(void *_arg)
break;
 
/* determine if slave cpu can handle the exit alone */
+   if (vcpu->arch.interrupted) {
+   vcpu->arch.interrupted = false;
+   arg->ret = LOOP_ONLINE;
+   break;
+   }
+
r = vcpu_post_run(vcpu, arg->task, &arg->apf_pending);
 
if (r == LOOP_SLAVE &&




[RFC v2 PATCH 08/21] KVM: Add KVM_GET_SLAVE_CPU and KVM_SET_SLAVE_CPU to vCPU ioctl

2012-09-06 Thread Tomoki Sekiyama
Add an interface to set/get the slave CPU dedicated to a vCPU.

By calling ioctl with KVM_GET_SLAVE_CPU, users can get the slave CPU id
for the vCPU. -1 is returned if a slave CPU is not set.

By calling ioctl with KVM_SET_SLAVE_CPU, users can dedicate the specified
slave CPU to the vCPU. The CPU must be offlined before calling the ioctl.
The CPU is activated as a slave CPU for the vCPU when a valid id is set.
The slave CPU is freed and offlined by setting -1 as the slave CPU id.

Whether getting/setting slave CPUs is supported by KVM can be determined
by checking KVM_CAP_SLAVE_CPU.
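
A minimal userspace sketch (assumptions: error handling is omitted, and the
KVM_CAP_SLAVE_CPU / KVM_SET_SLAVE_CPU definitions added by this patch are
visible through <linux/kvm.h>):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* kvm_fd is the /dev/kvm fd, vcpu_fd the vCPU fd; cpu must already be offline */
static int dedicate_slave_cpu(int kvm_fd, int vcpu_fd, int cpu)
{
        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SLAVE_CPU) <= 0)
                return -1;      /* slave CPU support not available */

        /* Pass -1 instead of cpu to free and offline the slave CPU again. */
        return ioctl(vcpu_fd, KVM_SET_SLAVE_CPU, cpu);
}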

Signed-off-by: Tomoki Sekiyama 
Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
---

 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/vmx.c  |7 +
 arch/x86/kvm/x86.c  |   58 +++
 include/linux/kvm.h |4 +++
 4 files changed, 71 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8dc1a0a..0ea04c9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -718,6 +718,8 @@ struct kvm_x86_ops {
int (*check_intercept)(struct kvm_vcpu *vcpu,
   struct x86_instruction_info *info,
   enum x86_intercept_stage stage);
+
+   void (*set_slave_mode)(struct kvm_vcpu *vcpu, bool slave);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c5db714..7bbfa01 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1698,6 +1698,11 @@ static void skip_emulated_instruction(struct kvm_vcpu 
*vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, bool slave)
+{
+   /* Nothing */
+}
+
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -7344,6 +7349,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_tdp_cr3 = vmx_set_cr3,
 
.check_intercept = vmx_check_intercept,
+
+   .set_slave_mode = vmx_set_slave_mode,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 579c41c..b62f59c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2183,6 +2183,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+#ifdef CONFIG_SLAVE_CPU
+   case KVM_CAP_SLAVE_CPU:
+#endif
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2657,6 +2660,48 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
return 0;
 }
 
+#ifdef CONFIG_SLAVE_CPU
+/* vcpu currently running on each slave CPU */
+static DEFINE_PER_CPU(struct kvm_vcpu *, slave_vcpu);
+
+static int kvm_arch_vcpu_ioctl_set_slave_cpu(struct kvm_vcpu *vcpu,
+int slave, int set_slave_mode)
+{
+   int old = vcpu->arch.slave_cpu;
+   int r = -EINVAL;
+
+   if (slave >= nr_cpu_ids || (slave >= 0 && cpu_online(slave)))
+   goto out;
+   if (slave >= 0 && slave != old && cpu_slave(slave))
+   goto out; /* new slave cpu must be offlined */
+
+   if (old >= 0 && slave != old) {
+   BUG_ON(old >= nr_cpu_ids || !cpu_slave(old));
+   per_cpu(slave_vcpu, old) = NULL;
+   r = slave_cpu_down(old);
+   if (r) {
+   pr_err("kvm: slave_cpu_down %d failed\n", old);
+   goto out;
+   }
+   }
+
+   if (slave >= 0) {
+   r = slave_cpu_up(slave);
+   if (r)
+   goto out;
+   BUG_ON(!cpu_slave(slave));
+   per_cpu(slave_vcpu, slave) = vcpu;
+   }
+
+   vcpu->arch.slave_cpu = slave;
+   if (set_slave_mode && kvm_x86_ops->set_slave_mode)
+   kvm_x86_ops->set_slave_mode(vcpu, slave >= 0);
+out:
+   return r;
+}
+
+#endif
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 unsigned int ioctl, unsigned long arg)
 {
@@ -2937,6 +2982,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_set_guest_paused(vcpu);
goto out;
}
+#ifdef CONFIG_SLAVE_CPU
+   case KVM_SET_SLAVE_CPU: {
+   r = kvm_arch_vcpu_ioctl_set_slave_cpu(vcpu, (int)arg, 1);
+   goto out;
+   }
+   case KVM_GET_SLAVE_CPU: {
+   r = vcpu->arch.slave_cpu;
+   goto out;
+   }
+#endif
default:
r = -EINVAL;
}
@@ -6154,6 +6209,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
kvmclock_reset

[RFC v2 PATCH 00/21] KVM: x86: CPU isolation and direct interrupts delivery to guests

2012-09-06 Thread Tomoki Sekiyama
This RFC patch series provides a facility to dedicate CPUs to KVM guests
and to enable the guests to handle interrupts from passed-through PCI
devices directly (without a VM exit and relay by the host).

With this feature, we can improve the throughput and response time of the
device, as well as the host's CPU usage, by reducing the overhead of
interrupt handling.
This is good for applications using devices with very high throughput or
frequent interrupts (e.g. a 10GbE NIC).
Real-time applications also benefit from the CPU isolation feature, which
reduces interference from host kernel tasks and scheduling delay.

The overview of this patch series is presented in CloudOpen 2012.
The slides are available at:
http://events.linuxfoundation.org/images/stories/pdf/lcna_co2012_sekiyama.pdf

* Changes from v1 ( https://lkml.org/lkml/2012/6/28/30 )
 - SMP guests are supported
 - Direct EOI is added, which eliminates the VM exit on EOI
 - Direct local APIC timer access from guests is added, which passes through
   the physical timer of a dedicated CPU to the guest.
 - Rebased on v3.6-rc4

* How to test
 - Create a guest VM with 1 CPU and some PCI passthrough devices (which
   support MSI/MSI-X).
   Having no VGA display is better...
 - Apply the patch at the end of this mail to qemu-kvm.
   (This patch is just for simple testing, and the dedicated CPU ID for the
   guest is hard-coded.)
 - Run the guest once to ensure the PCI passthrough works correctly.
 - Make the specified CPU offline.
 # echo 0 > /sys/devices/system/cpu/cpu3/online
 - Launch qemu-kvm with the -no-kvm-pit option.
   The offlined CPU is booted as a slave CPU and the guest runs on that CPU.

* To-do
 - Enable slave CPUs to handle access fault
 - Support AMD SVM
 - Support non-Linux guests

---

Tomoki Sekiyama (21):
  x86: request TLB flush to slave CPU using NMI
  KVM: Pass-through local APIC timer of slave CPUs to guest VM
  KVM: Enable direct EOI for directly routed interrupts to guests
  KVM: route assigned devices' MSI/MSI-X directly to guests on slave CPUs
  KVM: add kvm_arch_vcpu_prevent_run to prevent VM ENTER when NMI is 
received
  KVM: vmx: Add definitions PIN_BASED_PREEMPTION_TIMER
  KVM: add tracepoint on enabling/disabling direct interrupt delivery
  KVM: Directly handle interrupts by guests without VM EXIT on slave CPUs
  x86/apic: IRQ vector remapping on slave for slave CPUs
  x86/apic: Enable external interrupt routing to slave CPUs
  KVM: no exiting from guest when slave CPU halted
  KVM: proxy slab operations for slave CPUs on online CPUs
  KVM: Go back to online CPU on VM exit by external interrupt
  KVM: Add KVM_GET_SLAVE_CPU and KVM_SET_SLAVE_CPU to vCPU ioctl
  KVM: handle page faults of slave guests on online CPUs
  KVM: Add facility to run guests on slave CPUs
  KVM: Enable/Disable virtualization on slave CPUs are activated/dying
  x86: Avoid RCU warnings on slave CPUs
  x86: Support hrtimer on slave CPUs
  x86: Add a facility to use offlined CPUs as slave CPUs
  x86: Split memory hotplug function from cpu_up() as cpu_memory_up()


 arch/x86/Kconfig  |   10 +
 arch/x86/include/asm/apic.h   |   10 +
 arch/x86/include/asm/irq.h|   15 +
 arch/x86/include/asm/kvm_host.h   |   59 +
 arch/x86/include/asm/tlbflush.h   |5 
 arch/x86/include/asm/vmx.h|3 
 arch/x86/kernel/apic/apic.c   |   11 +
 arch/x86/kernel/apic/io_apic.c|  111 -
 arch/x86/kernel/apic/x2apic_cluster.c |8 -
 arch/x86/kernel/cpu/common.c  |5 
 arch/x86/kernel/smp.c |2 
 arch/x86/kernel/smpboot.c |  264 ++-
 arch/x86/kvm/irq.c|  136 
 arch/x86/kvm/lapic.c  |   56 +
 arch/x86/kvm/lapic.h  |2 
 arch/x86/kvm/mmu.c|   63 -
 arch/x86/kvm/mmu.h|4 
 arch/x86/kvm/trace.h  |   19 ++
 arch/x86/kvm/vmx.c|  180 +++
 arch/x86/kvm/x86.c|  387 +++--
 arch/x86/kvm/x86.h|9 +
 arch/x86/mm/tlb.c |   94 
 drivers/iommu/intel_irq_remapping.c   |   32 ++-
 include/linux/cpu.h   |   36 +++
 include/linux/cpumask.h   |   26 ++
 include/linux/kvm.h   |4 
 include/linux/kvm_host.h  |2 
 kernel/cpu.c  |   83 +--
 kernel/hrtimer.c  |   14 +
 kernel/irq/manage.c   |4 
 kernel/irq/migration.c|2 
 kernel/irq/proc.c |2 
 kernel/rcutree.c  |   14 +
 kernel/smp.c  |9 +
 virt/kvm/assigned-dev.c   |8 +
 virt/kvm/async_pf.c   |   17 +
 virt/kvm/kvm_main.c   |  


[RFC v2 PATCH 12/21] x86/apic: Enable external interrupt routing to slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Enable APIC to handle interrupts on slave CPUs, and enables interrupt
routing to slave CPUs by setting IRQ affinity.

As slave CPUs which run a KVM guest handle external interrupts directly in
the vCPUs, the guest's vector/IRQ mapping is different from the host's.
That requires interrupts to be routed either online CPUs or slave CPUs.

In this patch, if the specified affinity setting contains any online CPUs,
the setting is applied only to those online CPUs. If every specified CPU is
a slave, the IRQ is routed to the slave CPUs.
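
As a rough illustration of that routing rule (not part of the patch itself;
cpu_slave_mask comes from earlier in this series, and the helper name is
hypothetical):

/*
 * Sketch only: reduce a requested affinity mask, preferring online CPUs
 * and falling back to slave CPUs when none of the requested CPUs is
 * online.  cpu_slave_mask is introduced earlier in this series;
 * route_irq_affinity() is a hypothetical helper, not code from the patch.
 */
static const struct cpumask *
route_irq_affinity(const struct cpumask *requested, struct cpumask *tmp)
{
	if (cpumask_and(tmp, requested, cpu_online_mask))
		return tmp;	/* at least one online CPU was requested */
	cpumask_and(tmp, requested, cpu_slave_mask);
	return tmp;		/* every requested CPU is a slave */
}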

Signed-off-by: Tomoki Sekiyama tomoki.sekiyama...@hitachi.com
Cc: Avi Kivity a...@redhat.com
Cc: Marcelo Tosatti mtosa...@redhat.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
---

 arch/x86/include/asm/apic.h   |6 ++---
 arch/x86/kernel/apic/io_apic.c|   43 -
 arch/x86/kernel/apic/x2apic_cluster.c |8 +++---
 drivers/iommu/intel_irq_remapping.c   |   30 +++
 kernel/irq/manage.c   |4 ++-
 kernel/irq/migration.c|2 +-
 kernel/irq/proc.c |2 +-
 7 files changed, 67 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f342612..d37ae5c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -535,7 +535,7 @@ extern void generic_bigsmp_probe(void);
 static inline const struct cpumask *default_target_cpus(void)
 {
 #ifdef CONFIG_SMP
-   return cpu_online_mask;
+   return cpu_online_or_slave_mask;
 #else
return cpumask_of(0);
 #endif
@@ -543,7 +543,7 @@ static inline const struct cpumask 
*default_target_cpus(void)
 
 static inline const struct cpumask *online_target_cpus(void)
 {
-   return cpu_online_mask;
+   return cpu_online_or_slave_mask;
 }
 
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -602,7 +602,7 @@ flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 {
unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
 cpumask_bits(andmask)[0] &
-cpumask_bits(cpu_online_mask)[0] &
+cpumask_bits(cpu_online_or_slave_mask)[0] &
 APIC_ALL_CPUS;
 
if (likely(cpu_mask)) {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c265593..0cd2682 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1125,7 +1125,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const 
struct cpumask *mask)
/* Only try and allocate irqs on cpus that are present */
err = -ENOSPC;
cpumask_clear(cfg->old_domain);
-   cpu = cpumask_first_and(mask, cpu_online_mask);
+   cpu = cpumask_first_and(mask, cpu_online_or_slave_mask);
while (cpu < nr_cpu_ids) {
int new_cpu, vector, offset;
 
@@ -1158,14 +1158,14 @@ next:
if (unlikely(current_vector == vector)) {
cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
cpumask_andnot(tmp_mask, mask, cfg->old_domain);
-   cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+   cpu = cpumask_first_and(tmp_mask, cpu_online_or_slave_mask);
continue;
}
 
if (test_bit(vector, used_vectors))
goto next;
 
-   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_or_slave_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
/* Found one! */
@@ -1175,7 +1175,7 @@ next:
cfg->move_in_progress = 1;
cpumask_copy(cfg->old_domain, cfg->domain);
}
-   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+   for_each_cpu_and(new_cpu, tmp_mask, cpu_online_or_slave_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
cpumask_copy(cfg->domain, tmp_mask);
@@ -1204,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg 
*cfg)
BUG_ON(!cfg->vector);
 
vector = cfg->vector;
-   for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+   for_each_cpu_and(cpu, cfg->domain, cpu_online_or_slave_mask)
per_cpu(vector_irq, cpu)[vector] = -1;
 
cfg->vector = 0;
@@ -1212,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg 
*cfg)
 
if (likely(!cfg->move_in_progress))
return;
-   for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+   for_each_cpu_and(cpu, cfg->old_domain, cpu_online_or_slave_mask) {
for (vector

[RFC v2 PATCH 11/21] KVM: no exiting from guest when slave CPU halted

2012-09-06 Thread Tomoki Sekiyama
Avoid exiting from a guest on a slave CPU even if the HLT instruction is
executed. Since the slave CPU is dedicated to a vCPU, exiting on HLT is
not required, and avoiding the VM exit improves the guest's performance.

This is a partial revert of

10166744b80a (KVM: VMX: remove yield_on_hlt)

Cc: Avi Kivity a...@redhat.com
Cc: Marcelo Tosatti mtosa...@redhat.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
---

 arch/x86/kvm/vmx.c |   25 -
 1 files changed, 24 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d99bee6..03a2d02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1698,9 +1698,29 @@ static void skip_emulated_instruction(struct kvm_vcpu 
*vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static inline void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_SLAVE_CPU
+   /* Ensure that we clear the HLT state in the VMCS.  We don't need to
+* explicitly skip the instruction because if the HLT state is set,
+* then the instruction is already executing and RIP has already been
+* advanced. */
+   if (vcpu->arch.slave_cpu >= 0 &&
+       vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+   vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+#endif
+}
+
 static void vmx_set_slave_mode(struct kvm_vcpu *vcpu, bool slave)
 {
-   /* Nothing */
+   /* Don't intercept the guest's halt on slave CPU */
+   if (slave) {
+   vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+   CPU_BASED_HLT_EXITING);
+   } else {
+   vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_HLT_EXITING);
+   }
 }
 
 /*
@@ -1755,6 +1775,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, 
unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+   vmx_clear_hlt(vcpu);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -4125,6 +4146,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+   vmx_clear_hlt(vcpu);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -4156,6 +4178,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+   vmx_clear_hlt(vcpu);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)




[RFC v2 PATCH 04/21] x86: Avoid RCU warnings on slave CPUs

2012-09-06 Thread Tomoki Sekiyama
Initialize RCU-related variables to avoid warnings about RCU usage while
slave CPUs are running the specified functions. Also notify the RCU
subsystem before the slave CPU enters the idle state.
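
For reference, a minimal sketch of how another subsystem could hook the
slave-CPU notifier chain used here (register_slave_cpu_notifier() and the
CPU_SLAVE_* actions come from this series; the my_* names are hypothetical):

/* Sketch only: not part of the patch. */
static int my_slave_cpu_notify(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_SLAVE_UP_PREPARE:	/* before the slave CPU starts */
		my_prepare_percpu_data(cpu);	/* hypothetical helper */
		break;
	case CPU_SLAVE_DEAD:		/* after the slave CPU is torn down */
		my_free_percpu_data(cpu);	/* hypothetical helper */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_slave_nb __cpuinitdata = {
	.notifier_call = my_slave_cpu_notify,
};

/* e.g. from an __init function: register_slave_cpu_notifier(&my_slave_nb); */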

Signed-off-by: Tomoki Sekiyama tomoki.sekiyama...@hitachi.com
Cc: Avi Kivity a...@redhat.com
Cc: Marcelo Tosatti mtosa...@redhat.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
---

 arch/x86/kernel/smpboot.c |4 
 kernel/rcutree.c  |   14 ++
 2 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index e8cfe377..45dfc1d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -382,6 +382,8 @@ notrace static void __cpuinit start_slave_cpu(void *unused)
f = per_cpu(slave_cpu_func, cpu);
per_cpu(slave_cpu_func, cpu).func = NULL;
 
+   rcu_note_context_switch(cpu);
+
if (!f.func) {
native_safe_halt();
continue;
@@ -1005,6 +1007,8 @@ int __cpuinit slave_cpu_up(unsigned int cpu)
if (IS_ERR(idle))
return PTR_ERR(idle);
 
+   slave_cpu_notify(CPU_SLAVE_UP_PREPARE, cpu);
+
ret = __native_cpu_up(cpu, idle, 1);
 
cpu_maps_update_done();
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e54..31a7c8c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -2589,6 +2589,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block 
*self,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_UP_PREPARE:
+#endif
rcu_prepare_cpu(cpu);
rcu_prepare_kthreads(cpu);
break;
@@ -2603,6 +2606,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block 
*self,
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_DYING:
+#endif
/*
 * The whole machine is stopped except this CPU, so we can
 * touch any data without introducing corruption. We send the
@@ -2616,6 +2622,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block 
*self,
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+#ifdef CONFIG_SLAVE_CPU
+   case CPU_SLAVE_DEAD:
+#endif
for_each_rcu_flavor(rsp)
rcu_cleanup_dead_cpu(cpu, rsp);
break;
@@ -2797,6 +2806,10 @@ static void __init rcu_init_geometry(void)
rcu_num_nodes -= n;
 }
 
+static struct notifier_block __cpuinitdata rcu_slave_nb = {
+   .notifier_call = rcu_cpu_notify,
+};
+
 void __init rcu_init(void)
 {
int cpu;
@@ -2814,6 +2827,7 @@ void __init rcu_init(void)
 * or the scheduler are operational.
 */
cpu_notifier(rcu_cpu_notify, 0);
+   register_slave_cpu_notifier(rcu_slave_nb);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
check_cpu_stall_init();




[RFC v2 PATCH 16/21] KVM: vmx: Add definitions PIN_BASED_PREEMPTION_TIMER

2012-09-06 Thread Tomoki Sekiyama
Add some definitions to use PIN_BASED_PREEMPTION_TIMER.

When PIN_BASED_PREEMPTION_TIMER is enabled, the guest will exit
with reason=EXIT_REASON_PREEMPTION_TIMER when the counter specified in
VMX_PREEMPTION_TIMER_VALUE becomes 0.
This patch also adds a dummy handler for EXIT_REASON_PREEMPTION_TIMER,
which simply resumes VM execution.

These are currently intended only to be used to avoid entering the
guest on a slave CPU when vmx_prevent_run(vcpu, 1) is called.
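
As a rough sketch of how the new definitions fit together (not taken from
this patch; the helper name and tick value are hypothetical, while the
control and field names are the ones added below):

/* Sketch only: force a VM exit after 'ticks' preemption-timer ticks. */
static void vmx_arm_preemption_timer(u32 ticks)
{
	vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_PREEMPTION_TIMER);
	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, ticks);
	/* The resulting exit is reported as EXIT_REASON_PREEMPTION_TIMER
	 * and handled by the dummy handle_preemption_timer() added below. */
}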

Signed-off-by: Tomoki Sekiyama tomoki.sekiyama...@hitachi.com
Cc: Avi Kivity a...@redhat.com
Cc: Marcelo Tosatti mtosa...@redhat.com
Cc: Thomas Gleixner t...@linutronix.de
Cc: Ingo Molnar mi...@redhat.com
Cc: H. Peter Anvin h...@zytor.com
---

 arch/x86/include/asm/vmx.h |3 +++
 arch/x86/kvm/trace.h   |1 +
 arch/x86/kvm/vmx.c |7 +++
 3 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 74fcb96..6899aaa 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,6 +66,7 @@
 #define PIN_BASED_EXT_INTR_MASK 0x0001
 #define PIN_BASED_NMI_EXITING   0x0008
 #define PIN_BASED_VIRTUAL_NMIS  0x0020
+#define PIN_BASED_PREEMPTION_TIMER  0x0040
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x0002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE0x0200
@@ -196,6 +197,7 @@ enum vmcs_field {
GUEST_INTERRUPTIBILITY_INFO = 0x4824,
GUEST_ACTIVITY_STATE= 0X4826,
GUEST_SYSENTER_CS   = 0x482A,
+   VMX_PREEMPTION_TIMER_VALUE  = 0x482E,
HOST_IA32_SYSENTER_CS   = 0x4c00,
CR0_GUEST_HOST_MASK = 0x6000,
CR4_GUEST_HOST_MASK = 0x6002,
@@ -280,6 +282,7 @@ enum vmcs_field {
 #define EXIT_REASON_APIC_ACCESS 44
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_PREEMPTION_TIMER   52
 #define EXIT_REASON_WBINVD 54
 #define EXIT_REASON_XSETBV 55
 #define EXIT_REASON_INVPCID58
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6081be7..fc350f3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -218,6 +218,7 @@ TRACE_EVENT(kvm_apic,
{ EXIT_REASON_APIC_ACCESS,  "APIC_ACCESS" }, \
{ EXIT_REASON_EPT_VIOLATION,"EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG,"EPT_MISCONFIG" }, \
+   { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
{ EXIT_REASON_WBINVD,   "WBINVD" }
 
 #define SVM_EXIT_REASONS \
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6dc59c8..2130cbd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4456,6 +4456,12 @@ static int handle_external_interrupt(struct kvm_vcpu 
*vcpu)
return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+   /* Nothing */
+   return 1;
+}
+
 static int handle_triple_fault(struct kvm_vcpu *vcpu)
 {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
@@ -5768,6 +5774,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu 
*vcpu) = {
[EXIT_REASON_VMON]= handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+   [EXIT_REASON_PREEMPTION_TIMER]= handle_preemption_timer,
[EXIT_REASON_WBINVD]  = handle_wbinvd,
[EXIT_REASON_XSETBV]  = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,



