[PATCH v7 2/3] VFIO-AER: Vfio-pci driver changes for supporting AER

2013-03-08 Thread Vijay Mohan Pandarathil
- New VFIO_SET_IRQ ioctl option to pass the eventfd that is signaled when
  an error occurs in the vfio_pci_device

- Register pci_error_handler for the vfio_pci driver

- When the device encounters an error, the error handler registered by
  the vfio_pci driver gets invoked by the AER infrastructure

- In the error handler, signal the eventfd registered for the device.

- This results in the qemu eventfd handler getting invoked and
  appropriate action taken for the guest.

Signed-off-by: Vijay Mohan Pandarathil 
---
 drivers/vfio/pci/vfio_pci.c | 44 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 64 +
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h   |  1 +
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 8189cb6..acfcb1a 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device 
*vdev, int irq_type)
 
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
}
-   }
+   } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+   if (pci_is_pcie(vdev->pdev))
+   return 1;
 
return 0;
 }
@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data,
if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
return -EINVAL;
 
+   switch (info.index) {
+   case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+   break;
+   case VFIO_PCI_ERR_IRQ_INDEX:
+   if (pci_is_pcie(vdev->pdev))
+   break;
+   /* pass thru to return error */
+   default:
+   return -EINVAL;
+   }
+
info.flags = VFIO_IRQ_INFO_EVENTFD;
 
info.count = vfio_pci_get_irq_count(vdev, info.index);
@@ -551,11 +564,40 @@ static void vfio_pci_remove(struct pci_dev *pdev)
kfree(vdev);
 }
 
+static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+   struct vfio_pci_device *vdev;
+   struct vfio_device *device;
+
+   device = vfio_device_get_from_dev(&pdev->dev);
+   if (device == NULL)
+   return PCI_ERS_RESULT_DISCONNECT;
+
+   vdev = vfio_device_data(device);
+   if (vdev == NULL) {
+   vfio_device_put(device);
+   return PCI_ERS_RESULT_DISCONNECT;
+   }
+
+   if (vdev->err_trigger)
+   eventfd_signal(vdev->err_trigger, 1);
+
+   vfio_device_put(device);
+
+   return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static struct pci_error_handlers vfio_err_handlers = {
+   .error_detected = vfio_pci_aer_err_detected,
+};
+
 static struct pci_driver vfio_pci_driver = {
.name   = "vfio-pci",
.id_table   = NULL, /* only dynamic ids */
.probe  = vfio_pci_probe,
.remove = vfio_pci_remove,
+   .err_handler= &vfio_err_handlers,
 };
 
 static void __exit vfio_pci_cleanup(void)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 3639371..b84bf22 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -745,6 +745,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device 
*vdev,
return 0;
 }
 
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags, void *data)
+{
+   int32_t fd = *(int32_t *)data;
+   struct pci_dev *pdev = vdev->pdev;
+
+   if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
+   !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+   return -EINVAL;
+
+   /*
+* device_lock synchronizes setting and checking of
+* err_trigger. The vfio_pci_aer_err_detected() is also
+* called with device_lock held.
+*/
+
+   /* DATA_NONE/DATA_BOOL enables loopback testing */
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   device_lock(&pdev->dev);
+   if (vdev->err_trigger)
+   eventfd_signal(vdev->err_trigger, 1);
+   device_unlock(&pdev->dev);
+   return 0;
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t trigger = *(uint8_t *)data;
+   device_lock(&pdev->dev);
+   if (trigger && vdev->err_trigger)
+   eventfd_signal(vdev->err_trigger, 1);
+   device_unlock(&pdev->dev);
+   return 0;
+   }
+
+   /* Handle SET_DATA_EVENTFD */
+
+   if 

[PATCH v7 3/3] QEMU-AER: Qemu changes to support AER for VFIO-PCI devices

2013-03-08 Thread Vijay Mohan Pandarathil
- Create eventfd per vfio device assigned to a guest and register an
  event handler

- This fd is passed to the vfio_pci driver through the SET_IRQ ioctl

- When the device encounters an error, the eventfd is signalled
  and the qemu eventfd handler gets invoked.

- In the handler decide what action to take. Current action taken
  is to stop the guest.

Signed-off-by: Vijay Mohan Pandarathil 
---
 hw/vfio_pci.c  | 123 +
 linux-headers/linux/vfio.h |   1 +
 2 files changed, 124 insertions(+)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index ad9ae36..3c78771 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -38,6 +38,7 @@
 #include "qemu/error-report.h"
 #include "qemu/queue.h"
 #include "qemu/range.h"
+#include "sysemu/sysemu.h"
 
 /* #define DEBUG_VFIO */
 #ifdef DEBUG_VFIO
@@ -129,7 +130,9 @@ typedef struct VFIODevice {
 PCIHostDeviceAddress host;
 QLIST_ENTRY(VFIODevice) next;
 struct VFIOGroup *group;
+EventNotifier err_notifier;
 bool reset_works;
+bool pci_aer;
 } VFIODevice;
 
 typedef struct VFIOGroup {
@@ -1802,6 +1805,7 @@ static int vfio_get_device(VFIOGroup *group, const char 
*name, VFIODevice *vdev)
 {
 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
 struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
+struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
 int ret, i;
 
 ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
@@ -1904,6 +1908,18 @@ static int vfio_get_device(VFIOGroup *group, const char 
*name, VFIODevice *vdev)
 }
 vdev->config_offset = reg_info.offset;
 
+irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
+
+ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret);
+ret = 0;
+} else if (irq_info.count == 1) {
+vdev->pci_aer = true;
+} else {
+error_report("vfio: Warning: Could not enable error recovery for the 
device\n");
+}
 error:
 if (ret) {
 QLIST_REMOVE(vdev, next);
@@ -1925,6 +1941,110 @@ static void vfio_put_device(VFIODevice *vdev)
 }
 }
 
+static void vfio_err_notifier_handler(void *opaque)
+{
+VFIODevice *vdev = opaque;
+
+if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
+return;
+}
+
+/*
+ * TBD. Retrieve the error details and decide what action
+ * needs to be taken. One of the actions could be to pass
+ * the error to the guest and have the guest driver recover
+ * from the error. This requires that PCIe capabilities be
+ * exposed to the guest. For now, we just terminate the
+ * guest to contain the error.
+ */
+
+error_report("%s (%04x:%02x:%02x.%x)"
+"Unrecoverable error detected...\n"
+"Please collect any data possible and then kill the guest",
+__func__, vdev->host.domain, vdev->host.bus,
+vdev->host.slot, vdev->host.function);
+
+vm_stop(RUN_STATE_IO_ERROR);
+}
+
+/*
+ * Registers error notifier for devices supporting error recovery.
+ * If we encounter a failure in this function, we report an error
+ * and continue after disabling error recovery support for the
+ * device.
+ */
+static void vfio_register_err_notifier(VFIODevice *vdev)
+{
+int ret;
+int argsz;
+struct vfio_irq_set *irq_set;
+int32_t *pfd;
+
+if (!vdev->pci_aer) {
+return;
+}
+
+if (event_notifier_init(&vdev->err_notifier, 0)) {
+error_report("vfio: Warning: Unable to init event notifier for error 
detection\n");
+vdev->pci_aer = false;
+return;
+}
+
+argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+irq_set = g_malloc0(argsz);
+irq_set->argsz = argsz;
+irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+irq_set->index = VFIO_PCI_ERR_IRQ_INDEX;
+irq_set->start = 0;
+irq_set->count = 1;
+pfd = (int32_t *)&irq_set->data;
+
+*pfd = event_notifier_get_fd(&vdev->err_notifier);
+qemu_set_fd_handler(*pfd, vfio_err_notifier_handler, NULL, vdev);
+
+ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+if (ret) {
+error_report("vfio: Failed to set up error notification\n");
+qemu_set_fd_handler(*pfd, NULL, NULL, vdev);
+event_notifier_cleanup(&vdev->err_notifier);
+vdev->pci_aer = false;
+}
+g_free(irq_set);
+}
+static void vfio_unregister_err_notifier(VFIODevice *vdev)
+{
+int argsz;
+struct vfio_irq_set *irq_set;
+int32_t *pfd;
+int ret;
+
+if (!vdev->pci_aer) {
+return;
+}
+
+argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+irq_set = g_malloc0(argsz);
+irq_set->argsz = argsz;
+irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TR

[PATCH v7 0/3] AER-KVM: Error containment of VFIO devices assigned to KVM guests

2013-03-08 Thread Vijay Mohan Pandarathil
Add support for error containment when a VFIO device assigned to a KVM
guest encounters an error. This is for PCIe devices/drivers that support AER
functionality. When the host OS is notified of an error in a device either
through the firmware first approach or through an interrupt handled by the AER
root port driver, the error handler registered by the vfio-pci driver gets
invoked. The qemu process is signaled through an eventfd registered per
VFIO device by the qemu process. In the eventfd handler, qemu decides on
what action to take. In this implementation, the guest is stopped to
contain the error.


v7:
 - Rebased to latest upstream
 - Used device_lock() for synchronising err_trigger access
v6:
 - Rebased to latest upstream
 - Resolved merge conflict with vfio_dev_present()
v5:
 - Rebased to latest upstream stable bits
 - Incorporated v4 feedback
v4:
 - Stop the guest instead of terminating
 - Remove unwanted returns from functions
 - Incorporate other feedback
v3:
 - Removed PCI_AER* flags from device info ioctl.
 - Incorporated feedback
v2:
 - Rebased to latest upstream stable bits
 - Changed the new ioctl to be part of VFIO_SET_IRQs ioctl
 - Added a new patch to get/put reference to a vfio device from struct device
 - Incorporated all other feedback.

---

Vijay Mohan Pandarathil(3):

[PATCH 1/3] VFIO: Wrapper to get reference to vfio_device from device 
[PATCH 2/3] VFIO-AER: Vfio-pci driver changes for supporting AER
[PATCH 3/3] QEMU-AER: Qemu changes to support AER for VFIO-PCI devices

Kernel files changed

 drivers/vfio/vfio.c  | 30 +-
 include/linux/vfio.h |  3 +++
 2 files changed, 32 insertions(+), 1 deletion(-)

 drivers/vfio/pci/vfio_pci.c | 44 -
 drivers/vfio/pci/vfio_pci_intrs.c   | 64 +
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h   |  1 +
 4 files changed, 109 insertions(+), 1 deletion(-)

Qemu files changed

 hw/vfio_pci.c  | 123 +
 linux-headers/linux/vfio.h |   1 +
 2 files changed, 124 insertions(+)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7 1/3] VFIO: Wrapper for getting reference to vfio_device from device

2013-03-08 Thread Vijay Mohan Pandarathil
- Added vfio_device_get_from_dev() as wrapper to get
  reference to vfio_device from struct device.

- Added vfio_device_data() as a wrapper to get device_data from
  vfio_device.

Signed-off-by: Vijay Mohan Pandarathil 
---
 drivers/vfio/vfio.c  | 30 +-
 include/linux/vfio.h |  3 +++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index fcc12f3..21eddd9 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -392,12 +392,13 @@ static void vfio_device_release(struct kref *kref)
 }
 
 /* Device reference always implies a group reference */
-static void vfio_device_put(struct vfio_device *device)
+void vfio_device_put(struct vfio_device *device)
 {
struct vfio_group *group = device->group;
kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
vfio_group_put(group);
 }
+EXPORT_SYMBOL_GPL(vfio_device_put);
 
 static void vfio_device_get(struct vfio_device *device)
 {
@@ -627,6 +628,33 @@ int vfio_add_group_dev(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 
+/**
+ * Get a reference to the vfio_device for a device that is known to
+ * be bound to a vfio driver.  The driver implicitly holds a
+ * vfio_device reference between vfio_add_group_dev and
+ * vfio_del_group_dev.  We can therefore use drvdata to increment
+ * that reference from the struct device.  This additional
+ * reference must be released by calling vfio_device_put.
+ */
+struct vfio_device *vfio_device_get_from_dev(struct device *dev)
+{
+   struct vfio_device *device = dev_get_drvdata(dev);
+
+   vfio_device_get(device);
+
+   return device;
+}
+EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
+
+/*
+ * Caller must hold a reference to the vfio_device
+ */
+void *vfio_device_data(struct vfio_device *device)
+{
+   return device->device_data;
+}
+EXPORT_SYMBOL_GPL(vfio_device_data);
+
 /* Given a referenced group, check if it contains the device */
 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ab9e862..ac8d488 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev,
  void *device_data);
 
 extern void *vfio_del_group_dev(struct device *dev);
+extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
+extern void vfio_device_put(struct vfio_device *device);
+extern void *vfio_device_data(struct vfio_device *device);
 
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
-- 
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: lockdep trace from prepare_bprm_creds

2013-03-08 Thread Li Zefan
On 2013/3/9 11:29, Tejun Heo wrote:
> Hello, Li.
> 
> On Sat, Mar 09, 2013 at 10:11:51AM +0800, Li Zefan wrote:
>> On 2013/3/8 3:38, Tejun Heo wrote:
>>> On Thu, Mar 07, 2013 at 08:12:42PM +0100, Oleg Nesterov wrote:
>>>> Well yes, I agree. I think that performance-wise threadgroup_change_begin()
>>>> in de_thread() is fine, and perhaps it is even more clean because we are
>>>> going to do the thread-group change. The scope of cred_guard_mutex is huge,
>>>> it doesn't look very nice in threadgroup_lock().
>>>>
>>>> But we should avoid the cgroup-specific hooks as much as possible, so I
>>>> like your patch more.
>>>
>>> I don't really mind how it's done but while my approach seems to limit
>>> itself to cgroup proper, threadgroup locking is actually more invasive
>>> by meddling with cred_mutex.  As you said, yours is the cleaner and
>>> probably more permanent one here.
>>>
>>
>> Agreed.
>>
>> Now we need that patch to be resent with SOB and proper changelog.
> 
> Now that I think more about it, I think I want both patches.  It is
> bothering that threadgroup lock is nested inside cgroup_lock.  It
> always has.  I just couldn't do anything about that until recently.
> Li, can you be persuaded into getting the lock reordering patch into a
> useable shape?  :)
> 

The patch is actually already in good shape. 

I'll give it some testing, and then you can queue it?

We don't need both patches for 3.9, so we'll queue Oleg's fix for 3.9 and
yours for 3.10?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 02/26] x86, irq: Modify irq chip once for irq remapping

2013-03-08 Thread Yinghai Lu
On Fri, Mar 8, 2013 at 12:10 PM, Thomas Gleixner  wrote:
>
> And just for clarification. If you are not going to provide proper
> changelogs for _all_ patches of the series, this stuff is going
> directly towards /dev/null. I'm not wasting my time to review any of
> that otherwise.

ok, will try to update change logs and resend them next week.

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 12/26] x86, irq: Add realloc_irq_and_cfg_at()

2013-03-08 Thread Yinghai Lu
On Fri, Mar 8, 2013 at 11:53 AM, Konrad Rzeszutek Wilk
 wrote:
>> + * irq_realloc_desc - allocate irq descriptor for irq that is already 
>> reserved
>
> Which begs the question - why was it not allocated when it was reserved?

The reasons for not allocating them during reserving:
1. Only a few pins in the ioapic are used; allocating descriptors for all of
   them would waste memory on the unused ones.
2. Reallocating later makes sure the irq_desc is allocated in local-node RAM.

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: epoll: possible bug from wakeup_source activation

2013-03-08 Thread Eric Wong
Arve Hjønnevåg  wrote:
> On Fri, Mar 8, 2013 at 12:49 PM, Eric Wong  wrote:
> > What happens if ep_modify calls ep_destroy_wakeup_source
> > while __pm_stay_awake is running on the same epi->ws?
> 
> Yes, that looks like a problem. I think calling
> ep_destroy_wakeup_source with ep->lock held should fix that. It is not
> clear how useful changing EPOLLWAKEUP in ep_modify is, so
> alternatively we could remove that feature and instead only allow it
> to be set in ep_insert.

ep->lock would work, but ep->lock is already a source of heavy
contention in my multithreaded+epoll webservers.

Perhaps RCU can be used?  I've no experience with RCU, but I've been
meaning to get acquainted with RCU.

Another possible solution is to only use ep->ws and add an atomic
counter to ep; so __pm_relax(ep->ws) is only called when the atomic
counter reaches zero.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86: kvm: reset the bootstrap processor when it gets an INIT

2013-03-08 Thread Paolo Bonzini
After receiving an INIT signal (either via the local APIC, or through
KVM_SET_MP_STATE), the bootstrap processor should reset immediately
and start execution at 0xfffffff0.  Also, SIPIs have no effect on the
bootstrap processor.  However, KVM currently does not differentiate
between the BSP and APs.

Implement this so that userspace can correctly implement CPU soft resets
even when the in-kernel APIC is in use.  Another small change is needed,
because INITs sent to the bootstrap processor do not go through a halt
state; it is incorrect to go through kvm_vcpu_block.  I think this also
fixes a pre-existing race between sending the INIT and SIPI interrupts; if the
two were close enough, the receiving VCPU could have received the SIPI
before entering kvm_vcpu_block.  It would then stay in kvm_vcpu_block
until the next kvm_vcpu_kick.  In practice this was not a problem,
because the Intel SDM suggests sending two SIPIs with some time passing
between them; the second SIPI would unblock the VCPU.

The tests in vcpu_needs_reset are organized so that the hypervisor
will go through the same number of compare-and-jump sequences as
before in the common case.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/lapic.c |  3 ++-
 arch/x86/kvm/x86.c   | 23 +++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9392f52..0c515ac 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -710,7 +710,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
case APIC_DM_STARTUP:
apic_debug("SIPI to vcpu %d vector 0x%02x\n",
   vcpu->vcpu_id, vector);
-   if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+   if (!kvm_vcpu_is_bsp(apic->vcpu) &&
+   vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
result = 1;
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c243b81..603e6ff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5784,15 +5784,29 @@ out:
return r;
 }
 
+static inline int vcpu_needs_reset(struct kvm_vcpu *vcpu)
+{
+   /* Shortcut the test in the common case.  */
+   if (likely(vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE))
+   return 0;
+
+   if (kvm_vcpu_is_bsp(vcpu))
+   return vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED;
+   else
+   return vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+}
 
 static int __vcpu_run(struct kvm_vcpu *vcpu)
 {
int r;
struct kvm *kvm = vcpu->kvm;
 
-   if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-   pr_debug("vcpu %d received sipi with vector # %x\n",
-vcpu->vcpu_id, vcpu->arch.sipi_vector);
+   if (unlikely(vcpu_needs_reset(vcpu))) {
+   if (kvm_vcpu_is_bsp(vcpu))
+   pr_debug("vcpu %d received init\n", vcpu->vcpu_id);
+   else
+   pr_debug("vcpu %d received sipi with vector # %x\n",
+vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
r = kvm_vcpu_reset(vcpu);
if (r)
@@ -5812,6 +5826,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
+   else if (unlikely(vcpu_needs_reset(vcpu)))
+   r = -EINTR;
else {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
@@ -5825,7 +5841,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
-   case KVM_MP_STATE_SIPI_RECEIVED:
default:
r = -EINTR;
break;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/1] scripts/gen_sendmail.py: Script Added: Generate git send-email arguments automatically!

2013-03-08 Thread Raphael S. Carvalho
I guess this script won't be merged upstream, but I think it could be useful 
for someone else.

Description borrowed from the header:
---
!# This script generates the git send-email automatically
!# by looking at the output generated by scripts/get_maintainer.pl
!#
!# You can pass as many patch files as needed.
!# Usage: python send-mail.py [option]   ...

Example of use:
Passed as arguments one patch file and -v (verbose mode).
---
raphaelsc@debian:~/kernel/linux$ scripts/gen_sendmail.py -v 
0001-kernel-pid.c-Improve-flow-of-a-loop-inside-alloc_pid.patch
git send-email --from "Raphael S. Carvalho " --to 
"Eric W. Biederman " --to "Andrew Morton 
" --to "Serge E. Hallyn " --to 
"Serge Hallyn " --to "David S. Miller 
" --cc "linux-kernel@vger.kernel.org" 
0001-kernel-pid.c-Improve-flow-of-a-loop-inside-alloc_pid.patch

* Statistics: Maintainer(s): 5, List(s): 1
---

Any bug reports or improvements are welcome!

Signed-off-by: Raphael S. Carvalho 
---
 scripts/gen_sendmail.py |  165 +++
 1 files changed, 165 insertions(+), 0 deletions(-)
 create mode 100755 scripts/gen_sendmail.py

diff --git a/scripts/gen_sendmail.py b/scripts/gen_sendmail.py
new file mode 100755
index 000..921a8cc
--- /dev/null
+++ b/scripts/gen_sendmail.py
@@ -0,0 +1,165 @@
+#! /usr/bin/env python
+#
+# Generate git send-email arguments (gen_sendmail.py)
+# (c) 2013, Raphael S.Carvalho 
+#
+# This script generates the git send-email automatically
+# by looking at the output generated by scripts/get_maintainer.pl
+#
+# You can pass as many patch files as needed.
+# Usage: python send-mail.py [options]   ...
+#
+# Licensed under the terms of the GNU GPL License version 2
+
+import commands
+import sys
+import StringIO
+import getopt
+
+# Default definitions
+GIT_SENDMAIL = "git send-email"
+SCRIPT = "scripts/get_maintainer.pl"
+
+
+def get_user_gitconfig(git_config):
+   get_name = "git config user.name"
+   get_email = "git config user.email"
+
+   (stat, name) = commands.getstatusoutput(get_name)
+   if (stat != 0):
+   return (stat, get_name)
+
+   (stat, email) = commands.getstatusoutput(get_email)
+   if (stat != 0):
+   return (stat, get_email)
+
+   # Setup git config structure!
+   git_config['user_name'] = name
+   git_config['user_email'] = email
+
+   return (0, None)
+
+
+# Try to execute: get_maintainer.pl 
+def exec_get_maintainers(patch_name):
+   command = SCRIPT + ' ' + patch_name
+   (stat, output) = commands.getstatusoutput(command)
+   return (stat, output)
+
+
+def find_maintainer(maintainers, email):
+   for m in maintainers:
+   if m['email'] == email:
+   return True
+   return False
+
+
+# Get file and/or mail from each line,
+# and build a simple maintainers database.
+def build_list(buf):
+   maintainers = []
+
+   for line in iter(buf.readline, ""):
+   name = ""
+   email = ""
+
+   pos = line.find("<")
+   if pos != -1:
+   name = line[: pos-1].replace('"', "")
+   pos2 = line.find("(")
+   email = line[pos : pos2-1]
+   else:
+   pos = line.find("(")
+   email = line[: pos-1]
+
+   # If not find_maintainer, then add to the list.
+   if not find_maintainer(maintainers, email):
+   maintainer = {'name': name, 'email': email}
+   maintainers.append(maintainer)
+
+   return maintainers
+
+
+# Generates command from the built database.
+def generate_gitmail_cmd(maintainers, git_config, args):
+   mnt_count = list_count = 0
+
+   print '%s --from "%s <%s>"' % \
+   (GIT_SENDMAIL, \
+   git_config['user_name'], git_config['user_email']),
+
+   for m in maintainers:
+   if m['name'] != "":
+   print '--to "%s %s"' % (m['name'], m['email']),
+   mnt_count += 1
+   else:
+   print '--cc "%s"' % m['email'],
+   list_count += 1
+   for arg in args:
+   print arg,
+
+   return (mnt_count, list_count)
+
+
+def usage(program):
+   print 'Usage: %s [options] \n' \
+   '-v --verbose: Print verbose messages.\n' \
+   '-h --help: Print this information.\n\n' \
+   'This script must be installed in the directory' \
+   ' scripts of linux tree!' % program,
+   sys.exit(2)
+
+
+def main(argc, argv):
+   verbose = False
+
+   try:
+   opts, args = getopt.getopt(argv[1:], 'hv', ['help', 'verbose'])
+   if not opts and argc < 2:
+   usage(argv[0])
+   except getopt.GetoptError, e:
+   print e
+   usage(argv[0])
+
+ 

Re: [PATCH] [Timer][Trivial] __clocksource_register_scale return value use?

2013-03-08 Thread anish singh
ping

On Thu, Mar 7, 2013 at 4:41 PM, anish kumar  wrote:
> __clocksource_register_scale() currently returns int but it should
> return void as there are no error paths in that function.
> Making it void would help some amount of code to be removed at various
> places.
>
> clocksource_register_hz/khz() return value is checked
> in most of the places but I think it will translate to always
> if(true) so let's remove those checks as well(patch will be sent
> later for that).
>
> Is this return value for some future usecase(?), if yes then my
> apologies.
>
> Signed-off-by: anish kumar 
> ---
>  include/linux/clocksource.h |6 +++---
>  kernel/time/clocksource.c   |7 +--
>  2 files changed, 4 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
> index 27cfda4..2b074cc 100644
> --- a/include/linux/clocksource.h
> +++ b/include/linux/clocksource.h
> @@ -294,17 +294,17 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32
> from, u32 to, u32 minsec);
>   * Don't call __clocksource_register_scale directly, use
>   * clocksource_register_hz/khz
>   */
> -extern int
> +extern void
>  __clocksource_register_scale(struct clocksource *cs, u32 scale, u32
> freq);
>  extern void
>  __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32
> freq);
>
> -static inline int clocksource_register_hz(struct clocksource *cs, u32
> hz)
> +static inline void clocksource_register_hz(struct clocksource *cs, u32
> hz)
>  {
> return __clocksource_register_scale(cs, 1, hz);
>  }
>
> -static inline int clocksource_register_khz(struct clocksource *cs, u32
> khz)
> +static inline void clocksource_register_khz(struct clocksource *cs, u32
> khz)
>  {
> return __clocksource_register_scale(cs, 1000, khz);
>  }
> diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
> index c958338..1915550 100644
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -703,14 +703,11 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
>   * @scale: Scale factor multiplied against freq to get clocksource hz
>   * @freq:  clocksource frequency (cycles per second) divided by scale
>   *
> - * Returns -EBUSY if registration fails, zero otherwise.
> - *
>   * This *SHOULD NOT* be called directly! Please use the
>   * clocksource_register_hz() or clocksource_register_khz helper
> functions.
>   */
> -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32
> freq)
> +void __clocksource_register_scale(struct clocksource *cs, u32 scale,
> u32 freq)
>  {
> -
> /* Initialize mult/shift and max_idle_ns */
> __clocksource_updatefreq_scale(cs, scale, freq);
>
> @@ -720,11 +717,9 @@ int __clocksource_register_scale(struct clocksource
> *cs, u32 scale, u32 freq)
> clocksource_enqueue_watchdog(cs);
> clocksource_select();
> mutex_unlock(&clocksource_mutex);
> -   return 0;
>  }
>  EXPORT_SYMBOL_GPL(__clocksource_register_scale);
>
> -
>  /**
>   * clocksource_register - Used to install new clocksources
>   * @cs:clocksource to be registered
> --
> 1.7.1
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Suggestion for fixing the variable length array used in the kernel.

2013-03-08 Thread Dan Carpenter
On Fri, Mar 08, 2013 at 04:29:22PM -0800, Andrew Morton wrote:
> On Wed, 6 Mar 2013 20:46:35 -0800 Christopher Li  wrote:
> 
> > Hi,
> > 
> > I am looking at the current sparse warnings on the kernel source.
> > One category of those warnings is produced by variable length arrays.
> > We all know that the kernel stack has a limit so we don't want to allocate
> > too much stack to the variable size array.
> > 
> > Is there a recommended way to fix those warnings? Is it worth while to
> > fix it at all? I am looking forward to some kind of guideline how to handle
> > this.
> 
> Roughly how many instances of this are there kernel-wide?
> 

Around 150 on x86 allmodconfig.  They are pretty well audited.

regards,
dan carpenter

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


BUG in get_next_timer_interrupt

2013-03-08 Thread Syam Puranam
Hello -

I am seeing the following crash on an older kernel (2.6.32-220) but
thought I would share it, as I couldn't find earlier reports of this.
Please let me know if any more information would help.

BUG: unable to handle kernel NULL pointer dereference at 0028
IP: [] get_next_timer_interrupt+0x148/0x250
PGD b0c981067 PUD 0
Oops:  [#1] SMP
last sysfs file: /sys/module/garp/parameters/garp_join_time
CPU 6
Modules linked in: 8021q garp bridge stp llc nfp(U) ipmi_devintf
ipmi_si ipmi_msghandler fuse sunrpc cpufreq_ondemand acpi_cpufreq
freq_table mperf bonding nf_conntrack_ipv4 nf_defrag_ipv4 ipt_REJECT
xt_comment ipt_LOG xt_limit iptable_filter ip_tables ip6t_REJECT
nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter
ip6_tables ipv6 xfs exportfs igb sg dcdbas microcode i2c_i801 i2c_core
iTCO_wdt iTCO_vendor_support ioatdma dca i7core_edac edac_core shpchp
ext4 mbcache jbd2 sd_mod crc_t10dif mpt2sas scsi_transport_sas
raid_class ahci dm_mirror dm_region_hash dm_log dm_mod [last unloaded:
scsi_wait_scan]

Pid: 0, comm: swapper Tainted: GW  
2.6.32-220.el6.x86_64 #1 Dell   C6100   /0D61XP
RIP: 0010:[]  []
get_next_timer_interrupt+0x148/0x250
RSP: 0018:880630a37e48  EFLAGS: 00010007
RAX:  RBX: 000281159654 RCX: 880c30fa5340
RDX: 0001 RSI: 0032 RDI: 880c30fa5020
RBP: 880630a37e98 R08: 0030 R09: 02811570
R10: 880630a37e60 R11: 880630a37e78 R12: 000281156f45
R13: 880c30fa4000 R14: 0040 R15: 0016f42f2a7cd7a4
FS:  () GS:88065544() knlGS:
CS:  0010 DS: 0018 ES: 0018 CR0: 8005003b
CR2: 0028 CR3: 000b0cabc000 CR4: 06e0
DR0:  DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0400
Process swapper (pid: 0, threadinfo 880630a36000, task 880c30e394c0)
Stack:
 880630a37e58 81013563 880c30fa5020 880c30fa5420
<0> 880c30fa5820 880c30fa5c20 880655451040 0006
<0> 000281156f45 0292 880630a37ef8 810a14f8
Call Trace:
 [] ? native_sched_clock+0x13/0x60
 [] tick_nohz_stop_sched_tick+0x2e8/0x3d0
 [] cpu_idle+0x79/0x110
 [] start_secondary+0x202/0x245
Code: 89 45 d8 45 89 c8 41 83 e0 3f 44 89 c6 66 0f 1f 44 00 00 48 63
ce 48 c1 e1 04 48 8b 04 39 48 8d 0c 0f 48 39 c8 74 22 0f 1f 40 00 
40 28 01 75 10 48 8b 50 10 48 39 da 48 0f 48 da ba 01 00 00
RIP  [] get_next_timer_interrupt+0x148/0x250
 RSP 
CR2: 0028
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Sat, Mar 09, 2013 at 12:13:16AM -0500, Sasha Levin wrote:
 > On 03/08/2013 11:39 PM, Dave Jones wrote:
 > > On Fri, Mar 08, 2013 at 08:31:48PM -0800, Linus Torvalds wrote:
 > >  > On Fri, Mar 8, 2013 at 7:50 PM, Dave Jones  wrote:
 > >  > >  > >
 > >  > >  > > I have a feeling there were some sysfs ones that may still be 
 > > unfixed.
 > >  > >
 > >  > > I was right..
 > >  > >
 > >  > > [  425.836722] general protection fault:  [#1] PREEMPT SMP
 > >  > 
 > >  > You forgot to enable DEBUG_PAGE_ALLOC again, but I don't think it much
 > >  > matters. It's another slab free poison thing.
 > >  > 
 > >  > The faulting instruction is
 > >  > 
 > >  > mov0x28(%rbx),%ecx
 > >  > 
 > >  > with %rbx having the value 6b6b6b6b6b6b6b6b.
 > >  > 
 > >  > > [  425.847859] RIP: 0010:[]  [] 
 > > sysfs_find_dirent+0x47/0xf0
 > >  > 
 > >  > That seems to be
 > >  > 
 > >  > if (hash != sd->s_hash)
 > >  > 
 > >  > from sysfs_name_compare() that has been inlined into
 > >  > sysfs_find_dirent(). And where "sd" is the corrupted value. If I read
 > >  > things right.
 > >  > 
 > >  > So it looks like the sysfs rbtree is corrupted or something. Adding
 > >  > Greg to the cc.
 > >  
 > > oh , that rings a bell.  ISTR this had something to do with Sasha's idr 
 > > changes.
 > 
 > I believe the issue you're referring to is my report of a panic in sysfs keys
 > where the device has gone away? If that is it, I don't think that this issue
 > is related to that one.
 > 
 > If not, I'm not sure which change you're referring to.

Hmm I might have been thinking of the hlist changes rather than idr.
Though looking at those oopses, they were in find_pid_ns, so different.

So many bugs.

Dave

 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Sasha Levin
On 03/08/2013 11:39 PM, Dave Jones wrote:
> On Fri, Mar 08, 2013 at 08:31:48PM -0800, Linus Torvalds wrote:
>  > On Fri, Mar 8, 2013 at 7:50 PM, Dave Jones  wrote:
>  > >  > >
>  > >  > > I have a feeling there were some sysfs ones that may still be 
> unfixed.
>  > >
>  > > I was right..
>  > >
>  > > [  425.836722] general protection fault:  [#1] PREEMPT SMP
>  > 
>  > You forgot to enable DEBUG_PAGE_ALLOC again, but I don't think it much
>  > matters. It's another slab free poison thing.
>  > 
>  > The faulting instruction is
>  > 
>  > mov0x28(%rbx),%ecx
>  > 
>  > with %rbx having the value 6b6b6b6b6b6b6b6b.
>  > 
>  > > [  425.847859] RIP: 0010:[]  [] 
> sysfs_find_dirent+0x47/0xf0
>  > 
>  > That seems to be
>  > 
>  > if (hash != sd->s_hash)
>  > 
>  > from sysfs_name_compare() that has been inlined into
>  > sysfs_find_dirent(). And where "sd" is the corrupted value. If I read
>  > things right.
>  > 
>  > So it looks like the sysfs rbtree is corrupted or something. Adding
>  > Greg to the cc.
>  
> oh , that rings a bell.  ISTR this had something to do with Sasha's idr 
> changes.

I believe the issue you're referring to is my report of a panic in sysfs keys
where the device has gone away? If that is it, I don't think that this issue
is related to that one.

If not, I'm not sure which change you're referring to.


Thanks,
Sasha

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v4] Quirk for buggy dma source tags with Intel IOMMU.

2013-03-08 Thread Andrew Cooks
On Fri, Mar 8, 2013 at 7:43 PM, Gaudenz Steinlin  wrote:
>
> Hi Andrew
>
> Andrew Cooks  writes:
>
>> This patch creates a quirk to allow the Intel IOMMU to be enabled for devices
>> that use incorrect tags during DMA. It is similar to the quirk for Ricoh
>> devices, but allows mapping multiple functions and mapping of 'ghost'
>> functions that do not correspond to real devices. Devices that need this
>> include a variety of Marvell 88SE91xx based SATA controllers. [1][2]
>
> I can confirm that this version of the patch also works for my mini-PCIe
> device (88NV9143). See my mail about it for more information. I had
> to manually fix the patch because the patch utility did not understand
> it. There is a formatting error in the last hunk for quirks.c (missing
> space before context line) and the line count in the hunk header is
> wrong (66 lines changed should be 56 lines). I hope nothing was missing
> from the patch.
>
> Tested on 3.8.2.

Thanks for testing.

The formatting error is embarrassing. I was impatient and removed some
unused content from the patch, instead of cleaning the source. The
thing about posting to open lists with thousands of subscribers and
searchable archives is that it's impossible to hide incompetence.

a.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 08:31:48PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 7:50 PM, Dave Jones  wrote:
 > >  > >
 > >  > > I have a feeling there were some sysfs ones that may still be unfixed.
 > >
 > > I was right..
 > >
 > > [  425.836722] general protection fault:  [#1] PREEMPT SMP
 > 
 > You forgot to enable DEBUG_PAGE_ALLOC again, but I don't think it much
 > matters. It's another slab free poison thing.
 > 
 > The faulting instruction is
 > 
 > mov0x28(%rbx),%ecx
 > 
 > with %rbx having the value 6b6b6b6b6b6b6b6b.
 > 
 > > [  425.847859] RIP: 0010:[]  [] 
 > > sysfs_find_dirent+0x47/0xf0
 > 
 > That seems to be
 > 
 > if (hash != sd->s_hash)
 > 
 > from sysfs_name_compare() that has been inlined into
 > sysfs_find_dirent(). And where "sd" is the corrupted value. If I read
 > things right.
 > 
 > So it looks like the sysfs rbtree is corrupted or something. Adding
 > Greg to the cc.
 
oh , that rings a bell.  ISTR this had something to do with Sasha's idr changes.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 7:50 PM, Dave Jones  wrote:
>  > >
>  > > I have a feeling there were some sysfs ones that may still be unfixed.
>
> I was right..
>
> [  425.836722] general protection fault:  [#1] PREEMPT SMP

You forgot to enable DEBUG_PAGE_ALLOC again, but I don't think it much
matters. It's another slab free poison thing.

The faulting instruction is

    mov    0x28(%rbx),%ecx

with %rbx having the value 6b6b6b6b6b6b6b6b.

> [  425.847859] RIP: 0010:[]  [] 
> sysfs_find_dirent+0x47/0xf0

That seems to be

if (hash != sd->s_hash)

from sysfs_name_compare() that has been inlined into
sysfs_find_dirent(). And where "sd" is the corrupted value. If I read
things right.

So it looks like the sysfs rbtree is corrupted or something. Adding
Greg to the cc.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 0/5] crash dump bitmap: scan memory pages in kernel to speedup kernel dump process

2013-03-08 Thread HATAYAMA Daisuke
From: Jingbai Ma 
Subject: Re: [RFC PATCH 0/5] crash dump bitmap: scan memory pages in kernel to 
speedup kernel dump process
Date: Fri, 8 Mar 2013 18:06:31 +0800

> On 03/07/2013 11:21 PM, Vivek Goyal wrote:
>> On Thu, Mar 07, 2013 at 10:58:18PM +0800, Jingbai Ma wrote:
...
>> First of all 64MB per TB should not be a huge deal. And makedumpfile
>> also has this cyclic mode where you process a map, discard it and then
>> move on to next section. So memory usage remains constant at the
>> expense
>> of processing time.
> 
> Yes, that's true. But in cyclic mode, makedumpfile will have to
> write/read bitmap from storage, it will also impact the performance.
> I have measured the penalty for cyclic mode is about 70%
> slowdown. Maybe could be faster after mmap implemented.

I guess the slowdown came from the issue that enough VMCOREINFO was
not provided from the kernel, and unnecessary filtering processing for
free pages is done multiple times.

For example, confirm how filtering is done in your environment like
this:

$ makedumpfile --message-level 16  # 16 is report message
makedumpfile: map_size = 4
sadump: does not have partition header
...
  pfn_end: 88
Can't select page_is_buddy handler; follow free lists instead of mem_map array.
STEP [Excluding free pages   ] : 0.431724 seconds
STEP [Excluding unnecessary pages] : 1.052160 seconds

Here the STEP [..] column occurs as many times as the number of cycles in cyclic-mode. If
STEP [Excluding free pages ] column occurs multiple times in log, it
causes the slowdown on your environment. (free_list doesn't sort its
elements in pfn's order, so we have only to iterate a whole part of
free_list in each cycle...; it could amount to be close to a whole
memory size in worst case just after system boot)

To use mem_map array logic, VMCOREINFO needs to have the corresponding
information to refer to related data structures. The patch is 

commit 8d67091ec6ae98ca67f77990ef9e9ec21337f077
Author: Atsushi Kumagai 
Date:   Wed Feb 27 17:03:25 2013 -0800

kexec: add the values related to buddy system for filtering free pages.

and it has been merged in 3.9-rc1.

$ git describe 8d67091ec6ae98ca67f77990ef9e9ec21337f077
v3.8-9443-g8d67091

Or you can edit VMCOREINFO manually and specify it to makedumpfile as:

1. generate vmcoreinfo from vmlinux

  makedumpfile -x vmlinux -g vmcoreinfo.txt

2. Add the following values in the generated vmcoreinfo.txt

- 3.1, 3.4, 3.8.x
NUMBER(PG_slab)=7
SIZE(pageflags)=4
OFFSET(page._mapcount)=24
OFFSET(page.private)=48
NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE)=-128

- 2.6.38
SIZE(pageflags)=4
OFFSET(page._mapcount)=12
OFFSET(page.private)=16
NUMBER(PG_slab)=7
NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE)=-2

- 2.6.32
NUMBER(PG_slab)=7
NUMBER(PG_buddy)=19
OFFSET(page._mapcount)=12
OFFSET(page.private)=16
SIZE(pageflags)=4

- 2.6.18
NUMBER(PG_slab)=7
NUMBER(PG_buddy)=19
OFFSET(page._mapcount)=12
OFFSET(page.private)=16

3. Specify the vmcoreinfo.txt to makedumpfile via -i option

  makedumpfile -i vmcoreinfo.txt [-c|-l|-p] -d 31 /proc/vmcore dumpfile

Anyway, please help benchmark. I'll send CC to you too.

Thanks.
HATAYAMA, Daisuke

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 07:38:33PM -0800, Eric W. Biederman wrote:
 > Dave Jones  writes:
 > 
 > > On Fri, Mar 08, 2013 at 07:16:09PM -0800, Linus Torvalds wrote:
 > >  > Goodie. Your bug reports gave me heartburn. But it sounds like we have 
 > > an
 > >  > angle on all of the ones you've seen now?
 > >  > 
 > >  > Or have I forgotten about some case?
 > >  
 > > To be honest I've lost track of the whole collection.
 > > Let me repull your latest tree, and see what falls out.
 > > (I'll turn off CONFIG_USER_NS for now too until that gets fixed)
 > 
 > It was CONFIG_UTS_NS that tripped you.  
 > 
 > Since I can trigger this with /proc/self/ns/mnt/a you are going to be
 > able to compile this one out.

fwiw, the other namespace procfs files look like they have the same bug

I just triggered it again on /proc/571/task/571/ns/net

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: epoll: possible bug from wakeup_source activation

2013-03-08 Thread Arve Hjønnevåg
On Fri, Mar 8, 2013 at 12:49 PM, Eric Wong  wrote:
> Arve Hjønnevåg  wrote:
>> On Thu, Mar 7, 2013 at 5:30 PM, Eric Wong  wrote:
>> > Eric Wong  wrote:
>> >> Hi Arve, looking at commit 4d7e30d98939a0340022ccd49325a3d70f7e0238
>> >> (epoll: Add a flag, EPOLLWAKEUP, to prevent suspend ...)
>> >>
>> >> I think the reason for using ep->ws instead of epi->ws in the unlikely
>> >> ovflist case applies to the likely rdllist case, too.  Since epi->ws is
>> >> only protected by ep->mtx, it can also be deactivated while inside
>> >> ep_poll_callback.
>> >>
>> >> So something like the following patch might be necessary
>> >> (shown here with extra context):
>> >>
>> >> --- a/fs/eventpoll.c
>> >> +++ b/fs/eventpoll.c
>> >> @@ -968,39 +968,45 @@ static int ep_poll_callback(wait_queue_t *wait, 
>> >> unsigned mode, int sync, void *k
>> >>   if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
>> >>   if (epi->next == EP_UNACTIVE_PTR) {
>> >>   epi->next = ep->ovflist;
>> >>   ep->ovflist = epi;
>> >>   if (epi->ws) {
>> >>   /*
>> >>* Activate ep->ws since epi->ws may get
>> >>* deactivated at any time.
>> >>*/
>> >>   __pm_stay_awake(ep->ws);
>> >>   }
>> >>
>> >>   }
>> >
>> > Thinking about this more, it looks like the original ep->ovflist case of
>> > using ep->ws is unnecessary.
>> >
>> > ep->ovflist != EP_UNACTIVE_PTR can only happen while ep->mtx is held (in
>> > ep_scan_ready_list); which means ep_modify+friends cannot remove epi->ws.
>> >
>>
>> The callback function in ep_scan_ready_list can call __pm_relax on it though.
>>
>> > ep_poll_callback holding ep->lock means ep_poll_callback prevents
>> > ep_scan_ready_list from setting ep->ovflist = EP_UNACTIVE_PTR and
>> > releasing ep->mtx.
>>
>> This code is reached when ep_scan_ready_list has set ep->ovflist to
>> NULL before releasing ep->lock. Since the callback function can call
>> __pm_relax on epi->ws without holding ep->lock we call __pm_stay_awake
>> in ep->ws here (the callback does not call __pm_relax on that).
>
> Thanks for the explanation.  I got "deactivate" and "destroy"
> mixed up.  However, I'm still concerned about the "destroy" case:
>
>> >
>> >>   goto out_unlock;
>> >>   }
>> >>
>> >>   /* If this file is already in the ready list we exit soon */
>> >>   if (!ep_is_linked(&epi->rdllink)) {
>> >>   list_add_tail(&epi->rdllink, &ep->rdllist);
>> >> - __pm_stay_awake(epi->ws);
>> >> + if (epi->ws) {
>> >> + /*
>> >> +  * Activate ep->ws since epi->ws may get
>> >> +  * deactivated at any time.
>> >> +  */
>> >> + __pm_stay_awake(ep->ws);
>> >> + }
>> >>   }
>> >
>> > I still think ep->ws needs to be used in the common ep->rdllist case.
>>
>> ep_scan_ready_list calls __pm_relax on ep->ws when it is done, so this
>> will not work. ep->ws is not a "ep->rdllist not empty" wakeup_source; it
>> is a "ep_scan_ready_list is running" wakeup_source.
>
> What happens if ep_modify calls ep_destroy_wakeup_source
> while __pm_stay_awake is running on the same epi->ws?

Yes, that looks like a problem. I think calling
ep_destroy_wakeup_source with ep->lock held should fix that. It is not
clear how useful changing EPOLLWAKEUP in ep_modify is, so
alternatively we could remove that feature and instead only allow it
to be set in ep_insert.

-- 
Arve Hjønnevåg
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [nsproxy] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024

2013-03-08 Thread Rakib Mullick
On Fri, Mar 8, 2013 at 10:01 PM, Eric W. Biederman
 wrote:
>
> When a new task is created one of two things needs to happen.
> A) A reference count needs to be added to the current nsproxy.
> B) A new nsproxy needs to be created.
>
> The way that code works today is far from a shiny example of totally
> clear code but it is not incorrect.
>
> By moving get_nsproxy down below the first return 0, you removed taking
> the reference count in the one case it is important.
>
> Arguably we should apply the patch below for clarity, and I just might
> queue it up for 3.10.
>
This one is much cleaner. One thing regarding this patch, can we
check the namespace related flags at copy_namespace() call time at
copy_process(), also get_nsproxy()? I think this will reduce some
extra function call overhead and as you've mentioned get_nsproxy() is
needed at every process creation.

Thanks,
Rakib
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: page_alloc: remove branch operation in free_pages_prepare()

2013-03-08 Thread Will Huck

Hi Hugh,
On 03/08/2013 10:01 AM, Hugh Dickins wrote:

On Fri, 8 Mar 2013, Joonsoo Kim wrote:

On Thu, Mar 07, 2013 at 10:54:15AM -0800, Hugh Dickins wrote:

On Thu, 7 Mar 2013, Joonsoo Kim wrote:


When we found that the flag has a bit of PAGE_FLAGS_CHECK_AT_PREP,
we reset the flag. If we always reset the flag, we can reduce one
branch operation. So remove it.

Cc: Hugh Dickins 
Signed-off-by: Joonsoo Kim 

I don't object to this patch.  But certainly I would have written it
that way in order not to dirty a cacheline unnecessarily.  It may be
obvious to you that the cacheline in question is almost always already
dirty, and the branch almost always more expensive.  But I'll leave that
to you, and to those who know more about these subtle costs than I do.

Yes. I already think about that. I thought that even if a cacheline is
not dirty at this time, we always touch the 'struct page' in
set_freepage_migratetype() a little later, so dirtying is not the problem.

I expect that a very high proportion of user pages have
PG_uptodate to be cleared here; and there's also the recently added


When will PG_uptodate be set?


page_nid_reset_last(), which will dirty the flags or a nearby field
when CONFIG_NUMA_BALANCING.  Those argue in favour of your patch.


But, now, I re-think this and decide to drop this patch.
The reason is that 'struct page' of 'compound pages' may not be dirty
at this time and will not be dirty at later time.

Actual compound pages would have PG_head or PG_tail or PG_compound
to be cleared there, I believe (check if I'm right on that).  The
questionable case is the ordinary order>0 case without __GFP_COMP
(and page_nid_reset_last() is applied to each subpage of those).


So this patch is a bad idea.

I'm not so sure.  I doubt your patch will make a giant improvement
in kernel performance!  But it might make a little - maybe you just
need to give some numbers from perf to justify it (but I'm easily
dazzled by numbers - don't expect me to judge the result).

Hugh


Are there any comments?

Thanks.


Hugh


diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8fcced7..778f2a9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -614,8 +614,7 @@ static inline int free_pages_check(struct page *page)
return 1;
}
page_nid_reset_last(page);
-   if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
-   page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+   page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
  }
  

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"d...@kvack.org"> em...@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"d...@kvack.org"> em...@kvack.org </a>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 07:36:34PM -0800, Linus Torvalds wrote:
 > Note that my tree does not have the pipe changes. I'm still not sure about
 > the cause, see the patch with a warn-on-once in it. I don't like just
 > adding the NULL pointer checks willy nilly without understanding why they
 > made...

Yeah, understood. I still have that patched.

 > > To be honest I've lost track of the whole collection.
 > > Let me repull your latest tree, and see what falls out.
 > > (I'll turn off CONFIG_USER_NS for now too until that gets fixed)
 > >
 > > I have a feeling there were some sysfs ones that may still be unfixed.

I was right..

[  425.836722] general protection fault:  [#1] PREEMPT SMP 
[  425.837529] Modules linked in: hidp l2tp_ppp l2tp_netlink l2tp_core cmtp 
kernelcapi bnep rfcomm scsi_transport_iscsi can_raw ipt_ULOG nfnetlink irda 
can_bcm ipx caif_socket appletalk x25 p8023 af_key caif psnap p8022 crc_ccitt 
phonet netrom rose rds llc2 nfc llc af_802154 can ax25 pppoe decnet af_rxrpc 
pppox ppp_generic slhc atm lockd sunrpc ip6t_REJECT nf_conntrack_ipv6 
nf_defrag_ipv6 xt_conntrack nf_conntrack ip6table_filter ip6_tables 
snd_hda_codec_realtek snd_hda_intel snd_hda_codec btusb bluetooth snd_pcm 
snd_page_alloc snd_timer snd usb_debug microcode rfkill pcspkr serio_raw 
soundcore edac_core vhost_net tun macvtap macvlan kvm_amd r8169 kvm mii radeon 
backlight drm_kms_helper ttm
[  425.846148] CPU 2 
[  425.846387] Pid: 15263, comm: trinity-child3 Not tainted 3.9.0-rc1+ #88 
Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H
[  425.847859] RIP: 0010:[]  [] 
sysfs_find_dirent+0x47/0xf0
[  425.848919] RSP: 0018:88011aab5d38  EFLAGS: 00010202
[  425.849581] RAX: 0ad55669 RBX: 6b6b6b6b6b6b6b6b RCX: 6b6b6b6b
[  425.850468] RDX:  RSI:  RDI: 
[  425.851356] RBP: 88011aab5d58 R08:  R09: 
[  425.852237] R10:  R11:  R12: 7640c1d4
[  425.853119] R13: 880105b41e48 R14:  R15: 88012711
[  425.854001] FS:  7f7cdb71d740() GS:88012b20() 
knlGS:
[  425.855000] CS:  0010 DS:  ES:  CR0: 80050033
[  425.855712] CR2: 0001 CR3: 000102df4000 CR4: 07e0
[  425.856599] DR0:  DR1:  DR2: 
[  425.857486] DR3:  DR6: 0ff0 DR7: 0400
[  425.858368] Process trinity-child3 (pid: 15263, threadinfo 88011aab4000, 
task 88012711)
[  425.859490] Stack:
[  425.859747]  880113228d58 880105b41e10 fffe 
88010887eee8
[  425.860718]  88011aab5d88 8123f76d 880105b41e10 
8801088804a0
[  425.861683]  88011aab5f28  88011aab5db8 
811c5b7d
[  425.862648] Call Trace:
[  425.862960]  [] sysfs_lookup+0x6d/0xe0
[  425.863623]  [] lookup_real+0x1d/0x60
[  425.864274]  [] __lookup_hash+0x38/0x50
[  425.864951]  [] lookup_hash+0x19/0x20
[  425.865607]  [] kern_path_create+0x95/0x170
[  425.866322]  [] ? getname_flags.part.33+0x86/0x150
[  425.868909]  [] user_path_create+0x4a/0x70
[  425.871401]  [] sys_linkat+0x88/0x230
[  425.873841]  [] ? trace_hardirqs_on_thunk+0x3a/0x3f
[  425.876433]  [] sys_link+0x1e/0x20
[  425.878839]  [] system_call_fastpath+0x16/0x1b
[  425.881364] Code: 00 48 8b 9f 88 00 00 00 f6 c4 0f 0f 95 c0 48 85 f6 0f 95 
c2 38 d0 75 79 4c 89 ee 4c 89 f7 e8 91 ef ff ff 41 89 c4 48 85 db 74 1d <8b> 4b 
28 41 39 cc 74 21 44 89 e0 29 c8 83 f8 00 7c 2c 74 45 48 


I'll try some more divide and conquer debugging in a while.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 15/20] kexec: fill note buffers by NT_VMCORE_PAD notes in page-size boundary

2013-03-08 Thread HATAYAMA Daisuke
From: Yanfei Zhang 
Subject: Re: [PATCH v2 15/20] kexec: fill note buffers by NT_VMCORE_PAD notes 
in page-size boundary
Date: Fri, 8 Mar 2013 21:02:50 +0800

> 2013/3/8 HATAYAMA Daisuke :
>> From: Zhang Yanfei 
>> Subject: Re: [PATCH v2 15/20] kexec: fill note buffers by NT_VMCORE_PAD 
>> notes in page-size boundary
>> Date: Thu, 7 Mar 2013 18:11:30 +0800
>>
>>> 于 2013年03月02日 16:37, HATAYAMA Daisuke 写道:
>>>> Fill both crash_notes and vmcoreinfo_note buffers by NT_VMCORE_PAD
>>>> note type to make them satisfy mmap()'s page-size boundary
>>>> requirement.
>>>>
>>>> So far, end of note segments has been marked by zero-filled elf
>>>> header. Instead, this patch writes NT_VMCORE_PAD note in the end of
>>>> note segments until the offset on page-size boundary.
>>>
>>>
>>> In the codes below, it seems that you assign name "VMCOREINFO" for
>>> note type NT_VMCORE_PAD, right? This is kind of weird, I think. This
>>> name has been used for NT_VMCORE_DEBUGINFO note already. Why not something
>>> like "VMCOREPAD" or "PAD"?
>>>
>>
>> It looks you are confusing or don't know name and type. The name is
>> namespace and in the namespace, there are multiple note types, each of
>> which has the corresponding data. In other words, data corresponding
>> to types differ if they belong to different name spaces even if
>> the integers of the types coincide.
> 
> Yes, I knew this. Just as the spec said " a program must recognize both the 
> name
> and the type to recognize a descriptor.". But I cannot understand what your 
> word
> "namespace" came from? I think you complicate simple things here.
> 
> Only with a type, we cannot recognize a descriptor, because "multiple
> interpretations of
> a single type value may exist", So we should combine the name and the type
> together. If both the name and type of two descriptors are the same,
> we could say we
> have two same descriptors. If one of them (type or name) are
> different, we say the
> two descriptors are different and the two notes have different data.
> 
> If I am wrong, please correct me.

??? I think you're saying here the same thing as my explanation above.

Although the term ''name space'' never occurs in ELF, it seems to me
standard to represent the same values as different ones by combining
additional elements as names to them.

Well, formally, it is represented as simply tuples or vector
space. For example, suppose sets S and S' and define a new set S x S' by

  S x S' := { (s, s') | s in S, s' in S' }

and equality on S x S' is defined as usual:

  (s1, s1') == (s2, s2') iff s1 == s2 and s1' == s2'.

In ELF, S is names and S' is types. There's no other formal meaning
there.

>>
>> The "VMCOREINFO" name represents information exported from
>> /proc/vmcore that is used in kdump framework. In this sense,
>> NT_VMCORE_PAD that is specific for /proc/vmcore and kdump framework,
>> should belong to the "VMCOREINFO" name.
> 
> I cannot understand the name explanation totally. Does the name really
> have this meaning? Is there any authentic document? I was always thinking we
> could feel free to name a name by ourselves!

Of course, it's optional for you to decide how to name notes within
the mechanism. But it's important to treat naming for ease of managing
note types. In addition to the above formal definition, it's important
to consider what name gives us. It's readability, telling us that note
types that belong to unique name are treated in common in the sense of
the name. This is apart from the formal definition above.

It's certainly possible to distinguish notes by giving names only and
not giving types. For example, imagine there are new 27 notes and they
have different names but have 0 as type.

name  type
"SOME_NOTE_A" 0
"SOME_NOTE_B" 0
...
"SOME_NOTE_Z" 0

Also, for example,

name        type
"SOME_NOTE" 0 => NT_SOME_NOTE_A
"SOME_NOTE" 1 => NT_SOME_NOTE_B
...
"SOME_NOTE" 26=> NT_SOME_NOTE_Z

For the former case, it *looks to me* that space of type is not used
effectively and it *looks to me* that space of name is not consumed
efficiently.

After all, it amounts to individual preference about naming. I cannot
say anything more.

Thanks.
HATAYAMA, Daisuke

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Eric W. Biederman
Dave Jones  writes:

> On Fri, Mar 08, 2013 at 07:16:09PM -0800, Linus Torvalds wrote:
>  > Goodie. Your bug reports gave me heartburn. But it sounds like we have an
>  > angle on all of the ones you've seen now?
>  > 
>  > Or have I forgotten about some case?
>  
> To be honest I've lost track of the whole collection.
> Let me repull your latest tree, and see what falls out.
> (I'll turn off CONFIG_USER_NS for now too until that gets fixed)

It was CONFIG_UTS_NS that tripped you.  

Since I can trigger this with /proc/self/ns/mnt/a you are going to be
able to compile this one out.

Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: lockdep trace from prepare_bprm_creds

2013-03-08 Thread Tejun Heo
Hello, Li.

On Sat, Mar 09, 2013 at 10:11:51AM +0800, Li Zefan wrote:
> On 2013/3/8 3:38, Tejun Heo wrote:
> > On Thu, Mar 07, 2013 at 08:12:42PM +0100, Oleg Nesterov wrote:
> >> Well yes, I agree. I think that perfomance-wise threadgroup_change_begin()
> >> in de_thread() is fine, and perhaps it is even more clean because we are
> >> going to do the thread-group change. The scope of cred_guard_mutex is huge,
> >> it doesn't look very nice in threadgroup_lock().
> >>
> >> But we should avoid the cgroup-specific hooks as much as possible, so I
> >> like your patch more.
> > 
> > I don't really mind how it's done but while my approach seems to limit
> > itself to cgroup proper, threadgroup locking is actually more invasive
> > by meddling with cred_mutex.  As you said, yours is the cleaner and
> > probably more permanent one here.
> > 
> 
> Agreed.
> 
> Now we need that patch to be resent with SOB and proper changelog.

Now that I think more about it, I think I want both patches.  It is
bothering that threadgroup lock is nested inside cgroup_lock.  It
always has.  I just couldn't do anything about that until recently.
Li, can you be persuaded into getting the lock reordering patch into a
useable shape?  :)

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mmap vs fs cache

2013-03-08 Thread Ric Mason

Hi Johannes,
On 03/09/2013 12:16 AM, Johannes Weiner wrote:

On Fri, Mar 08, 2013 at 07:00:55AM -0800, Howard Chu wrote:

Chris Friesen wrote:

On 03/08/2013 03:40 AM, Howard Chu wrote:


There is no way that a process that is accessing only 30GB of a mmap
should be able to fill up 32GB of RAM. There's nothing else running on
the machine, I've killed or suspended everything else in userland
besides a couple shells running top and vmstat. When I manually
drop_caches repeatedly, then eventually slapd RSS/SHR grows to 30GB and
the physical I/O stops.

Is it possible that the kernel is doing some sort of automatic
readahead, but it ends up reading pages corresponding to data that isn't
ever queried and so doesn't get mapped by the application?

Yes, that's what I was thinking. I added a
posix_madvise(..POSIX_MADV_RANDOM) but that had no effect on the
test.

First obvious conclusion - kswapd is being too aggressive. When free
memory hits the low watermark, the reclaim shrinks slapd down from
25GB to 18-19GB, while the page cache still contains ~7GB of
unmapped pages. Ideally I'd like a tuning knob so I can say to keep
no more than 2GB of unmapped pages in the cache. (And the desired
effect of that would be to allow user processes to grow to 30GB
total, in this case.)

We should find out where the unmapped page cache is coming from if you
are only accessing mapped file cache and disabled readahead.

How do you arrive at this number of unmapped page cache?

What could happen is that previously used and activated pages do not
get evicted anymore since there is a constant supply of younger


If a user process exits, will its file pages and anonymous pages be freed
immediately or go through page reclaim?



reclaimable cache that is actually thrashing.  Whenever you drop the
caches, you get rid of those stale active pages and allow the
previously thrashing cache to get activated.  However, that would
require that there is already a significant amount of active file


Why do you emphasize a *significant* amount of active file pages?


pages before your workload starts (check the nr_active_file number in
/proc/vmstat before launching slapd, try sync; echo 3 >drop_caches
before launching to eliminate this option) OR that the set of pages
accessed during your workload changes and the combined set of pages
accessed by your workload is bigger than available memory -- which you
claimed would not happen because you only access the 30GB file area on
that system.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"d...@kvack.org"> em...@kvack.org </a>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Eric W. Biederman
Dave Jones  writes:

> On Fri, Mar 08, 2013 at 09:56:31PM -0500, Dave Jones wrote:
>  > On Fri, Mar 08, 2013 at 09:26:23PM -0500, Dave Jones wrote:
>  >  > On Fri, Mar 08, 2013 at 06:08:52PM -0800, Linus Torvalds wrote:
>  >  >  > On Fri, Mar 8, 2013 at 6:03 PM, Dave Jones  wrote:
>  >  >  > >
>  >  >  > > existing pathname + 'a' = fine.
>  >  >  > >
>  >  >  > > existing pathname + '/' + 'a' = boom.
>  >  >  > 
>  >  >  > Good.
>  >  >  > 
>  >  >  > > Looks like if I do this..
>  >  >  > >
>  >  >  > >if (isdigit(newpath[len]) != 0) {
>  >  >  > > newpath[len] = '/';
>  >  >  > >newpath[len+1] = 'A';
>  >  >  > >newpath[len+2] = 0;
>  >  >  > >
>  >  >  > > no bug.
>  >  >  > 
>  >  >  > Well, but that will never trigger. newpath[len] will always be NUL, so
>  >  >  > you just disabled things entirely. Use "len-1".
>  >  >  > 
>  >  >  > So I don't think that did what you meant it to do.
>  >  > 
>  >  > Fixed that up, and even double checked my sanity by printing stuff out.
>  >  > 
>  >  > Confirmed that it's something that doesn't end in a number.
>  >  
>  > I've got a hunch that it's /proc/$$/ns/uts.
>  > 
>  > After 3-4 runs, that's the only common file in the last few that got mangled.
>  > 
>  > I'll do some more tests, but this might be the one.
>  
> confirmed.  A simple 
>
> mkdir /proc/self/ns/uts/A
>
> will trigger it.

This is a magic symlink similar to the other magic symlinks in proc so I
don't know if the test is wrong or my code is doing something too clever.

But I can reproduce it so I will look at it and see if I can make sense
of what is going on.

Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 07:16:09PM -0800, Linus Torvalds wrote:
 > Goodie. Your bug reports gave me heartburn. But it sounds like we have an
 > angle on all of the ones you've seen now?
 > 
 > Or have I forgotten about some case?
 
To be honest I've lost track of the whole collection.
Let me repull your latest tree, and see what falls out.
(I'll turn off CONFIG_USER_NS for now too until that gets fixed)

I have a feeling there were some sysfs ones that may still be unfixed.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] PCI: fix system hang issue of Marvell SATA host controller

2013-03-08 Thread Myron Stowe
On Thu, Mar 7, 2013 at 11:51 PM, Xiangliang Yu  wrote:
> Hi, Bjorn
>
>> >> > Fix system hang issue: if first accessed resource file of BAR0 ~
>> >> > BAR4, system will hang after executing lspci command
>> >>
>> >> This needs more explanation.  We've already read the BARs by the time
>> >> header quirks are run, so apparently it's not just the mere act of
>> >> accessing a BAR that causes a hang.
>> >>
>> >> We need to know exactly what's going on here.  For example, do BARs
>> >> 0-4 exist?  Does the device decode accesses to the regions described
>> >> by the BARs?  The PCI core has to know what resources the device uses,
>> >> so if the device decodes accesses, we can't just throw away the
>> >> start/end information.
>> > The BARs 0-4 is exist and the PCI device is enable IO space, but user 
>> > access
>> the regions file by udevadm command with info parameter, the system will 
>> hang.
>> > Like this: udevadmin info --attribut-walk
>> --path=/sys/device/pci-device/000:*.
>> > Because the device is just AHCI host controller, don't need the BAR0 ~ 4 
>> > region
>> file.
>> > Is my explanation ok for the patch?
>>
>> No, I still don't know what causes the hang; I only know that udevadm
>> can trigger it.  I don't want to just paper over the problem until we
>> know what the root cause is.
>>
>> Does "lspci -H1 -vv" also cause a hang?  What about "setpci -s
>> BASE_ADDRESS_0"?  "setpci -H1 -s BASE_ADDRESS_0"?
> The commands are ok because the commands can't find the device after 
> accessing IO port.

Xiangliang:

Sorry but I didn't understand your response above, could you elaborate
a little more?


Are the first five BARs of the suspect device all mapping to I/O port
space - i.e. similar to something like this (a capture and inclusion
of an 'lspci' of the suspect device would be nice to see):
  00:1f.2 SATA controller:
Region 0: I/O ports at 1860 [size=8]
Region 1: I/O ports at 1814 [size=4]
Region 2: I/O ports at 1818 [size=8]
Region 3: I/O ports at 1810 [size=4]
Region 4: I/O ports at 1840 [size=32]
Region 5: Memory at f2827000 (32-bit, non-prefetchable) [size=2K]

You have done a good job isolating the issue so far.  As Bjorn noted;
it's looking as if the problem is with accessing the I/O port space
mapped by the suspect device's BAR(s), not with accessing the BAR(s)
in the device's configuration space.

As you responded positively to earlier, as proposed the suspect device
will still actively be decoding accesses to the regions described by
the BARs.  Because the device is actively decoding the PCI core can't
just throw away the BAR's corresponding resource regions, as the patch
is currently doing, due to the possibility of another device being
added at a later time.

If a subsequent device were added later, the core may need to try and
allocate resources for it and, in the worst case scenario, the core
could end up allocating resources that conflict with this suspect
device as a consequence of the suspect device's original resource
allocations having been silently thrown away.  The result would be
both devices believing they each exclusively own the same set (or
subset) of I/O port mappings and thus both actively decoding accesses
to such mappings.  A situation that would obviously be disastrous.

There is still something going on here that we still do not
understand.  Could you please capture the following information to
help further isolate the issue:
  A 'dmesg' log from the system which was booted using both the
"debug" and "ignore_loglevel" boot parameters, a 'lspci -xxx -s'
capture, and a 'lspci -vv' capture.

Thanks,
 Myron

> The root cause is that accessing of IO port will make the chip go bad. So, 
> the point of the patch is don't export capability of the IO accessing.
>
>>
>> >>
>> >> > ---
>> >> >  drivers/pci/quirks.c |   15 +++
>> >> >  1 files changed, 15 insertions(+), 0 deletions(-)
>> >> >
>> >> > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
>> >> > index 0369fb6..d49f8dc 100644
>> >> > --- a/drivers/pci/quirks.c
>> >> > +++ b/drivers/pci/quirks.c
>> >> > @@ -44,6 +44,21 @@ static void quirk_mmio_always_on(struct pci_dev *dev)
>> >> >  DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_ANY_ID, PCI_ANY_ID,
>> >> > PCI_CLASS_BRIDGE_HOST, 8,
>> >> quirk_mmio_always_on);
>> >> >
>> >> > +/* The BAR0 ~ BAR4 of Marvell 9125 device can't be accessed
>> >> > +*  by IO resource file, and need to skip the files
>> >> > +*/
>> >> > +static void quirk_marvell_mask_bar(struct pci_dev *dev)
>> >> > +{
>> >> > +   int i;
>> >> > +
>> >> > +   for (i = 0; i < 5; i++)
>> >> > +   if (dev->resource[i].start)
>> >> > +   dev->resource[i].start =
>> >> > +   dev->resource[i].end = 0;
>> >> > +}
>> >> > +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9125,
>> >> > +   quirk_marvell_mask_bar);
>> >> > +
>> >> >  /* The Mellanox Tavor devic

Re: [PATCH V2] cpufreq: ARM big LITTLE: Add generic cpufreq driver and its DT glue

2013-03-08 Thread Viresh Kumar
On 8 March 2013 14:11, Guennadi Liakhovetski  wrote:
> Also in your driver you're doing
>
> cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
> ...
> cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
>
> So, theoretically you could install such notifiers to adjust CPU voltages
> (using regulators too). But adding regulator calls directly to the driver
> would make it consistent with cpufreq-cpu0.c.

Yes

>  so, if this doesn't violate
> any concepts, I think, it would be good to add those when suitable systems
> appear.

That's what i thought :)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] intel_idle: changing the continue to break in intel_idle_cpu_init()

2013-03-08 Thread Daniel Lezcano
On 03/08/2013 04:03 PM, Chuansheng Liu wrote:
> 
> According to commit e022e7eb9, the .enter == NULL is the last one in
> state_tables[].
> 
> So just like intel_idle_cpuidle_driver_init(), in case of .enter == NULL,
> breaking the for(;;) loop directly.
> 
> Signed-off-by: liu chuansheng 
> ---

Sounds good.

Acked-by: Daniel Lezcano 

>  drivers/idle/intel_idle.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 5d66750..17c9cf9 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -610,7 +610,7 @@ static int intel_idle_cpu_init(int cpu)
>   int num_substates, mwait_hint, mwait_cstate, mwait_substate;
>  
>   if (cpuidle_state_table[cstate].enter == NULL)
> - continue;
> + break;
>  
>   if (cstate + 1 > max_cstate) {
>   printk(PREFIX "max_cstate %d reached\n", max_cstate);
> 


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] intel_idle: set the state_tables array as __initdata to save mem

2013-03-08 Thread Daniel Lezcano
On 03/08/2013 04:06 PM, Chuansheng Liu wrote:
> 
> Currently, in intel_idle.c, there are 5 state_tables array, every
> array size is sizeof(struct cpuidle_state) * CPUIDLE_STATE_MAX.
> 
> As in intel_idle_cpuidle_driver_init(), we have copied the data into
> intel_idle_driver->state[], so do not need to keep state_tables[]
> there any more after system init.
> 
> It will save about 3~4k memory, also benefits mobile devices.
> Here changing them as __initdata, also removing global var
> cpuidle_state_table pointer.
> 
> Signed-off-by: liu chuansheng 
> ---

Acked-by: Daniel Lezcano 

-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3] intel_idle: Removing the redundant calculating for dev->state_count

2013-03-08 Thread Daniel Lezcano
On 03/08/2013 04:04 PM, Chuansheng Liu wrote:
> 
> In function intel_idle_cpu_init() and intel_idle_cpuidle_driver_init(),
> they are having the same for(;;) loop.
> 
> Here in intel_idle_cpu_init(), the dev->state_count can be assigned by
> drv->state_count directly.
> 
> Signed-off-by: liu chuansheng 
> ---
>  drivers/idle/intel_idle.c |   30 ++
>  1 files changed, 2 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index 17c9cf9..503b401 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -599,38 +599,12 @@ static int intel_idle_cpuidle_driver_init(void)
>   */
>  static int intel_idle_cpu_init(int cpu)
>  {
> - int cstate;
>   struct cpuidle_device *dev;
> + struct cpuidle_driver *drv = &intel_idle_driver;
>  
>   dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
>  
> - dev->state_count = 1;
> -
> - for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
> - int num_substates, mwait_hint, mwait_cstate, mwait_substate;
> -
> - if (cpuidle_state_table[cstate].enter == NULL)
> - break;
> -
> - if (cstate + 1 > max_cstate) {
> - printk(PREFIX "max_cstate %d reached\n", max_cstate);
> - break;
> - }
> -
> - mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
> - mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint);
> - mwait_substate = MWAIT_HINT2SUBSTATE(mwait_hint);
> -
> - /* does the state exist in CPUID.MWAIT? */
> - num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4))
> - & MWAIT_SUBSTATE_MASK;
> -
> - /* if sub-state in table is not enumerated by CPUID */
> - if ((mwait_substate + 1) > num_substates)
> - continue;
> -
> - dev->state_count += 1;
> - }
> + dev->state_count = drv->state_count;

The cpuidle_register_device function already does this initialization.

Probably you can get rid of this initialization and certainly factor out
a bit of the code in this case.


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


sunrpc ODEBUG assertion.

2013-03-08 Thread Dave Jones
restarted my nfs server, and mounted it from a Mac, and got this..


[47433.585266] WARNING: at lib/debugobjects.c:260 debug_print_object+0x8c/0xb0()
[47433.585269] Hardware name: 
[47433.585273] ODEBUG: assert_init not available (active state 0) object type: 
timer_list hint: stub_timer+0x0/0x20
[47433.585275] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 
xt_conntrack nf_conntrack ip6table_filter ip6_tables coretemp microcode pcspkr 
snd_emu10k1 snd_hwdep snd_util_mem snd_ac97_codec ac97_bus snd_rawmidi snd_seq 
snd_seq_device snd_pcm snd_page_alloc snd_timer snd e1000e soundcore ptp 
pps_core vhost_net tun macvtap macvlan kvm_intel kvm nfsd binfmt_misc 
auth_rpcgss nfs_acl lockd sunrpc btrfs libcrc32c xor lzo_compress zlib_deflate 
raid6_pq firewire_ohci firewire_core sata_sil crc_itu_t ata_piix radeon 
i2c_algo_bit hwmon drm_kms_helper ttm drm i2c_core floppy
[47433.585324] Pid: 611, comm: nfsd Not tainted 3.9.0-rc1+ #80
[47433.585326] Call Trace:
[47433.585330]  [] warn_slowpath_common+0x75/0xa0
[47433.585334]  [] warn_slowpath_fmt+0x46/0x50
[47433.585337]  [] ? do_init_timer+0x66/0x70
[47433.585340]  [] debug_print_object+0x8c/0xb0
[47433.585343]  [] ? timer_debug_hint+0x10/0x10
[47433.585346]  [] debug_object_assert_init+0xe3/0x120
[47433.585349]  [] del_timer+0x2b/0x80
[47433.585353]  [] try_to_grab_pending+0xd9/0x1a0
[47433.585356]  [] __cancel_work_timer+0x27/0xf0
[47433.585359]  [] cancel_delayed_work_sync+0x13/0x20
[47433.585373]  [] xs_destroy+0x27/0x80 [sunrpc]
[47433.585381]  [] xprt_destroy+0x78/0xa0 [sunrpc]
[47433.585390]  [] xprt_put+0x21/0x30 [sunrpc]
[47433.585398]  [] rpc_free_client+0x16b/0x320 [sunrpc]
[47433.585406]  [] ? rpc_free_client+0x33/0x320 [sunrpc]
[47433.585414]  [] rpc_release_client+0x6e/0xb0 [sunrpc]
[47433.585423]  [] rpc_shutdown_client+0xe5/0x170 [sunrpc]
[47433.585427]  [] ? get_parent_ip+0x11/0x50
[47433.585431]  [] ? sub_preempt_count+0x79/0xd0
[47433.585442]  [] rpcb_put_local+0x141/0x250 [sunrpc]
[47433.585452]  [] ? rpcb_put_local+0x5/0x250 [sunrpc]
[47433.585461]  [] svc_rpcb_cleanup+0x1e/0x30 [sunrpc]
[47433.585471]  [] nfsd_last_thread+0x1e4/0x210 [nfsd]
[47433.585477]  [] ? nfsd_last_thread+0x5/0x210 [nfsd]
[47433.585487]  [] svc_shutdown_net+0x34/0x40 [sunrpc]
[47433.585493]  [] nfsd_destroy+0x170/0x210 [nfsd]
[47433.585500]  [] ? nfsd_destroy+0x5/0x210 [nfsd]
[47433.585509]  [] ? svc_exit_thread+0x99/0xb0 [sunrpc]
[47433.585516]  [] nfsd+0x136/0x160 [nfsd]
[47433.585522]  [] ? nfsd_destroy+0x210/0x210 [nfsd]
[47433.585526]  [] kthread+0xed/0x100
[47433.585529]  [] ? put_lock_stats.isra.25+0xe/0x40
[47433.585533]  [] ? kthread_create_on_node+0x160/0x160
[47433.585536]  [] ret_from_fork+0x7c/0xb0
[47433.585539]  [] ? kthread_create_on_node+0x160/0x160
[47433.585542] ---[ end trace 0ffae049d68a07e1 ]---
[47433.585687] nfsd: last server has exited, flushing export cache
[47433.904483] NFSD: starting 90-second grace period (net 81cc9380)
[47461.093051] ------------[ cut here ]------------
[47461.093061] WARNING: at lib/debugobjects.c:260 debug_print_object+0x8c/0xb0()
[47461.093063] Hardware name: 
[47461.093067] ODEBUG: assert_init not available (active state 0) object type: 
timer_list hint: stub_timer+0x0/0x20
[47461.093069] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 
xt_conntrack nf_conntrack ip6table_filter ip6_tables coretemp microcode pcspkr 
snd_emu10k1 snd_hwdep snd_util_mem snd_ac97_codec ac97_bus snd_rawmidi snd_seq 
snd_seq_device snd_pcm snd_page_alloc snd_timer snd e1000e soundcore ptp 
pps_core vhost_net tun macvtap macvlan kvm_intel kvm nfsd binfmt_misc 
auth_rpcgss nfs_acl lockd sunrpc btrfs libcrc32c xor lzo_compress zlib_deflate 
raid6_pq firewire_ohci firewire_core sata_sil crc_itu_t ata_piix radeon 
i2c_algo_bit hwmon drm_kms_helper ttm drm i2c_core floppy
[47461.093117] Pid: 9956, comm: nfsd Tainted: G        W    3.9.0-rc1+ #80
[47461.093119] Call Trace:
[47461.093124]  [] warn_slowpath_common+0x75/0xa0
[47461.093127]  [] warn_slowpath_fmt+0x46/0x50
[47461.093130]  [] ? do_init_timer+0x66/0x70
[47461.093133]  [] debug_print_object+0x8c/0xb0
[47461.093136]  [] ? timer_debug_hint+0x10/0x10
[47461.093139]  [] debug_object_assert_init+0xe3/0x120
[47461.093142]  [] del_timer+0x2b/0x80
[47461.093146]  [] try_to_grab_pending+0xd9/0x1a0
[47461.093149]  [] __cancel_work_timer+0x27/0xf0
[47461.093152]  [] cancel_delayed_work_sync+0x13/0x20
[47461.093166]  [] xs_destroy+0x27/0x80 [sunrpc]
[47461.093174]  [] xprt_destroy+0x78/0xa0 [sunrpc]
[47461.093183]  [] xprt_put+0x21/0x30 [sunrpc]
[47461.093191]  [] rpc_free_client+0x16b/0x320 [sunrpc]
[47461.093200]  [] ? rpc_free_client+0x33/0x320 [sunrpc]
[47461.093209]  [] rpc_release_client+0x6e/0xb0 [sunrpc]
[47461.093217]  [] rpc_shutdown_client+0xe5/0x170 [sunrpc]
[47461.093221]  [] ? get_parent_ip+0x11/0x50
[47461.093226]  [] ? sub_preempt_count+0x79/0xd0
[47461.093237]  [] rpcb_put_local+0x141/0x250 [sunrpc]
[47461.093246]  [] ? rpcb_put_

Re: [PATCH] usb/core/devio.c: Don't use GFP_KERNEL while we cannot reset a storage device

2013-03-08 Thread Alan Stern
On Fri, 8 Mar 2013, Oliver Neukum wrote:

> On Friday 08 March 2013 12:55:08 Alan Stern wrote:
> > On Sat, 9 Mar 2013, Alexey Khoroshilov wrote:
> > 
> > > As it was described by Oliver Neukum in commit acbe2fe
> > > "USB: Don't use GFP_KERNEL while we cannot reset a storage device":
> > > 
> > >   Memory allocations with GFP_KERNEL can cause IO to a storage device
> > >   which can fail resulting in a need to reset the device. Therefore
> > >   GFP_KERNEL cannot be safely used between usb_lock_device()
> > >   and usb_unlock_device(). Replace by GFP_NOIO.
> > > 
> > > The patch fixes the same issue in usb/core/devio.c.
> > > All the allocations fixed are under usb_lock_device() from 
> > > usbdev_do_ioctl().
> > > 
> > > Found by Linux Driver Verification project (linuxtesting.org).
> > 
> > I don't know if this is a good idea.  People can and do submit 
> > transfers requiring a lot of buffer space.  Switching to GFP_NOIO 
> > will make those allocations a lot more likely to fail.
> > 
> > Oliver, what do you think?
> 
> Ideally we'd split memory allocation and use, but it fixes a bug.
> Better allocation failure than deadlock.

In fact we wouldn't deadlock.  This is because 
usb_lock_device_for_reset() gives up if it can't obtain the device lock 
after one second of trying.  We'd just end up with a failure to reset, 
leading to an I/O failure.

Probably the mass-storage device would be taken off-line...  but there
wouldn't be a deadlock.  Under the circumstances, I'd say that the 
consequences of merging this patch would be worse than the consequences 
of keeping things as they are now.

Alan Stern

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 09:56:31PM -0500, Dave Jones wrote:
 > On Fri, Mar 08, 2013 at 09:26:23PM -0500, Dave Jones wrote:
 >  > On Fri, Mar 08, 2013 at 06:08:52PM -0800, Linus Torvalds wrote:
 >  >  > On Fri, Mar 8, 2013 at 6:03 PM, Dave Jones  wrote:
 >  >  > >
 >  >  > > existing pathname + 'a' = fine.
 >  >  > >
 >  >  > > existing pathname + '/' + 'a' = boom.
 >  >  > 
 >  >  > Good.
 >  >  > 
 >  >  > > Looks like if I do this..
 >  >  > >
 >  >  > >if (isdigit(newpath[len]) != 0) {
 >  >  > > newpath[len] = '/';
 >  >  > >newpath[len+1] = 'A';
 >  >  > >newpath[len+2] = 0;
 >  >  > >
 >  >  > > no bug.
 >  >  > 
 >  >  > Well, but that will never trigger. newpath[len] will always be NUL, so
 >  >  > you just disabled things entirely. Use "len-1".
 >  >  > 
 >  >  > So I don't think that did what you meant it to do.
 >  > 
 >  > Fixed that up, and even double checked my sanity by printing stuff out.
 >  > 
 >  > Confirmed that it's something that doesn't end in a number.
 >  
 > I've got a hunch that it's /proc/$$/ns/uts.
 > 
 > After 3-4 runs, that's the only common file in the last few that got mangled.
 > 
 > I'll do some more tests, but this might be the one.
 
confirmed.  A simple 

mkdir /proc/self/ns/uts/A

will trigger it.

Eric ?

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 09:26:23PM -0500, Dave Jones wrote:
 > On Fri, Mar 08, 2013 at 06:08:52PM -0800, Linus Torvalds wrote:
 >  > On Fri, Mar 8, 2013 at 6:03 PM, Dave Jones  wrote:
 >  > >
 >  > > existing pathname + 'a' = fine.
 >  > >
 >  > > existing pathname + '/' + 'a' = boom.
 >  > 
 >  > Good.
 >  > 
 >  > > Looks like if I do this..
 >  > >
 >  > >if (isdigit(newpath[len]) != 0) {
 >  > > newpath[len] = '/';
 >  > >newpath[len+1] = 'A';
 >  > >newpath[len+2] = 0;
 >  > >
 >  > > no bug.
 >  > 
 >  > Well, but that will never trigger. newpath[len] will always be NUL, so
 >  > you just disabled things entirely. Use "len-1".
 >  > 
 >  > So I don't think that did what you meant it to do.
 > 
 > Fixed that up, and even double checked my sanity by printing stuff out.
 > 
 > Confirmed that it's something that doesn't end in a number.
 
I've got a hunch that it's /proc/$$/ns/uts.

After 3-4 runs, that's the only common file in the last few that got mangled.

I'll do some more tests, but this might be the one.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mmap vs fs cache

2013-03-08 Thread Ric Mason

Hi Johannes,
On 03/08/2013 10:08 AM, Johannes Weiner wrote:

On Thu, Mar 07, 2013 at 04:43:12PM +0100, Jan Kara wrote:

   Added mm list to CC.

On Tue 05-03-13 09:57:34, Howard Chu wrote:

I'm testing our memory-mapped database code on a small VM. The
machine has 32GB of RAM and the size of the DB on disk is ~44GB. The
database library mmaps the entire file as a single region and starts
accessing it as a tree of B+trees. Running on an Ubuntu 3.5.0-23
kernel, XFS on a local disk.

If I start running read-only queries against the DB with a freshly
started server, I see that my process (OpenLDAP slapd) quickly grows
to an RSS of about 16GB in tandem with the FS cache. (I.e., "top"
shows 16GB cached, and slapd is 16GB.)
If I confine my queries to the first 20% of the data then it all
fits in RAM and queries are nice and fast.

If I extend the query range to cover more of the data, approaching
the size of physical RAM, I see something strange - the FS cache
keeps growing, but the slapd process size grows at a slower rate.
This is rather puzzling to me since the only thing triggering reads
is accesses through the mmap region. Eventually the FS cache grows
to basically all of the 32GB of RAM (+/- some text/data space...)
but the slapd process only reaches 25GB, at which point it actually
starts to shrink - apparently the FS cache is now stealing pages
from it. I find that a bit puzzling; if the pages are present in
memory, and the only reason they were paged in was to satisfy an
mmap reference, why aren't they simply assigned to the slapd
process?

The current behavior gets even more aggravating: I can run a test
that spans exactly 30GB of the data. One would expect that the slapd
process should simply grow to 30GB in size, and then remain static
for the remainder of the test. Instead, the server grows to 25GB,
the FS cache grows to 32GB, and starts stealing pages from the
server, shrinking it back down to 19GB or so.

If I do an "echo 1 > /proc/sys/vm/drop_caches" at the onset of this
condition, the FS cache shrinks back to 25GB, matching the slapd
process size.
This then frees up enough RAM for slapd to grow further. If I don't
do this, the test is constantly paging in data from disk. Even so,
the FS cache continues to grow faster than the slapd process size,
so the system may run out of free RAM again, and I have to drop
caches multiple times before slapd finally grows to the full 30GB.
Once it gets to that size the test runs entirely from RAM with zero
I/Os, but it doesn't get there without a lot of babysitting.

2 questions:
   why is there data in the FS cache that isn't owned by (the mmap
of) the process that caused it to be paged in in the first place?

The filesystem cache is shared among processes because the filesystem
is also shared among processes.  If another task were to access the
same file, we still should only have one copy of that data in memory.

It sounds to me like slapd is itself caching all the data it reads.
If that is true, shouldn't it really be using direct IO to prevent
this double buffering of filesystem data in memory?


When is using direct IO better? When is using the page cache better?




   is there a tunable knob to discourage the page cache from stealing
from the process?

Try reducing /proc/sys/vm/swappiness, which ranges from 0-100 and
defaults to 60.


Why reduce it? IIUC, swappiness determines how aggressively anonymous
pages are reclaimed; if the value is high, more anonymous pages will
be reclaimed.




--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:d...@kvack.org">em...@kvack.org</a>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 06:08:52PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 6:03 PM, Dave Jones  wrote:
 > >
 > > existing pathname + 'a' = fine.
 > >
 > > existing pathname + '/' + 'a' = boom.
 > 
 > Good.
 > 
 > > Looks like if I do this..
 > >
 > >if (isdigit(newpath[len]) != 0) {
 > > newpath[len] = '/';
 > >newpath[len+1] = 'A';
 > >newpath[len+2] = 0;
 > >
 > > no bug.
 > 
 > Well, but that will never trigger. newpath[len] will always be NUL, so
 > you just disabled things entirely. Use "len-1".
 > 
 > So I don't think that did what you meant it to do.

Fixed that up, and even double checked my sanity by printing stuff out.

Confirmed that it's something that doesn't end in a number.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [3.9-rc1] irq 16: nobody cared (was [3.9-rc1] very poor interrupt responses)

2013-03-08 Thread Alan Stern
On Fri, 8 Mar 2013, Peter Hurley wrote:

> [ +linux-usb ]
> 
> On Fri, 2013-03-08 at 14:12 -0500, Shawn Starr wrote:
> > Hello folks,
> > 
> > I am noticing since rc0 and now rc1, very poor interrupt handling. Keyboard 
> > response, mouse movements, display refreshing etc. General input/display 
> > sluggishness. Did something break IRQ handling somewhere? I need to 
> > validate if this happens with X not running also if it is i915 related 
> > somehow. The behavior is noticed in a console login, however.
> > 
> > Device: Lenovo W500 laptop
> 
> Hi Shawn,
> 
> Unhandled interrupts is the problem.
> 
> Is the device below being id'd properly?
> If you remove this device, does the problem go away?

Does either of the kernels in question have commit 0f815a0a700b (USB:
UHCI: fix IRQ race during initialization)?  That commit was added to
fix precisely this sort of thing.

Alan Stern

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[3.9-rc1] Locking dependency problem

2013-03-08 Thread Larry Finger
In kernel 3.9-rc1, I get the following lockdep warning. This kernel is from the 
wireless-testing tree, but I have seen the same message from the mainline kernel.


[ 4199.401157]
[ 4199.401159] ==
[ 4199.401160] [ INFO: possible circular locking dependency detected ]
[ 4199.401163] 3.9.0-rc1-wl+ #124 Tainted: GW
[ 4199.401164] ---
[ 4199.401167] kworker/0:3/2382 is trying to acquire lock:
[ 4199.401177]  (&fb_info->lock){+.+.+.}, at: [] 
lock_fb_info+0x21/0x60

[ 4199.401177]
[ 4199.401177] but task is already holding lock:
[ 4199.401183]  (console_lock){+.+.+.}, at: [] 
console_callback+0xe/0x130

[ 4199.401184]
[ 4199.401184] which lock already depends on the new lock.
[ 4199.401184]
[ 4199.401185]
[ 4199.401185] the existing dependency chain (in reverse order) is:
[ 4199.401187]
[ 4199.401187] -> #1 (console_lock){+.+.+.}:
[ 4199.401193][] lock_acquire+0xa8/0x1f0
[ 4199.401197][] console_lock+0x77/0x80
[ 4199.401200][] register_framebuffer+0x1b2/0x300
[ 4199.401205][] vesafb_probe+0x6ec/0x76c
[ 4199.401208][] platform_drv_probe+0x3e/0x70
[ 4199.401211][] driver_probe_device+0x75/0x230
[ 4199.401213][] __driver_attach+0xa3/0xb0
[ 4199.401216][] bus_for_each_dev+0x55/0x90
[ 4199.401218][] driver_attach+0x19/0x20
[ 4199.401220][] bus_add_driver+0x109/0x270
[ 4199.401223][] driver_register+0x72/0x170
[ 4199.401226][] platform_driver_register+0x41/0x50
[ 4199.401228][] platform_driver_probe+0x16/0xa0
[ 4199.401231][] vesafb_init+0x215/0x258
[ 4199.401235][] do_one_initcall+0x122/0x180
[ 4199.401239][] kernel_init_freeable+0x103/0x192
[ 4199.401242][] kernel_init+0x9/0xf0
[ 4199.401247][] ret_from_fork+0x7c/0xb0
[ 4199.401250]
[ 4199.401250] -> #0 (&fb_info->lock){+.+.+.}:
[ 4199.401252][] __lock_acquire+0x1479/0x1c70
[ 4199.401255][] lock_acquire+0xa8/0x1f0
[ 4199.401257][] mutex_lock_nested+0x69/0x370
[ 4199.401259][] lock_fb_info+0x21/0x60
[ 4199.401263][] fbcon_blank+0x29b/0x2e0
[ 4199.401266][] do_blank_screen+0x1d6/0x280
[ 4199.401269][] console_callback+0x5f/0x130
[ 4199.401273][] process_one_work+0x1f1/0x660
[ 4199.401275][] worker_thread+0x110/0x380
[ 4199.401279][] kthread+0xd6/0xe0
[ 4199.401282][] ret_from_fork+0x7c/0xb0
[ 4199.401282]
[ 4199.401282] other info that might help us debug this:
[ 4199.401282]
[ 4199.401283]  Possible unsafe locking scenario:
[ 4199.401283]
[ 4199.401284]CPU0CPU1
[ 4199.401284]
[ 4199.401286]   lock(console_lock);
[ 4199.401287]lock(&fb_info->lock);
[ 4199.401288]lock(console_lock);
[ 4199.401290]   lock(&fb_info->lock);
[ 4199.401290]
[ 4199.401290]  *** DEADLOCK ***
[ 4199.401290]
[ 4199.401292] 3 locks held by kworker/0:3/2382:
[ 4199.401297]  #0:  (events){.+.+.+}, at: [] 
process_one_work+0x185/0x660
[ 4199.401301]  #1:  (console_work){+.+...}, at: [] 
process_one_work+0x185/0x660
[ 4199.401305]  #2:  (console_lock){+.+.+.}, at: [] 
console_callback+0xe/0x130

[ 4199.401306]
[ 4199.401306] stack backtrace:
[ 4199.401308] Pid: 2382, comm: kworker/0:3 Tainted: GW3.9.0-rc1-wl+ 
#124

[ 4199.401309] Call Trace:
[ 4199.401312]  [] print_circular_bug+0x28e/0x29f
[ 4199.401315]  [] ? retint_restore_args+0x13/0x13
[ 4199.401318]  [] __lock_acquire+0x1479/0x1c70
[ 4199.401322]  [] ? bitfill_aligned+0x8b/0x140
[ 4199.401324]  [] lock_acquire+0xa8/0x1f0
[ 4199.401326]  [] ? lock_fb_info+0x21/0x60
[ 4199.401329]  [] mutex_lock_nested+0x69/0x370
[ 4199.401331]  [] ? lock_fb_info+0x21/0x60
[ 4199.401333]  [] ? bit_clear+0xcc/0x100
[ 4199.401336]  [] ? fbcon_clear+0x1be/0x1f0
[ 4199.401338]  [] lock_fb_info+0x21/0x60
[ 4199.401341]  [] fbcon_blank+0x29b/0x2e0
[ 4199.401343]  [] ? _raw_spin_unlock_irqrestore+0x3a/0x80
[ 4199.401346]  [] ? trace_hardirqs_on_caller+0x10d/0x1a0
[ 4199.401349]  [] ? trace_hardirqs_on+0xd/0x10
[ 4199.401354]  [] ? try_to_del_timer_sync+0x4a/0x60
[ 4199.401356]  [] ? del_timer_sync+0xba/0xf0
[ 4199.401359]  [] ? try_to_del_timer_sync+0x60/0x60
[ 4199.401362]  [] do_blank_screen+0x1d6/0x280
[ 4199.401364]  [] console_callback+0x5f/0x130
[ 4199.401367]  [] process_one_work+0x1f1/0x660
[ 4199.401370]  [] ? process_one_work+0x185/0x660
[ 4199.401372]  [] worker_thread+0x110/0x380
[ 4199.401375]  [] ? trace_hardirqs_on+0xd/0x10
[ 4199.401377]  [] ? rescuer_thread+0x250/0x250
[ 4199.401379]  [] kthread+0xd6/0xe0
[ 4199.401382]  [] ? _raw_spin_unlock_irq+0x2b/0x50
[ 4199.401385]  [] ? __init_kthread_worker+0x70/0x70
[ 4199.401388]  [] ret_from_fork+0x7c/0xb0
[ 4199.401390]  [] ? __init_kthread_worker+0x70/0x70

This problem seems to be related to 
http://www.mentby.com/russell-king-

Re: lockdep trace from prepare_bprm_creds

2013-03-08 Thread Li Zefan
On 2013/3/8 3:38, Tejun Heo wrote:
> Hello,
> 
> On Thu, Mar 07, 2013 at 08:12:42PM +0100, Oleg Nesterov wrote:
>> Well yes, I agree. I think that performance-wise threadgroup_change_begin()
>> in de_thread() is fine, and perhaps it is even more clean because we are
>> going to do the thread-group change. The scope of cred_guard_mutex is huge,
>> it doesn't look very nice in threadgroup_lock().
>>
>> But we should avoid the cgroup-specific hooks as much as possible, so I
>> like your patch more.
> 
> I don't really mind how it's done but while my approach seems to limit
> itself to cgroup proper, threadgroup locking is actually more invasive
> by meddling with cred_mutex.  As you said, yours is the cleaner and
> probably more permanent one here.
> 

Agreed.

Now we need that patch to be resent with SOB and proper changelog.

>>> +   if (threadgroup && !thread_group_leader(tsk)) {
>>> +   /*
>>> +* a race with de_thread from another thread's exec() may
>>> +* strip us of our leadership, if this happens, there is no
>>> +* choice but to throw this task away and try again; this
>>> +* is "double-double-toil-and-trouble-check locking".
>>> +*/
>>> +   threadgroup_unlock(tsk);
>>> +   put_task_struct(tsk);
>>> +   goto retry_find_task;
>>> +   }
>>>
>>> +   ret = -ENODEV;
>>> +   if (cgroup_lock_live_group(cgrp)) {
>>> +   if (threadgroup)
>>> +   ret = cgroup_attach_proc(cgrp, tsk);
>>
>> Offtopic, but with or without this change I do not understand the
>> thread_group_leader/retry_find_task logic.
>>
>> Why do we actually need to restart? We do not really care if it is leader
>> or not, we only need to ensure we can safely use while_each_thread() to
>> find all !PF_EXITING threads.
> 
> If my memory serves me right (which BTW often fails), it's cgroup API
> thing.  cgroup wants to guarantee to the controllers that if multiple
> tasks are migrated together, they always constitute a threadgroup and
> the first one is the leader.  ISTR a controller callback which depends
> on the first one being the leader.
> 

It did serve you right this time. :)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Inactive memory keep growing and how to release it?

2013-03-08 Thread Will Huck

Cc experts. Hugh, Johannes,

On 03/04/2013 08:21 PM, Lenky Gao wrote:

2013/3/4 Zlatko Calusic :

The drop_caches mechanism doesn't free dirty page cache pages. And your bash
script is creating a lot of dirty pages. Run it like this and see if it
helps your case:

sync; echo 3 > /proc/sys/vm/drop_caches

Thanks for your advice.

The inactive memory still cannot be reclaimed after I execute the sync command:

# cat /proc/meminfo | grep Inactive\(file\);
Inactive(file):   882824 kB
# sync;
# echo 3 > /proc/sys/vm/drop_caches
# cat /proc/meminfo | grep Inactive\(file\);
Inactive(file):   777664 kB

I find that these pages become orphaned in this function, but I do not understand why:

/*
  * If truncate cannot remove the fs-private metadata from the page, the page
  * becomes orphaned.  It will be left on the LRU and may even be mapped into
  * user pagetables if we're racing with filemap_fault().
  *
  * We need to bale out if page->mapping is no longer equal to the original
  * mapping.  This happens a) when the VM reclaimed the page while we waited on
  * its lock, b) when a concurrent invalidate_mapping_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
  */
static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
...

My file system type is ext3, mounted with the option data=journal, and
the problem is easy to reproduce.




--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 6:03 PM, Dave Jones  wrote:
>
> existing pathname + 'a' = fine.
>
> existing pathname + '/' + 'a' = boom.

Good.

> Looks like if I do this..
>
>if (isdigit(newpath[len]) != 0) {
> newpath[len] = '/';
>newpath[len+1] = 'A';
>newpath[len+2] = 0;
>
> no bug.

Well, but that will never trigger. newpath[len] will always be NUL, so
you just disabled things entirely. Use "len-1".

So I don't think that did what you meant it to do.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 05:18:29PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 4:36 PM, Dave Jones  wrote:
 > >
 > > Ok, it's definitely the 'append something on the end of a valid pathname'
 > > changeset. 'something' can be anything it seems.
 > 
 > Ok. so maybe the way to "bisect" this is to play with that.
 > 
 > For example, does it happen even if the "something" does not have a
 > slash in it? IOW, you just append, say, a single 'a' character to a
 > name that doesn't already end in a slash?

existing pathname + 'a' = fine.

existing pathname + '/' + 'a' = boom.

 > And if it still does happen with that, perhaps you could have some
 > logic that only appends the 'a' to names that end with numbers. Does
 > it stop happening?
 
Looks like if I do this..

   if (isdigit(newpath[len]) != 0) {
newpath[len] = '/';
   newpath[len+1] = 'A';
   newpath[len+2] = 0;

no bug.

If I change that to == 0, I get the bug.

 > The machine keeps running, right? So you can try this out without
 > rebooting, just changing when you append the character?

Sometimes it gets wedged somewhere, so not always.
Quick to reboot though, so no biggie.

Now trying to lower the frequency at which it does the mangling
to pinpoint the affected file(s)

I need to run at least 8 child processes (-C8) to get this to 
happen (machine has 4 cores). Unfortunately that means the logging
gets a bit spewy if they're all mangling at the same time.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


screen failing to display after resume from hibernate

2013-03-08 Thread kernel kernel
Machine resumes and fails to power on the screen.

Trace:

Mar  7 23:35:09 lenovo kernel: [33562.808041] [drm:intel_lvds_disable]
*ERROR* timed out waiting for panel to power off
Mar  7 23:35:09 lenovo kernel: [33562.808052] [ cut here
]
Mar  7 23:35:09 lenovo kernel: [33562.808087] WARNING: at
/home/abuild/rpmbuild/BUILD/kernel-desktop-3.4.28/linux-3.4/drivers/gpu/drm/i915/int
el_display.c:981 intel_crtc_disable+0x2e/0x80 [i915]()
Mar  7 23:35:09 lenovo kernel: [33562.808090] Hardware name: 6474B84
Mar  7 23:35:09 lenovo kernel: [33562.808092] plane B assertion
failure (expected off, current on)
Mar  7 23:35:09 lenovo kernel: [33562.808094] Modules linked in: fuse
af_packet rfcomm bnep cpufreq_conservative cpufreq_userspace
cpufreq_pow
ersave sha256_generic cbc dm_crypt dm_mod sg snd_hda_codec_conexant
thinkpad_acpi snd_hda_intel snd_hda_codec snd_hwdep snd_seq
snd_pcm_oss pc
mcia arc4 btusb acpi_cpufreq bluetooth sr_mod cdrom yenta_socket
pcmcia_rsrc wmi tpm_tis mperf kvm_intel snd_seq_device snd_pcm
snd_mixer_oss
mei(C) pcmcia_core tpm coretemp iTCO_wdt iTCO_vendor_support i2c_i801
iwlwifi mac80211 cfg80211 snd_timer e1000e joydev pcspkr rfkill snd
snd_
page_alloc kvm ac battery ata_generic tpm_bios soundcore edd microcode
autofs4 thermal i915 processor drm_kms_helper drm i2c_algo_bit button
v
ideo thermal_sys scsi_dh_emc scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh
Mar  7 23:35:09 lenovo kernel: [33562.808138] Pid: 19431, comm:
kworker/u:2 Tainted: GWC   3.4.28-2.20-desktop #1
Mar  7 23:35:09 lenovo kernel: [33562.808140] Call Trace:
Mar  7 23:35:09 lenovo kernel: [33562.808159]  []
dump_trace+0x88/0x300
Mar  7 23:35:09 lenovo kernel: [33562.808169]  []
dump_stack+0x69/0x6f
Mar  7 23:35:09 lenovo kernel: [33562.808174]  []
warn_slowpath_common+0x79/0xc0
Mar  7 23:35:09 lenovo kernel: [33562.808178]  []
warn_slowpath_fmt+0x45/0x50
Mar  7 23:35:09 lenovo kernel: [33562.808188]  []
intel_crtc_disable+0x2e/0x80 [i915]
Mar  7 23:35:09 lenovo kernel: [33562.808223]  []
drm_helper_disable_unused_functions+0x105/0x160 [drm_kms_helper]
Mar  7 23:35:09 lenovo kernel: [33562.808230]  []
drm_helper_resume_force_mode+0x118/0x150 [drm_kms_helper]
Mar  7 23:35:09 lenovo kernel: [33562.808240]  []
i915_drm_thaw+0x115/0x170 [i915]
Mar  7 23:35:09 lenovo kernel: [33562.808248]  []
i915_resume+0x45/0x70 [i915]
Mar  7 23:35:09 lenovo kernel: [33562.808253]  []
dpm_run_callback+0x54/0xa0
Mar  7 23:35:09 lenovo kernel: [33562.808257]  []
device_resume+0x10a/0x240
Mar  7 23:35:09 lenovo kernel: [33562.808261]  []
async_resume+0x14/0x40
Mar  7 23:35:09 lenovo kernel: [33562.808265]  []
async_run_entry_fn+0x6e/0x160
Mar  7 23:35:09 lenovo kernel: [33562.808269]  []
process_one_work+0x123/0x450
Mar  7 23:35:09 lenovo kernel: [33562.808272]  []
worker_thread+0x12d/0x2e0
Mar  7 23:35:09 lenovo kernel: [33562.808275]  []
kthread+0x85/0x90
Mar  7 23:35:09 lenovo kernel: [33562.808279]  []
kernel_thread_helper+0x4/0x10
Mar  7 23:35:09 lenovo kernel: [33562.808282] ---[ end trace
9e35440e65239f66 ]---


--
Ken O'Brien
PhD Researcher
Simulation Science and Extreme Events Cluster,
3B1, UCD CASL,
8 Belfield Business Park,
Dublin 4,
Ireland.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Re: [PATCH] Kprobes blacklist: Conditionally add x86-specific symbols

2013-03-08 Thread Masami Hiramatsu
(2013/03/08 22:15), oskar.and...@sonymobile.com wrote:
> On 05:23 Fri 08 Mar , Masami Hiramatsu wrote:
>> (2013/03/07 19:44), oskar.and...@sonymobile.com wrote:
>>> From: Bjorn Davidsson 
>>>
>>> The kprobes blacklist contains x86-specific symbols.
>>> Looking for these in kallsyms takes unnecessary time
>>> during startup on non-X86 platform.
>>> Added #ifdef CONFIG_X86 around them.
>>
>> Right. however, it might be better break that into
>> common and arch-specific lists, because there may be
>> other arch-specific non-probe-able functions on each
>> architecture...
> 
> Ok. You mean adding, for instance, a kprobe_blacklist_arch[] in arch/x86
> somewhere or did you have something else in mind? I guess we preferably want
> to get rid of the #ifdef.

Yes, we can have symbol tables (const char *arch_kprobes_blacksyms[],
common_kprobes_blacksyms[]) to list it up, and when initializing
the blacklist table, we can check whether kprobes_blacklist == NULL
and initialize it.

Thank you,

-- 
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu...@hitachi.com


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: mmap vs fs cache

2013-03-08 Thread Phillip Susi
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

On 03/08/2013 10:00 AM, Howard Chu wrote:
> Yes, that's what I was thinking. I added a 
> posix_madvise(..POSIX_MADV_RANDOM) but that had no effect on the
> test.

Yep, that's because it isn't implemented.

You might try MADV_WILLNEED to schedule it to be read in first.  I
believe that will only read in the requested page, without additional
readahead, and then when you fault on the page, it already has IO
scheduled, so the extra readahead will also be skipped.


-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.12 (GNU/Linux)
Comment: Using GnuPG with undefined - http://www.enigmail.net/

iQEcBAEBAgAGBQJROo7GAAoJEJrBOlT6nu759SAH+wRhoUIZUuzNGrhfUJ6RnwV8
VjFyftBCAsdC+Mzq81Da3KJOi+BdYV8VbkYNPzbKll5AnxzL5Udvbdyf9SkROhug
UgLWHe8pC6ZtHfSvWBCqS1YDLkzw+TiWwJzuL5iUEDC2NGuUJQ5SbhwyTEypvWai
pdPZeFVyhLAKOtAUwD5e/5vhBWSq2M1TG2C7BUCow2fbJ6kil+kWuXtiDeNPvtUk
4FwabL8zHA9pNtMlHB0cUrn5W3VQYGqeTaDngjyLxR1gw7uFQn52G47IPe2LAMGx
58L/tHjbkSY9oukGiMHoF1jiaFqJqV1pw+Q2P7S+0XsU8JdW6CmzotTqDmcozqE=
=DOZT
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] power: make goldfish option have a dependency on goldfish

2013-03-08 Thread Anton Vorontsov
Hi Paul,

On Fri, Mar 08, 2013 at 11:38:41AM -0500, Paul Gortmaker wrote:
> > I see. In that case, please feel free to send the patch to akpm with my
> > Nack and pointing to this discussion. If Andrew agrees and I was wrong
> > (and I'm really curious whether I am right or wrong), I will start
> > applying such patches in future.
> 
> I didn't send the patch to akpm, but I did have a chance to ask akpm how
> dependencies should be used, and you can see his answer here:
> 
>   https://lkml.org/lkml/2013/3/7/456

Thanks for asking! FWIW, I won't be against CONFIG_AKPM. ;-) Something
like that will work:

depends on GENERIC_HARDIRQS
depends on RESTRICT_PLATFORM && GOLDFISH

But not that I think we really need this option, though. Whoever wants to
(re)build the kernel is assumed to be knowledgeable enough to figure out
what is needed/unneeded for the given HW. I, for example, use 'ARCH=foo
allnoconfig' for stripped kernels, and then enable specific options which
I know I will need. Distros, however, they are using kind of
'allmodconfig' anyways:

~$ du -sh /lib/modules/3.8.0-28-desktop/
148M/lib/modules/3.8.0-28-desktop/

One module less, one module more does not matter, but maintaining
CONFIG_AKPM will cost devs' time and efforts (especially figuring out what
is platform dep and what is not)... I think it is easier to just keep
things simple.

But again, I won't be against it -- at least it doesn't make my life
harder. :-)

Cheers,

Anton
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 11/12] rwsem: wake all readers when first waiter is a reader

2013-03-08 Thread Michel Lespinasse
On Fri, Mar 8, 2013 at 4:32 PM, Dave Chinner  wrote:
> On Wed, Mar 06, 2013 at 03:21:50PM -0800, Michel Lespinasse wrote:
>> When the first queued waiter is a reader, wake all readers instead of
>> just those that are at the front of the queue. There are really two
>> motivations for this change:
>
> Isn't this a significant change of semantics for the rwsem? i.e.
> that read lock requests that come after a write lock request now
> jump ahead of the write lock request? i.e.the write lock request is
> no longer a barrier in the queue?

Yes, I am allowing readers to skip ahead of writers in the queue (but
only if they can run with another reader that was already ahead).

I don't see that this is a change of observable semantics for correct
programs. If a reader and a writer both block on the rwsem, how do you
know for sure which one got queued first? The rwsem API doesn't give you
any easy way to know whether a thread is currently queued on the rwsem
(it could also be descheduled before it gets onto the rwsem queue).

But yes, if you're making assumptions about queuing order the change
makes it more likely that they'll be observably wrong.

> XFS has long assumed that a rwsem write lock is a barrier that
> stops new read locks from being taken, and this change will break
> that assumption. Given that this barrier assumption is used as the
> basis for serialisation of operations like IO vs truncate, there's a
> bit more at stake than just improving parallelism here.  i.e. IO
> issued after truncate/preallocate/hole punch could now be issued
> ahead of the pending metadata operation, whereas currently the IO
> issued after the pending metadata operation is waiting for the write
> lock will only be processed -after- the metadata modification
> operation completes...
>
> That is a recipe for weird data corruption problems because
> applications are likely to have implicit dependencies on the barrier
> effect of metadata operations on data IO...

I am confused as to exactly what XFS is doing, could you point me to
the code / indicate a scenario where this would go wrong ? If you
really rely on this for correctness you'd have to do something already
to guarantee that your original queueing order is as desired, and I
just don't see how it'd be done...

That said, it is doable to add support for write lock stealing in the
rwsem write path while still preserving the queueing order of readers
vs writers; I'm just not sure that I fully understand the correctness
concern at this point.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 4:36 PM, Dave Jones  wrote:
>
> Ok, it's definitely the 'append something on the end of a valid pathname'
> changeset. 'something' can be anything it seems.

Ok. so maybe the way to "bisect" this is to play with that.

For example, does it happen even if the "something" does not have a
slash in it? IOW, you just append, say, a single 'a' character to a
name that doesn't already end in a slash?

And if it still does happen with that, perhaps you could have some
logic that only appends the 'a' to names that end with numbers. Does
it stop happening?

The machine keeps running, right? So you can try this out without
rebooting, just changing when you append the character?

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] add extra free kbytes tunable

2013-03-08 Thread Simon Jeons

Hi Hugh,
On 03/02/2013 11:08 AM, Hugh Dickins wrote:

On Sat, 2 Mar 2013, Simon Jeons wrote:

On 03/02/2013 09:42 AM, Hugh Dickins wrote:

On Sat, 2 Mar 2013, Simon Jeons wrote:

In __add_to_swap_cache(), a successful insertion into the radix tree
results in NR_FILE_PAGES being incremented. Why? This is an anonymous
page, not a file-backed page.

Right, that's hard to understand without historical background.

I think the quick answer would be that we used to (and still do) think
of file-cache and swap-cache as two halves of page-cache.  And then when

Should a shmem page be treated as file-cache or swap-cache? It is strange, since
it consists of anonymous pages and yet these pages make up files.

A shmem page is swap-backed file-cache, and it may get transferred to or
from swap-cache: yes, it's a difficult and confusing case, as I said below.

I would never call it "anonymous", but it is counted in /proc/meminfo's
Active(anon) or Inactive(anon) rather than in (file), because "anon"
there is shorthand for "swap-backed".


In read_swap_cache_async:

SetPageSwapBacked(new_page);
__add_to_swap_cache();
swap_readpage();
ClearPageSwapBacked(new_page);

Why is the PG_swapbacked flag cleared?




So you'll find that shmem and swap are counted as file in some places
and anon in others, and it's hard to grasp which is where and why,
without remembering the history.

Hugh


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 03/10] ARM: smp_twd: Divorce smp_twd from local timer API

2013-03-08 Thread Stephen Boyd
On 03/08/13 12:29, Tony Lindgren wrote:
> Applying that does not seem to help, but you might want to get vexpress
> running anyways for some multiplatform sanity checks.
>
> I just built and installed qemu-linaro from their git, then ran the
> command above. Looks like stock qemu does not work for vexpress for some
> reason. You can probably use a dummy initrd and rootfs to debug this
> though :)

So my patchset didn't break qemu?

>
> More info on qemu-linaro at:
>
> https://launchpad.net/qemu-linaro

Thanks for the link.

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH net-next] drivers:net: Remove unnecessary OOM messages after netdev_alloc_skb

2013-03-08 Thread Joe Perches
Emitting netdev_alloc_skb and netdev_alloc_skb_ip_align OOM
messages is unnecessary as there is already a dump_stack
after allocation failures.

Other trivial changes around these removals:

Convert a few comparisons of pointer to 0 to !pointer.
Change flow to remove unnecessary label.
Remove now unused variable.
Hoist assignment from if.

Signed-off-by: Joe Perches 
---
 drivers/net/caif/caif_shmcore.c|  5 +
 drivers/net/ethernet/adi/bfin_mac.c|  6 ++
 drivers/net/ethernet/amd/7990.c|  2 --
 drivers/net/ethernet/amd/a2065.c   |  1 -
 drivers/net/ethernet/amd/am79c961a.c   |  1 -
 drivers/net/ethernet/amd/ariadne.c |  1 -
 drivers/net/ethernet/amd/atarilance.c  |  2 --
 drivers/net/ethernet/amd/au1000_eth.c  |  1 -
 drivers/net/ethernet/amd/declance.c|  2 --
 drivers/net/ethernet/amd/pcnet32.c |  1 -
 drivers/net/ethernet/amd/sun3lance.c   |  3 ---
 drivers/net/ethernet/amd/sunlance.c|  4 
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c|  6 ++
 drivers/net/ethernet/atheros/atlx/atl2.c   |  3 ---
 drivers/net/ethernet/broadcom/bgmac.c  |  4 +---
 drivers/net/ethernet/broadcom/sb1250-mac.c |  5 +
 drivers/net/ethernet/cadence/at91_ether.c  |  1 -
 drivers/net/ethernet/cirrus/cs89x0.c   |  6 --
 drivers/net/ethernet/dlink/dl2k.c  |  7 ++-
 drivers/net/ethernet/freescale/fec.c   |  2 --
 .../net/ethernet/freescale/fs_enet/fs_enet-main.c  | 17 +++--
 drivers/net/ethernet/fujitsu/fmvj18x_cs.c  |  2 --
 drivers/net/ethernet/i825xx/82596.c|  8 +++-
 drivers/net/ethernet/i825xx/lib82596.c |  6 ++
 drivers/net/ethernet/ibm/ehea/ehea_main.c  |  9 ++---
 drivers/net/ethernet/mellanox/mlx4/en_selftest.c   |  5 ++---
 drivers/net/ethernet/natsemi/sonic.c   |  1 -
 drivers/net/ethernet/netx-eth.c|  2 --
 drivers/net/ethernet/nuvoton/w90p910_ether.c   |  1 -
 drivers/net/ethernet/nvidia/forcedeth.c|  1 -
 drivers/net/ethernet/qlogic/qla3xxx.c  |  1 -
 drivers/net/ethernet/qlogic/qlge/qlge_main.c   |  6 --
 drivers/net/ethernet/rdc/r6040.c   |  1 -
 drivers/net/ethernet/realtek/8139too.c |  2 --
 drivers/net/ethernet/realtek/atp.c |  2 --
 drivers/net/ethernet/seeq/ether3.c | 22 +-
 drivers/net/ethernet/seeq/sgiseeq.c|  2 --
 drivers/net/ethernet/sis/sis900.c  |  7 ++-
 drivers/net/ethernet/smsc/smc9194.c|  2 --
 drivers/net/ethernet/smsc/smc91x.c |  2 --
 drivers/net/ethernet/smsc/smsc9420.c   |  4 +---
 drivers/net/ethernet/sun/sunqe.c   |  5 +
 drivers/net/ethernet/tehuti/tehuti.c   |  5 ++---
 drivers/net/ethernet/ti/tlan.c |  4 +---
 drivers/net/ethernet/xilinx/ll_temac_main.c| 10 +++---
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c  |  9 +++--
 drivers/net/ethernet/xircom/xirc2ps_cs.c   |  1 -
 47 files changed, 39 insertions(+), 161 deletions(-)

diff --git a/drivers/net/caif/caif_shmcore.c b/drivers/net/caif/caif_shmcore.c
index bce8bac..cca2afc 100644
--- a/drivers/net/caif/caif_shmcore.c
+++ b/drivers/net/caif/caif_shmcore.c
@@ -338,11 +338,8 @@ static void shm_rx_work_func(struct work_struct *rx_work)
/* Get a suitable CAIF packet and copy in data. */
skb = netdev_alloc_skb(pshm_drv->pshm_dev->pshm_netdev,
frm_pck_len + 1);
-
-   if (skb == NULL) {
-   pr_info("OOM: Try next frame in descriptor\n");
+   if (skb == NULL)
break;
-   }
 
p = skb_put(skb, frm_pck_len);
memcpy(p, pbuf->desc_vptr + frm_pck_ofs, frm_pck_len);
diff --git a/drivers/net/ethernet/adi/bfin_mac.c 
b/drivers/net/ethernet/adi/bfin_mac.c
index a175d0b..ee70577 100644
--- a/drivers/net/ethernet/adi/bfin_mac.c
+++ b/drivers/net/ethernet/adi/bfin_mac.c
@@ -188,10 +188,9 @@ static int desc_list_init(struct net_device *dev)
 
/* allocate a new skb for next time receive */
new_skb = netdev_alloc_skb(dev, PKT_BUF_SZ + NET_IP_ALIGN);
-   if (!new_skb) {
-   pr_notice("init: low on mem - packet dropped\n");
+   if (!new_skb)
goto init_error;
-   }
+
skb_reserve(new_skb, NET_IP_ALIGN);
/* Invidate the data cache of skb->data range when it is write 
back
 * cache. It will pr

[ANNOUNCE] 3.9-rc1-nohz1

2013-03-08 Thread Frederic Weisbecker
Hi,

Several fixes there. And this version should have far fewer spurious
warnings. Your testing and reviews are much appreciated.

The 5 first patches of the series are pending on a pull request for -tip
(3.10 material).

I'm now considering how I should upstream the rest of the series.
All the pieces that got merged until now were sort of easy because the various
chunks were pretty self contained and independent (full dynticks cputime
accounting, printk, RCU user mode, dynticks API generalization, etc...).

Now what remains in this series is hard to cut into individual parts.
Everything depends on defining an interface with kernel parameter
to partition the full dynticks CPUs set.

I think we really need to start using a branch in -tip and move incrementally
from there with the following steps:

1) Set the kernel parameters and config option
2) Handle timers wakeup, timekeeping, posix cpu timers, perf, sched 
etc...
   on top of kernel parameter based CPU partition
3) Once we know _everything_ is handled, bring the final dynticks 
infrastructure
4) Upstream

This will make everything much easier for everyone: easier piecewise reviews 
and easier for
other people to contribute.

Because you don't want me to spam you with ~40 commits for 2 more years, right?

Thanks.

This version can be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
3.9-rc1-nohz1

---
Changes since 3.8-rc6-nohz4:

* Rebase against 3.9-rc1

* Fixed a few races with exception and preemption handling [1-3/29]

* Dropped commit "sched: Remove broken check for skip clock update"
that was buggy (thanks Steve for pointing that out)

* Ignore noisy stale rq clock detection on boot and other situations
with rq->skip_clock_update [27/29]

* Dropped commit "sched: Update clock of nohz busiest rq before balancing"
that became useless (thanks Li Zhong)

* Don't issue a self IPI on timer enqueue if the CPU didn't stop its
tick [9/29]

* Rename a bit the Kconfig menu after discussion with Borislav [6/29]

* Handle broken full_nohz mask in kernel parameters (thanks Borislav) [6/29]

---
TODO list hasn't changed much:

- Posix CPU timers
- Perf events
- sched_class::task_tick()
- various other scheduler details
- ...

---
Frederic Weisbecker (29):
  context_tracking: Move exception handling to generic code
  context_tracking: Restore correct previous context state on exception
exit
  context_tracking: Restore preempted context state after
preempt_schedule_irq()
  cputime: Dynamically scale cputime for full dynticks accounting
  context_tracking: Enable probes by default for selftesting
  nohz: Basic full dynticks interface
  nohz: Assign timekeeping duty to a non-full-nohz CPU
  nohz: Trace timekeeping update
  nohz: Wake up full dynticks CPUs when a timer gets enqueued
  rcu: Restart the tick on non-responding full dynticks CPUs
  sched: Comment on rq->clock correctness in ttwu_do_wakeup() in nohz
  sched: Update rq clock on nohz CPU before migrating tasks
  sched: Update rq clock on nohz CPU before setting fair group shares
  sched: Update rq clock on tickless CPUs before calling
check_preempt_curr()
  sched: Update rq clock earlier in unthrottle_cfs_rq
  sched: Update rq clock before idle balancing
  sched: Update nohz rq clock before searching busiest group on load
balancing
  nohz: Move nohz load balancer selection into idle logic
  nohz: Full dynticks mode
  nohz: Only stop the tick on RCU nocb CPUs
  nohz: Don't turn off the tick if rcu needs it
  nohz: Don't stop the tick if posix cpu timers are running
  nohz: Add some tracing
  rcu: Don't keep the tick for RCU while in userspace
  timer: Don't run non-pinned timer to full dynticks CPUs
  sched: Use an accessor to read rq clock
  sched: Debug nohz rq clock
  sched: Update rq clock before rt sched average scale
  sched: Disable lb_bias feature for full dynticks

 arch/x86/include/asm/context_tracking.h |   21 
 arch/x86/kernel/kvm.c   |8 +-
 arch/x86/kernel/traps.c |   68 +-
 arch/x86/mm/fault.c |8 +-
 include/linux/context_tracking.h|   24 +-
 include/linux/posix-timers.h|1 +
 include/linux/rcupdate.h|8 ++
 include/linux/sched.h   |   14 ++-
 include/linux/tick.h|9 ++
 init/Kconfig|1 +
 kernel/fork.c   |2 +-
 kernel/hrtimer.c|3 +-
 kernel/posix-cpu-timers.c   |   11 ++
 kernel/rcutree.c|   19 +++-
 kernel/rcutree.h|1 -
 kernel/rcutree_plugin.h |   13 +--
 kernel/sched/core.c |  110 --
 kernel/sched/cputime.c  |  154 ---
 kernel/sched/fair.c |   79 +++-
 kernel

[PATCH] epoll: comment + BUILD_BUG_ON to prevent epitem bloat

2013-03-08 Thread Eric Wong
This will prevent us from accidentally introducing a memory bloat
regression here in the future.

Signed-off-by: Eric Wong 
Cc: Andrew Morton 
Cc: Davide Libenzi ,
Cc: Al Viro 
---
  Andrew Morton  wrote:
  > On Thu, 7 Mar 2013 10:32:40 + Eric Wong  wrote:
  > 
  > > Andrew Morton  wrote:
  > > > It's going to be hard to maintain this - someone will change something
  > > > sometime and break it.  I suppose we could add a runtime check if we
  > > > cared enough.  Adding a big fat comment to struct epitem might help.
  > > 
  > > Thanks for looking at this patch.  I'll send a patch with a comment
  > > about keeping epitem size in check.  Also, would adding (with comments):
  > > 
  > >   BUILD_BUG_ON(sizeof(struct epitem) > 128);
  > > 
  > > ...be too heavy-handed?  I used that in my testing.  I'll check for:
  > > sizeof(void *) <= 8 too; in case 128-bit machines appear...
  > 
  > I guess such a check might avoid accidents in the future.  If it
  > becomes a problem, we can always delete it.

 fs/eventpoll.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec183..55028da 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -128,6 +128,8 @@ struct nested_calls {
 /*
  * Each file descriptor added to the eventpoll interface will
  * have an entry of this type linked to the "rbr" RB tree.
+ * Avoid increasing the size of this struct, there can be many thousands
+ * of these on a server and we do not want this to take another cache line.
  */
 struct epitem {
/* RB tree node used to link this structure to the eventpoll RB tree */
@@ -1964,6 +1966,12 @@ static int __init eventpoll_init(void)
/* Initialize the structure used to perform file's f_op->poll() calls */
ep_nested_calls_init(&poll_readywalk_ncalls);
 
+   /*
+* We can have many thousands of epitems, so prevent this from
+* using an extra cache line on 64-bit (and smaller) CPUs
+*/
+   BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
+
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
-- 
Eric Wong
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[GIT PULL] Btrfs updates

2013-03-08 Thread Chris Mason
Hi Linus,

Please grab my for-linus:

git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs.git for-linus

These are scattered fixes and one performance improvement.  The biggest
functional change is in how we throttle metadata changes.  The new code
bumps our average file creation rate up by ~13% in fs_mark, and lowers
CPU usage.

Stefan bisected out a regression in our allocation code that made
balance loop on extents larger than 256MB.

Liu Bo (6) commits (+71/-19):
Btrfs: build up error handling for merge_reloc_roots (+35/-12)
Btrfs: check for NULL pointer in updating reloc roots (+2/-0)
Btrfs: avoid deadlock on transaction waiting list (+7/-0)
Btrfs: free all recorded tree blocks on error (+6/-3)
Btrfs: do not BUG_ON on aborted situation (+12/-3)
Btrfs: do not BUG_ON in prepare_to_reloc (+9/-1)

Chris Mason (2) commits (+96/-63):
Btrfs: enforce min_bytes parameter during extent allocation (+4/-2)
Btrfs: improve the delayed inode throttling (+92/-61)

Miao Xie (2) commits (+45/-39):
Btrfs: fix unclosed transaction handler when the async transaction 
commitment fails (+4/-0)
Btrfs: fix wrong handle at error path of create_snapshot() when the commit 
fails (+41/-39)

Stefan Behrens (1) commits (+0/-8):
Btrfs: allow running defrag in parallel to administrative tasks

Ilya Dryomov (1) commits (+5/-0):
Btrfs: fix a mismerge in btrfs_balance()

Josef Bacik (1) commits (+4/-1):
Btrfs: use set_nlink if our i_nlink is 0

Total: (13) commits (+221/-130)

 fs/btrfs/delayed-inode.c | 151 ---
 fs/btrfs/delayed-inode.h |   2 +
 fs/btrfs/disk-io.c   |  16 +++--
 fs/btrfs/inode.c |   6 +-
 fs/btrfs/ioctl.c |  18 ++
 fs/btrfs/relocation.c|  74 +--
 fs/btrfs/transaction.c   |  65 
 fs/btrfs/tree-log.c  |   5 +-
 fs/btrfs/volumes.c   |  14 -
 9 files changed, 221 insertions(+), 130 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 07:19:17PM -0500, Dave Jones wrote:
 > On Fri, Mar 08, 2013 at 04:02:02PM -0800, Linus Torvalds wrote:
 >  > On Fri, Mar 8, 2013 at 3:55 PM, Dave Jones  wrote:
 >  > >
 >  > > That one was printed out with %s
 >  > 
 >  > Ok, so those random pathnames you generate? They're funky.
 > 
 > I just did a test with just a page of 'A's, and got the same result,
 > so while that unicode stuff looks fancy, it isn't necessary to tickle
 > these bugs.
 > 
 > Trying to figure out exactly what change I made that started triggering this.
 
Ok, it's definitely the 'append something on the end of a valid pathname'
changeset. 'something' can be anything it seems.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: ipc/testmsg GPF.

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 07:27:01PM -0500, Peter Hurley wrote:
 > [ +Andrew Morton ]
 > 
 > On Thu, 2013-03-07 at 16:38 -0500, Dave Jones wrote:
 > > Trying to reproduce that nd_jump_link trace, but I keep hitting other bugs
 > > instead.  It's like whackamole. Except these are even more annoying
 > > than moles.
 > 
 > Dave,
 > I thought I copied you on the 'ipc MSG_COPY fixes' patchset that fixes
 > this. Or is this gp fault happening with that patchset?

I hadn't gotten around to applying them to my testing tree. 
Worth noting that of the dozens of oopses I've seen the last few days
I think I only saw this one once.

(Though I've narrowed my testing scope right now to try and tickle
 those fs/namei.c bugs, so it could just be that I'm not exercising
 the codepaths that caused the ipc bug)

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 11/12] rwsem: wake all readers when first waiter is a reader

2013-03-08 Thread Dave Chinner
On Wed, Mar 06, 2013 at 03:21:50PM -0800, Michel Lespinasse wrote:
> When the first queued waiter is a reader, wake all readers instead of
> just those that are at the front of the queue. There are really two
> motivations for this change:

Isn't this a significant change of semantics for the rwsem? i.e.
that read lock requests that come after a write lock request now
jump ahead of the write lock request? i.e. the write lock request is
no longer a barrier in the queue?

XFS has long assumed that a rwsem write lock is a barrier that
stops new read locks from being taken, and this change will break
that assumption. Given that this barrier assumption is used as the
basis for serialisation of operations like IO vs truncate, there's a
bit more at stake than just improving parallelism here.  i.e. IO
issued after truncate/preallocate/hole punch could now be issued
ahead of the pending metadata operation, whereas currently the IO
issued after the pending metadata operation is waiting for the write
lock will only be processed -after- the metadata modification
operation completes...

That is a recipe for weird data corruption problems because
applications are likely to have implicit dependencies on the barrier
effect of metadata operations on data IO...

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Suggestion for fixing the variable length array used in the kernel.

2013-03-08 Thread Andrew Morton
On Wed, 6 Mar 2013 20:46:35 -0800 Christopher Li  wrote:

> Hi,
> 
> I am looking at the current sparse warning on the kernel source.
> One category of those warnings is produced by the variable length array.
> We all know that the kernel stack has a limit so we don't want to allocate
> too much stack to the variable size array.
> 
> Is there a recommended way to fix those warnings? Is it worth while to
> fix it at all? I am looking forward to some kind of guideline how to handle
> this.

Roughly how many instances of this are there kernel-wide?

I don't think it's good practice in the kernel - it's somewhat
dangerous and the effects of errors will be catastrophic.  And as
you've seen, those sites are difficult to review for safety.

We could just outright ban the thing and convert those sites to
kmalloc() or whatever.  If people howl about the performance impact
(unlikely) then perhaps we can put something together using
__builtin_alloca() which includes runtime checking for "excessive"
allocations.  If an excessive allocation is detected we'd warn and
return NULL.

Anyway, yes, variable-length arrays are problematic so for now, let's
leave the sparse warnings in place?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: ipc/testmsg GPF.

2013-03-08 Thread Peter Hurley
[ +Andrew Morton ]

On Thu, 2013-03-07 at 16:38 -0500, Dave Jones wrote:
> Trying to reproduce that nd_jump_link trace, but I keep hitting other bugs
> instead.  It's like whackamole. Except these are even more annoying
> than moles.

Dave,
I thought I copied you on the 'ipc MSG_COPY fixes' patchset that fixes
this. Or is this gp fault happening with that patchset?

Linus,
The fixes should be in your inbox (from Andrew) titled:
[patch 01/11] ipc: fix potential oops when src msg > 4k w/ MSG_COPY
[patch 02/11] ipc: don't allocate a copy larger than max

> general protection fault:  [#1] PREEMPT SMP 
> Modules linked in: rose ax25 phonet lockd sunrpc ip6t_REJECT 
> nf_conntrack_ipv6 nf_defrag_ipv6 xt_conntrack nf_conntrack ip6table_filter 
> ip6_tables snd_hda_codec_realtek snd_hda_intel btusb snd_hda_codec bluetooth 
> snd_pcm snd_page_alloc snd_timer snd vhost_net rfkill tun macvtap usb_debug 
> macvlan microcode serio_raw pcspkr kvm_amd soundcore edac_core r8169 mii kvm
> CPU 0 
> Pid: 845, comm: trinity-child14 Not tainted 3.9.0-rc1+ #70 Gigabyte 
> Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H
> RIP: 0010:[]  [] testmsg.isra.1+0x40/0x60
> RSP: 0018:880122b0fe78  EFLAGS: 00010246
> RAX:  RBX: 0002 RCX: 0001
> RDX: 0002 RSI: 2c24a9b2 RDI: 697665642d737983
> RBP: 880122b0fe78 R08: fff3f14b03ae R09: 
> R10: 880127bd8000 R11:  R12: 2c24a9b2
> R13: 880123360798 R14: 8801233606e8 R15: 697665642d737973
> FS:  7f2672bd3740() GS:88012ae0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 7f2672b96068 CR3: 000127bc1000 CR4: 07f0
> DR0:  DR1:  DR2: 
> DR3:  DR6: 0ff0 DR7: 0400
> Process trinity-child14 (pid: 845, threadinfo 880122b0e000, task 
> 880127bd8000)
> Stack:
>  880122b0ff68 812b8e7e 8801276d5b90 880127bd8000
>  880127bd8000 880127bd8000  812b78c0
>   81c7a260  1000
> Call Trace:
>  [] do_msgrcv+0x1de/0x670
>  [] ? load_msg+0x180/0x180
>  [] ? trace_hardirqs_on_caller+0x115/0x1a0
>  [] ? trace_hardirqs_on_thunk+0x3a/0x3f
>  [] sys_msgrcv+0x15/0x20
>  [] system_call_fastpath+0x16/0x1b
> Code: 83 fa 04 74 16 31 c0 5d c3 66 90 ff ca b8 01 00 00 00 74 f3 31 c0 eb ef 
> 0f 1f 00 48 39 37 b8 01 00 00 00 7e e2 31 c0 eb de 66 90 <48> 3b 37 75 d5 b8 
> 01 00 00 00 5d c3 0f 1f 40 00 48 3b 37 74 c5 
> 
>  <.text>:
>0: 48 3b 37cmp(%rdi),%rsi
>3: 75 d5   jne0xffda
>5: b8 01 00 00 00  mov$0x1,%eax
>a: 5d  pop%rbp
>b: c3  retq   
>c: 0f 1f 40 00 nopl   0x0(%rax)
>   10: 48 3b 37cmp(%rdi),%rsi
>   13: 74 c5   je 0xffda
> 
> rdi is ascii. "ived-sy\x83" Curious.
> 
> EIP is here in testmsg.
> 
> case SEARCH_EQUAL:
> if (msg->m_type == type)
>  240:   48 3b 37cmp(%rdi),%rsi
>  243:   75 d5   jne21a 
> {
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 04:02:02PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 3:55 PM, Dave Jones  wrote:
 > >
 > > That one was printed out with %s
 > 
 > Ok, so those random pathnames you generate? They're funky.

I just did a test with just a page of 'A's, and got the same result,
so while that unicode stuff looks fancy, it isn't necessary to tickle
these bugs.

Trying to figure out exactly what change I made that started triggering this.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ARM: OMAP: drop "select MACH_NOKIA_RM696"

2013-03-08 Thread Russell King - ARM Linux
On Fri, Mar 08, 2013 at 07:02:44PM +0100, Paul Bolle wrote:
> On Fri, 2013-03-08 at 09:55 -0800, Tony Lindgren wrote:
> > * Paul Bolle  [130308 09:24]:
> > > Should I draft a patch?
> > 
> > Sure that would be nice.
> 
> One thing I couldn't determine is how the generated mach-types.h header
> handles multiple CONFIG_MACH_* macros.
> 
> If both CONFIG_MACH_FOO and CONFIG_MACH_BAR are defined, and these both
> have a line in */mach-types, will the machine_is_foo() and
> machine_is_bar() macros both behave as one would expect?

It's actually quite clever.  There's two levels to it.

The first is that CONFIG_MACH_xxx result in their machine_is_xxx() macros
being defined to constant zero if the CONFIG option is not enabled.  That
allows the compiler to throw away code for disabled platforms because
the expression is always false.

Otherwise, they end up as (machine_arch_type == MACH_TYPE_xxx).

The second is the magic which happens when two CONFIG_MACH_xxx are
selected.  If only one is selected, then machine_arch_type is defined
to the appropriate MACH_TYPE_xxx.  This means that the above expression
becomes constant-true, and the conditional is eliminated.

If more than one is selected, then machine_arch_type is defined to a
variable which is appropriately set to one of the MACH_TYPE_xxx values.

So, the result is that:
- de-selected platforms have their if (machine_is_xxx()) { } optimised
  out of the kernel.
- for a kernel built targeting one platform, all the
  if (machine_is_xxx()) tests are optimised away, leaving only the
  relevant code behind.
- otherwise, we get the _appropriate_ conditional code for the
  configuration generated.

However, going back to that MACH_NOKIA_RM696.  If there exists only a
select of this symbol and no "config MACH_NOKIA_RM696" entry, then the
symbol will never be generated in the output .config file.

I too can find no trace of any use of machine_is_nokia_rm696 in the
mainline kernel.  So, if there's nothing using the machine_is_()
symbol, and nothing using the CONFIG_MACH_NOKIA_RM696 symbol, then
any select of that is entirely superfluous.

Well, I did this:

$ git grep -i nokia_rm696
arch/arm/mach-omap2/Kconfig:select MACH_NOKIA_RM696
arch/arm/mach-omap2/board-rm680.c:MACHINE_START(NOKIA_RM696, "Nokia RM-696 
board")
arch/arm/tools/mach-types:nokia_rm696   MACH_NOKIA_RM696
NOKIA_RM696 3522

So, there exists platform support for this device, provided by the RM680
support, but there's no use of the machine_is_xxx() symbol - and if there
was, it would always be false.

My conclusion is... it's a mess.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 3:55 PM, Dave Jones  wrote:
>
> That one was printed out with %s

Ok, so those random pathnames you generate? They're funky.

 Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 03:45:37PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 3:30 PM, Dave Jones  wrote:
 > > [  100.729401] nd->last.name = 
 > > (\xffd2.W._.N.".\xfffe.\xff80.^N.?.\xffe4.E.8.g.\xffd2.N.\xffb6.^G.\xfff1.\xffcc.U.\xffda.^_.h.^M.1.\xffc5.\xff82.%.B.\xffe0.\xffad.^U.8.^L.c.Z.^K.\xffe4.h.J.\xffc8.\xffad.\xff83./.\xff80.\xffd1.\xffe5.\xff87.\xffc3.\xffb2.\xffdc.\xff9d.\xffd1.E.\xffab.^B.
 > 
 > You're printing out a 'char' with %02x, aren't you? And then the sign
 > extensions gives you six extra 'f' character every time the char is
 > negative.
 > 
 > If using %02x, make sure you use 'unsigned char', or "& 255".
 > 
 > Depending on just what is going on, the "nd->last.name" thing really
 > is probably a valid string. Strictly speaking it's not necessarily
 > NUL-terminated, though, and should be printed out to a maximum of
 > "nd->last.len" characters (but the whole path will be NUL-terminated,
 > so it's all ok, you just might print out more than one component)
 > 
 >Linus

That one was printed out with %s

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] epoll: trim epitem by one cache line on x86_64

2013-03-08 Thread Andrew Morton
On Thu, 7 Mar 2013 10:32:40 + Eric Wong  wrote:

> Andrew Morton  wrote:
> > It's going to be hard to maintain this - someone will change something
> > sometime and break it.  I suppose we could add a runtime check if we
> > cared enough.  Adding a big fat comment to struct epitem might help.
> 
> Thanks for looking at this patch.  I'll send a patch with a comment
> about keeping epitem size in check.  Also, would adding (with comments):
> 
>   BUILD_BUG_ON(sizeof(struct epitem) > 128);
> 
> ...be too heavy-handed?  I used that in my testing.  I'll check for:
> sizeof(void *) <= 8 too; in case 128-bit machines appear...

I guess such a check might avoid accidents in the future.  If it
becomes a problem, we can always delete it.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Basic perf PMU support for Haswell v9

2013-03-08 Thread Andi Kleen
This is based on v7 of the full Haswell PMU support,
rebased, reviewer-optimized and stripped down to the bare bones

Most interesting new features are not in this patchkit
(full version is 
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc.git hsw/pmu5)

Contains support for:
- Basic Haswell PMU and PEBS support
- Late unmasking of the PMI
- Basic LBRv4 support

v2: Addressed Stephane's feedback. See individual patches for details.
v3: now even more bite-sized. Qualifier constraints merged earlier.
v4: Rename some variables, add some comments and other minor changes.
Add some Reviewed/Tested-bys.
v5: Address some minor review feedback. Port to latest perf/core
v6: Adjust some variable names, add comments, edit descriptions, some
more testing, rebased to latest perf/core
v7: Expand comment
v8: Rename structure field.
v9: No wide counters, but add basic LBRs. Add some more 
constraints. Rebase to 3.9rc1

-Andi
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/5] perf, x86: Support Haswell v4 LBR format

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

Haswell has two additional LBR from flags for TSX: intx and abort, implemented
as a new v4 version of the LBR format.

Handle those and adjust the sign extension code to still correctly extend.
The flags are exported similarly in the LBR record to the existing misprediction
flag.

Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |   18 +++---
 include/linux/perf_event.h |7 ++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c 
b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index da02e9c..2af6695b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -12,6 +12,7 @@ enum {
LBR_FORMAT_LIP  = 0x01,
LBR_FORMAT_EIP  = 0x02,
LBR_FORMAT_EIP_FLAGS= 0x03,
+   LBR_FORMAT_EIP_FLAGS2   = 0x04,
 };
 
 /*
@@ -56,6 +57,8 @@ enum {
 LBR_FAR)
 
 #define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+#define LBR_FROM_FLAG_INTX (1ULL << 62)
+#define LBR_FROM_FLAG_ABORT(1ULL << 61)
 
 #define for_each_branch_sample_type(x) \
for ((x) = PERF_SAMPLE_BRANCH_USER; \
@@ -270,21 +273,30 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events 
*cpuc)
 
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
-   u64 from, to, mis = 0, pred = 0;
+   u64 from, to, mis = 0, pred = 0, intx = 0, abort = 0;
 
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
-   if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
+   if (lbr_format == LBR_FORMAT_EIP_FLAGS ||
+   lbr_format == LBR_FORMAT_EIP_FLAGS2) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
-   from = (u64)((((s64)from) << 1) >> 1);
+   if (lbr_format == LBR_FORMAT_EIP_FLAGS)
+   from = (u64)((((s64)from) << 1) >> 1);
+   else if (lbr_format == LBR_FORMAT_EIP_FLAGS2) {
+   intx = !!(from & LBR_FROM_FLAG_INTX);
+   abort = !!(from & LBR_FROM_FLAG_ABORT);
+   from = (u64)((((s64)from) << 3) >> 3);
+   }
}
 
cpuc->lbr_entries[i].from   = from;
cpuc->lbr_entries[i].to = to;
cpuc->lbr_entries[i].mispred= mis;
cpuc->lbr_entries[i].predicted  = pred;
+   cpuc->lbr_entries[i].intx   = intx;
+   cpuc->lbr_entries[i].abort  = abort;
cpuc->lbr_entries[i].reserved   = 0;
}
cpuc->lbr_stack.nr = i;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ee46..77d5166 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -74,13 +74,18 @@ struct perf_raw_record {
  *
  * support for mispred, predicted is optional. In case it
  * is not supported mispred = predicted = 0.
+ *
+ * intx: running in a hardware transaction
+ * abort: aborting a hardware transaction
  */
 struct perf_branch_entry {
__u64   from;
__u64   to;
__u64   mispred:1,  /* target mispredicted */
predicted:1,/* target predicted */
-   reserved:62;
+   intx:1, /* in transaction */
+   abort:1,/* transaction abort */
+   reserved:60;
 };
 
 /*
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 3:47 PM, Dave Jones  wrote:
>
> That didn't take long..

Ok, thanks, so it's not something new to this merge window. Not that I
expected it to be, but better safe than sorry.

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] perf, x86: Move NMI clearing to end of PMI handler after the counter registers are reset

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

This avoids some problems with spurious PMIs on Haswell.
Haswell seems to behave more like P4 in this regard. Do
the same thing as the P4 perf handler by unmasking
the NMI only at the end. Shouldn't make any difference
for earlier family 6 cores.

Tested on Haswell, IvyBridge, Westmere, Saltwell (Atom)

Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/perf_event_intel.c |   16 ++--
 1 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c 
b/arch/x86/kernel/cpu/perf_event_intel.c
index ca3b611..6f22479 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1149,16 +1149,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
cpuc = &__get_cpu_var(cpu_hw_events);
 
-   /*
-* Some chipsets need to unmask the LVTPC in a particular spot
-* inside the nmi handler.  As a result, the unmasking was pushed
-* into all the nmi handlers.
-*
-* This handler doesn't seem to have any issues with the unmasking
-* so it was left at the top.
-*/
-   apic_write(APIC_LVTPC, APIC_DM_NMI);
-
intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
@@ -1218,6 +1208,12 @@ again:
 
 done:
intel_pmu_enable_all(0);
+   /*
+* Only unmask the NMI after the overflow counters
+* have been reset. This avoids spurious NMIs on
+* Haswell CPUs.
+*/
+   apic_write(APIC_LVTPC, APIC_DM_NMI);
return handled;
 }
 
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] perf, x86: Basic Haswell PEBS support v4

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

Add basic PEBS support for Haswell.
The constraints are similar to SandyBridge with a few new events.

v2: Readd missing pebs_aliases
v3: Readd missing hunk. Fix some constraints.
v4: Fix typo in PEBS event table (Stephane Eranian)
Reviewed-by: Stephane Eranian 
Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/perf_event.h  |2 ++
 arch/x86/kernel/cpu/perf_event_intel.c|6 --
 arch/x86/kernel/cpu/perf_event_intel_ds.c |   29 +
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a356350..4f6b97d 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -591,6 +591,8 @@ extern struct event_constraint 
intel_snb_pebs_event_constraints[];
 
 extern struct event_constraint intel_ivb_pebs_event_constraints[];
 
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);
 
 void intel_pmu_pebs_enable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c 
b/arch/x86/kernel/cpu/perf_event_intel.c
index 10f3c6c..ca3b611 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -854,7 +854,8 @@ static inline bool intel_pmu_needs_lbr_smpl(struct 
perf_event *event)
return true;
 
/* implicit branch sampling to correct PEBS skid */
-   if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+   if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+   x86_pmu.intel_cap.pebs_format < 2)
return true;
 
return false;
@@ -2200,8 +2201,9 @@ __init int intel_pmu_init(void)
intel_pmu_lbr_init_snb();
 
x86_pmu.event_constraints = intel_hsw_event_constraints;
-
+   x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snb_extra_regs;
+   x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
x86_pmu.er_flags |= ERF_NO_HT_SHARING;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c 
b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 783b728..d1bc10e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -437,6 +437,35 @@ struct event_constraint intel_ivb_pebs_event_constraints[] 
= {
 EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+   INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+   INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+   INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
+   INTEL_EVENT_CONSTRAINT(0xc4, 0xf),/* BR_INST_RETIRED.* */
+   INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
+   INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
+   INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */
+   INTEL_EVENT_CONSTRAINT(0xcd, 0x8),/* MEM_TRANS_RETIRED.* */
+   INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* 
MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+   INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* 
MEM_UOPS_RETIRED.STLB_MISS_STORES */
+   INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+   INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+   INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES 
*/
+   INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+   INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+   INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
+   INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
+   INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */
+   INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.HIT_LFB 
*/
+   INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf), /* 
MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
+   INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf), /* 
MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
+   INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf), /* 
MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */
+   INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */
+   INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */
+
+   EVENT_CONSTRAINT_END
+};
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
struct event_constraint *c;
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the

[PATCH 1/5] perf, x86: Add Haswell PEBS record support v4

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

Add support for the Haswell extended (fmt2) PEBS format.

It has a superset of the nhm (fmt1) PEBS fields, but has a longer record so
we need to adjust the code paths.

The main advantage is the new "EventingRip" support which directly
gives the instruction, not off-by-one instruction. So with precise == 2
we use that directly and don't try to use LBRs and walking basic blocks.
This lowers the overhead of using precise significantly.

Some other features are added in later patches.

Reviewed-by: Stephane Eranian 
v2: Rename various identifiers. Add more comments. Get rid of a cast.
v3: fmt2->hsw rename
v4: ip_of_the_event->real_ip rename
Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/perf_event.c  |2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c |  113 +++--
 2 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bf0f01a..7d3b9bd 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -397,7 +397,7 @@ int x86_pmu_hw_config(struct perf_event *event)
 * check that PEBS LBR correction does not conflict with
 * whatever the user is asking with attr->branch_sample_type
 */
-   if (event->attr.precise_ip > 1) {
+   if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format 
< 2) {
u64 *br_type = &event->attr.branch_sample_type;
 
if (has_branch_stack(event)) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c 
b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 826054a..783b728 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -41,6 +41,22 @@ struct pebs_record_nhm {
u64 status, dla, dse, lat;
 };
 
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+   struct pebs_record_nhm nhm;
+   /* 
+* Real IP of the event. In the Intel documentation this
+* is called eventingrip.
+*/
+   u64 real_ip;
+   /* 
+* TSX tuning information field: abort cycles and abort flags.
+*/
+   u64 tsx_tuning;
+};
+
 void init_debug_store_on_cpu(int cpu)
 {
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -559,11 +575,11 @@ static void __intel_pmu_pebs_event(struct perf_event 
*event,
 {
/*
 * We cast to pebs_record_core since that is a subset of
-* both formats and we don't use the other fields in this
-* routine.
+* all formats.
 */
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct pebs_record_core *pebs = __pebs;
+   struct pebs_record_hsw *pebs_hsw = __pebs;
struct perf_sample_data data;
struct pt_regs regs;
 
@@ -588,7 +604,10 @@ static void __intel_pmu_pebs_event(struct perf_event 
*event,
regs.bp = pebs->bp;
regs.sp = pebs->sp;
 
-   if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
+   if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
+   regs.ip = pebs_hsw->real_ip;
+   regs.flags |= PERF_EFLAGS_EXACT;
+   } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
regs.flags |= PERF_EFLAGS_EXACT;
else
regs.flags &= ~PERF_EFLAGS_EXACT;
@@ -641,35 +660,21 @@ static void intel_pmu_drain_pebs_core(struct pt_regs 
*iregs)
__intel_pmu_pebs_event(event, iregs, at);
 }
 
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
+   void *top)
 {
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
-   struct pebs_record_nhm *at, *top;
struct perf_event *event = NULL;
u64 status = 0;
-   int bit, n;
-
-   if (!x86_pmu.pebs_active)
-   return;
-
-   at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
-   top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+   int bit;
 
ds->pebs_index = ds->pebs_buffer_base;
 
-   n = top - at;
-   if (n <= 0)
-   return;
+   for ( ; at < top; at += x86_pmu.pebs_record_size) {
+   struct pebs_record_nhm *p = at;
 
-   /*
-* Should not happen, we program the threshold at 1 and do not
-* set a reset value.
-*/
-   WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs 
records %d\n", n);
-
-   for ( ; at < top; at++) {
-   for_each_set_bit(bit, (unsigned long *)&at->status, 
x86_pmu.max_pebs_events) {
+   for_each_set_bit(bit, (unsigned long *)&p->status, 
x86_pmu.max_pebs_events) {
event = cpuc->events[bi

[PATCH 2/5] perf, x86: Basic Haswell PMU support v6

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

Add basic Haswell PMU support.

Similar to SandyBridge, but has a few new events and two
new counter bits.

There are some new counter flags that need to be prevented
from being set on fixed counters, and allowed to be set
for generic counters.

Also we add support for the counter 2 constraint to handle
all raw events.

Contains fixes from Stephane Eranian

v2: Folded TSX bits into standard FIXED_EVENT_CONSTRAINTS
v3: Use SNB LBR init code. Comment fix (Stephane Eranian)
v4: Add the counter2 constraints. Fix comment in the right place.
v5: Expand comment
v6: Add CYCLE_ACTIVITY.* to counter constraints
Reviewed-by: Stephane Eranian 
Signed-off-by: Andi Kleen 
---
 arch/x86/include/asm/perf_event.h  |3 +
 arch/x86/kernel/cpu/perf_event.h   |5 ++-
 arch/x86/kernel/cpu/perf_event_intel.c |   79 
 3 files changed, 86 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h 
b/arch/x86/include/asm/perf_event.h
index 57cb634..b79b6eb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -29,6 +29,9 @@
 #define ARCH_PERFMON_EVENTSEL_INV  (1ULL << 23)
 #define ARCH_PERFMON_EVENTSEL_CMASK0xFF00ULL
 
+#define HSW_INTX   (1ULL << 32)
+#define HSW_INTX_CHECKPOINTED  (1ULL << 33)
+
 #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
 #define AMD64_EVENTSEL_GUESTONLY   (1ULL << 40)
 #define AMD64_EVENTSEL_HOSTONLY(1ULL << 41)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7f5c75c..a356350 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -219,11 +219,14 @@ struct cpu_hw_events {
  *  - inv
  *  - edge
  *  - cnt-mask
+ *  - intx
+ *  - intx_checkpointed
  *  The other filters are supported by fixed counters.
  *  The any-thread option is supported starting with v3.
  */
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_INTX|HSW_INTX_CHECKPOINTED)
 #define FIXED_EVENT_CONSTRAINT(c, n)   \
-   EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
+   EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
 
 /*
  * Constraint on the Event code + UMask
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c 
b/arch/x86/kernel/cpu/perf_event_intel.c
index 529c893..10f3c6c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 
@@ -154,6 +155,23 @@ static struct extra_reg intel_snb_extra_regs[] 
__read_mostly = {
EVENT_EXTRA_END
 };
 
+static struct event_constraint intel_hsw_event_constraints[] =
+{
+   FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+   FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+   FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+   INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */
+   INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+   INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+   /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ 
+   INTEL_EVENT_CONSTRAINT(0x08a3, 0x4),
+   /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ 
+   INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4),
+   /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */  
+   INTEL_EVENT_CONSTRAINT(0x04a3, 0xf),
+   EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
return intel_perfmon_event_map[hw_event];
@@ -1606,6 +1624,47 @@ static void core_pmu_enable_all(int added)
}
 }
 
+static int hsw_hw_config(struct perf_event *event)
+{
+   int ret = intel_pmu_hw_config(event);
+
+   if (ret)
+   return ret;
+   if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+   return 0;
+   event->hw.config |= event->attr.config & 
(HSW_INTX|HSW_INTX_CHECKPOINTED);
+
+   /*
+* INTX/INTX-CP filters are not supported by the Haswell PMU with
+* PEBS or in ANY thread mode. Since the results are non-sensical forbid
+* this combination.
+*/
+   if ((event->hw.config & (HSW_INTX|HSW_INTX_CHECKPOINTED)) &&
+((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+ event->attr.precise_ip > 0))
+   return -EOPNOTSUPP;
+
+   return 0;
+}
+
+static struct event_constraint counter2_constraint =
+   EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+   struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+
+   /* Handle special quirk on intx_checkpointed only in counter 2 */
+   if (event->hw.config & HSW_INTX_CHECKPOINTED) {
+   if (c-

Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 03:28:01PM -0800, Linus Torvalds wrote:
 > Oh, btw, I'm assuming you're testing current git like you usually do.
 > 
 > If so, just to humor me, can you try plain 3.8 (or 3.8.2 or whatever
 > the current stable kernel is). Because while the oopses may be due to
 > your extensions to trinity, let's make sure. Maybe something bad
 > happened in the merge window, and we're just wasting time trying to
 > figure it out, when it could be just bisected...
 > 
 > The pipe and symlink BUG_ON() bugs you've found are likely real and
 > old, so keep the patches for those around, but let's just verify that
 > the BUG_ON(nd->inode != parent->d_inode);" is old too.
 > 
 > I suspect it's a real and old bug, and that you'll be able to recreate
 > it with the stable kernels too, but just to make sure..
 > 
 > Linus

That didn't take long..

[  163.807774] kernel BUG at fs/namei.c:1446!
[  163.807837] invalid opcode:  [#1] SMP 
[  163.807910] Modules linked in: ipt_ULOG af_key irda crc_ccitt nfc pppoe 
pppox ppp_generic slhc atm lockd sunrpc ip6t_REJECT nf_conntrack_ipv6 
nf_defrag_ipv6 xt_conntrack nf_conntrack ip6table_filter ip6_tables 
snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep btusb snd_seq 
snd_seq_device snd_pcm bluetooth ppdev snd_page_alloc rfkill microcode 
usb_debug serio_raw k10temp edac_core edac_mce_amd snd_timer sp5100_tco snd 
soundcore i2c_piix4 r8169 lm63 mii parport_pc parport vhost_net tun macvtap 
macvlan kvm_amd kvm ata_generic pata_acpi firewire_ohci firewire_core 
pata_atiixp crc_itu_t radeon i2c_algo_bit drm_kms_helper ttm drm i2c_core wmi
[  163.809090] CPU 1 
[  163.809126] Pid: 5831, comm: trinity-child1 Not tainted 
3.8.1-201.fc18.x86_64.debug #1 Gigabyte Technology Co., Ltd. 
GA-MA78GM-S2H/GA-MA78GM-S2H
[  163.809303] RIP: 0010:[]  [] 
lookup_slow+0x25/0xad
[  163.809424] RSP: 0018:8800cf895d08  EFLAGS: 00010212
[  163.809499] RAX: 8801129db790 RBX: 8800cf895e48 RCX: 02e0
[  163.809595] RDX: 8800cf895d98 RSI: 8800cf895e58 RDI: 8800cf895e48
[  163.809691] RBP: 8800cf895d38 R08: 0001 R09: 0001
[  163.809787] R10: 0001 R11: 0001 R12: 0010
[  163.809883] R13: 8800cf4e46f0 R14: ff9c R15: 8800cf895e48
[  163.809981] FS:  7f2416788740() GS:88012a80() 
knlGS:
[  163.810091] CS:  0010 DS:  ES:  CR0: 80050033
[  163.810170] CR2: 0000 CR3: 36eda000 CR4: 07e0
[  163.810269] DR0:  DR1:  DR2: 
[  163.810367] DR3:  DR6: 0ff0 DR7: 0400
[  163.810464] Process trinity-child1 (pid: 5831, threadinfo 8800cf894000, 
task 880111ff8000)
[  163.810582] Stack:
[  163.810614]  ea000324ac00  0001 
0010
[  163.810737]   ff9c 8800cf895de8 
811e8ced
[  163.810859]   81a0bb2c 8800cf895d88 
00d0
[  163.810982] Call Trace:
[  163.811026]  [] path_lookupat+0x74d/0x770
[  163.811108]  [] ? kmem_cache_alloc+0xe8/0x350
[  163.811195]  [] ? lock_release_holdtime.part.26+0xf/0x180
[  163.811294]  [] filename_lookup+0x34/0xc0
[  163.811374]  [] user_path_at_empty+0x8e/0x110
[  163.811507]  [] ? sched_clock+0x9/0x10
[  163.811586]  [] ? sched_clock_cpu+0xc5/0x120
[  163.811694]  [] ? trace_hardirqs_off+0xd/0x10
[  163.811779]  [] ? local_clock+0x6f/0x80
[  163.811857]  [] user_path_at+0x11/0x20
[  163.811935]  [] sys_lgetxattr+0x38/0x90
[  163.812014]  [] system_call_fastpath+0x16/0x1b
[  163.812096] Code: 55 48 89 e5 0f 0b 66 66 66 66 90 55 48 89 e5 41 56 41 55 
41 54 53 48 89 fb 48 83 ec 10 4c 8b 6f 08 48 8b 47 30 49 3b 45 30 74 02 <0f> 0b 
48 8d b8 e8 00 00 00 49 89 f6 31 f6 49 89 d4 e8 3d 4d 00 
[  163.812748] RIP  [] lookup_slow+0x25/0xad
[  163.812833]  RSP 
[  163.813040] ---[ end trace f67521cc7acbb145 ]---

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Announce] open ath9k_htc firmware

2013-03-08 Thread Luis R. Rodriguez
Many thanks to Adrian for his hard work on all this and to everyone
else who volunteered to help make this happen, including the
understanding by our management at QCA and even Tensilica requires
some handsome applause for their commitment, understanding on letting
us get this out. We now have a public mailing list for development:

http://lists.infradead.org/mailman/listinfo/ath9k_htc_fw

Full steam ahead!

  Luis

On Fri, Mar 8, 2013 at 3:32 PM, Adrian Chadd  wrote:
> Hi,
>
> I hate fanfare.
>
> You can all thank Luis Rodriguez for his ridiculously long and hard
> fighting to get this stuff pushed through the grinder to make this
> happen.
>
> There's also been a hard working team of developers behind the scene -
> Felix Fietkau, Eugene Krasnikov, Sujith Manoharan, and the team from
> Cozybit. They've been working hard to get the codebase converted over
> to using the open source tensilica toolchain and finding/fixing up any
> silly issues that may creep up from that.
>
> I've just been involved in shepherding this stuff into a sensible
> shape and pushing it through the final stage (legal review) before
> publication.
>
> You can find it here:
>
> https://github.com/qca/open-ath9k-htc-firmware
>
> There's a bunch of sorely needed work to do in the short term:
>
> * felix has a replacement cmake build system that I'd like to now integrate;
> * there's lots of build warnings which need to be removed (and then
> build this thing using -Wall -Werror);
> * some general code cleanup and documentation wouldn't be rejected -
> but I don't really want any sweeping code changes to go in until we've
> tidied up the build warnings and validated that the image(s) work
> right.
>
> I plan on doing some test builds shortly. I plan on bumping the
> firmware to 1.4 once felix's cmake system is in place and people have
> validated that this works.
>
> Thanks,
>
>
>
> Adrian
> --
> To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 3:30 PM, Dave Jones  wrote:
> [  100.729401] nd->last.name = 
> (\xffd2.W._.N.".\xfffe.\xff80.^N.?.\xffe4.E.8.g.\xffd2.N.\xffb6.^G.\xfff1.\xffcc.U.\xffda.^_.h.^M.1.\xffc5.\xff82.%.B.\xffe0.\xffad.^U.8.^L.c.Z.^K.\xffe4.h.J.\xffc8.\xffad.\xff83./.\xff80.\xffd1.\xffe5.\xff87.\xffc3.\xffb2.\xffdc.\xff9d.\xffd1.E.\xffab.^B.

You're printing out a 'char' with %02x, aren't you? And then the sign
extensions gives you six extra 'f' character every time the char is
negative.

If using %02x, make sure you use 'unsigned char', or "& 255".

Depending on just what is going on, the "nd->last.name" thing really
is probably a valid string. Strictly speaking it's not necessarily
NUL-terminated, though, and should be printed out to a maximum of
"nd->last.len" characters (but the whole path will be NUL-terminated,
so it's all ok, you just might print out more than one component)

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 8250.nr_uarts broken in 3.7

2013-03-08 Thread Josh Boyer
On Fri, Mar 08, 2013 at 06:28:09PM -0500, Josh Boyer wrote:
> On Sat, Mar 09, 2013 at 12:14:23AM +0100, Jiri Slaby wrote:
> > On 03/09/2013 12:10 AM, Jiri Slaby wrote:
> > > On 03/08/2013 11:58 PM, Jiri Slaby wrote:
> > >> On 03/08/2013 11:49 PM, Josh Boyer wrote:
> > >>> On Fri, Mar 08, 2013 at 11:47:01PM +0100, Jiri Slaby wrote:
> >  Yeah, I agree this is ugly. Just re-defining MODULE_PARAM_PREFIX at
> >  the end of the file should do the trick (followed by
> >  "module_param(nr_uarts, uint, 0644)").
> > >>>
> > >>> For some reason, I thought I had tried that.  Maybe I didn't.  I'll look
> > >>> into it again.
> > >>
> > >> I see. Because we would re-define some global variables. What if we put
> > >> module_param into a function?
> > > 
> > > Something like this?
> > > #ifdef MODULE
> 
> I don't think you want this surrounded in #ifdef MODULE, do you?  That
> won't let people building the driver into the kernel continue to use
> 8250. on the kernel command line.
> 
> > > static void __unused splat(void) {
> > 
> > I meant __used. It should make no difference though.
> > 
> > > #   undef MODULE_PARAM_PREFIX
> > > #   define MODULE_PARAM_PREFIX "8250."
> > > module_param_cb(nr_uarts, &param_ops_uint, &nr_uarts, 0644);
> > > ...
> > > }
> > > #endif
> > > 
> > > Not nice, but should work. The other way is to have those in a separate
> > > file linked to 8250 (to avoid re-definition errors).
> 
> Ew.  I'll try the function first.

OK, the function (without the surrounding ifdef) seems to be working OK.
I'll do a bit more testing and send out a v2 in a bit.

Thanks for the tip.  It's still not pretty, but at least I don't feel
ashamed about it.

josh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3] fat: editions to support fat_fallocate

2013-03-08 Thread Andrew Morton
On Thu,  7 Mar 2013 22:56:57 +0900 Namjae Jeon  wrote:

> From: Namjae Jeon 
> 
> Implement preallocation via the fallocate syscall on VFAT partitions.
> 
> Change Log:
> v3: Release preallocated blocks at file release.
> 
> With FALLOC_FL_KEEP_SIZE, there is no way to distinguish if the mismatch
> between i_size and no. of clusters allocated is a consequence of
> fallocate or just plain corruption. When a non fallocate aware (old)
> linux fat driver tries to write to such a file, it throws an error.
> Also, fsck detects this as inconsistency and truncates the prealloc'd blocks.
> 
> To avoid this, as suggested by OGAWA, remove changes that make fallocate
> persistent across mounts and restrict lifetime of blocks from
> fallocate(2) to file release.
> 
> v2: On an area preallocated with FALLOC_FL_KEEP_SIZE, when a seek was
> done to an offset beyond i_size, the old (garbage) data was exposed as
> we did not zero out the area at allocation time. Added
> fat_zero_falloc_area() to fix this.
> 
> v1: Reworked an earlier patch of the same name
> (https://lkml.org/lkml/2007/12/22/130) to fix some bugs:
> i)Preallocated space was not persistent and was lost on remount. Fixed
> it.
> ii)Did not zero out allocated clusters when FALLOC_FL_KEEP_SIZE was set,
> thereby speeding up preallocation time.
> 
> ...
>
> --- a/fs/fat/file.c
> +++ b/fs/fat/file.c
> @@ -17,8 +17,11 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "fat.h"
>  
> +static long fat_fallocate(struct file *file, int mode,
> + loff_t offset, loff_t len);
>  static int fat_ioctl_get_attributes(struct inode *inode, u32 __user 
> *user_attr)
>  {
>   u32 attr;
> @@ -140,6 +143,12 @@ static long fat_generic_compat_ioctl(struct file *filp, 
> unsigned int cmd,
>  
>  static int fat_file_release(struct inode *inode, struct file *filp)
>  {
> + struct super_block *sb = inode->i_sb;
> + loff_t mmu_private_ideal = (inode->i_size + (sb->s_blocksize-1)) &
> + ~(sb->s_blocksize-1);

Stylistically, it looks better to do

loff_t mmu_private_ideal;

mmu_private_ideal = (inode->i_size + (sb->s_blocksize-1)) &
~(sb->s_blocksize-1);

Note the blank line between end-of-definitions and start-of-code.  The
patch fails to do this in numerous places.

Also, I think and hope we can use round_up() here.

And we're not using i_size_read().  Probably that's OK if it is
guaranteed that fat_file_release() is always called under i_mutex, but
I might have forgotten the rules there.


> + if (mmu_private_ideal < MSDOS_I(inode)->mmu_private &&
> + filp->f_dentry->d_count == 1)
> + fat_truncate_blocks(inode, inode->i_size);

I suggest that a comment be added here.  It is unobvious why this code
is here, and what role d_count plays.

>   if ((filp->f_mode & FMODE_WRITE) &&
>MSDOS_SB(inode->i_sb)->options.flush) {
>   fat_flush_inodes(inode->i_sb, inode, NULL);
> @@ -174,6 +183,7 @@ const struct file_operations fat_file_operations = {
>  #endif
>   .fsync  = fat_file_fsync,
>   .splice_read= generic_file_splice_read,
> + .fallocate  = fat_fallocate,
>  };
>  
>  static int fat_cont_expand(struct inode *inode, loff_t size)
> @@ -211,7 +221,78 @@ static int fat_cont_expand(struct inode *inode, loff_t 
> size)
>  out:
>   return err;
>  }
> +/*
> + * preallocate space for a file. This implements fat's fallocate file
> + * operation, which gets called from sys_fallocate system call. User
> + * space requests len bytes at offset.If FALLOC_FL_KEEP_SIZE is set
> + * we just allocate clusters without zeroing them out.Otherwise we
> + * allocate and zero out clusters via an expanding truncate.

This comment is a bit lazy :( Capital letters at the start of
sentences, a space after a full stop etc, please.

> + */
> +static long fat_fallocate(struct file *file, int mode,
> + loff_t offset, loff_t len)
> +{
> + int err = 0;
> + struct inode *inode = file->f_mapping->host;
> + int cluster, nr_cluster, fclus, dclus, free_bytes, nr_bytes;

I'm rather allergic to multiple-definitions-on-one-line like this. 
They make the code harder to read and they result in messy patch resolution
efforts.  Most significantly, one-definition-per-line leaves a little
room on the right for a brief comment explaining the variable's role. 
Such comments appear to be needed in this function!

Are you sure that `int' is the best type for all these?  Do they need
to be signed?  For example nr_bytes and free_bytes are derived from
loff_t's and it is unobvious that there is no risk of overflowing.


> + struct super_block *sb = inode->i_sb;
> + struct msdos_sb_info *sbi = MSDOS_SB(sb);
> +
> + /* No support for hole punch or other fallocate flags. */
> + if (mode & ~FALLOC_FL_KEEP_SIZE)
> + return -EOPNOTSUPP;
> +
> + if ((offset + len

Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 03:28:01PM -0800, Linus Torvalds wrote:
 > Oh, btw, I'm assuming you're testing current git like you usually do.
 
yes.

 > If so, just to humor me, can you try plain 3.8 (or 3.8.2 or whatever
 > the current stable kernel is). Because while the oopses may be due to
 > your extensions to trinity, let's make sure. Maybe something bad
 > happened in the merge window, and we're just wasting time trying to
 > figure it out, when it could be just bisected...

Sure, will do.

 > The pipe and symlink BUG_ON() bugs you've found are likely real and
 > old, so keep the patches for those around, but let's just verify that
 > the BUG_ON(nd->inode != parent->d_inode);" is old too.
 > 
 > I suspect it's a real and old bug, and that you'll be able to recreate
 > it with the stable kernels too, but just to make sure..

Hopefully I won't uncover anything new.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 03:20:40PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 3:07 PM, Dave Jones  wrote:
 > >
 > > Ok, got something more meaningful out of the lookup_slow trace.
 > >
 > > [   66.082984] parent->dname.name  (06b6b6b6b6b6b6b)
 > > [   66.083637] parent =
 > >
 > > At first I thought AH-HA! SLAB POISON!
 > > But look closer.. it's shifted by 8 bits.
 > 
 > Or just the high byte has been cleared.

Duh, yes. I wasn't zero padding, so that's 006b6b6b.

Here's another run, with something similar in path->dentry..

[  100.729395] parent->dname.name  (06 b6 b6b 6b6b6b6b 6b6b6b6b 6b6b6b6b)
[  100.729397] path->dentry: 6d6e740 6b6b6b6b
[  100.729401] nd->last.name = 
(\xffd2.W._.N.".\xfffe.\xff80.^N.?.\xffe4.E.8.g.\xffd2.N.\xffb6.^G.\xfff1.\xffcc.U.\xffda.^_.h.^M.1.\xffc5.\xff82.%.B.\xffe0.\xffad.^U.8.^L.c.Z.^K.\xffe4.h.J.\xffc8.\xffad.\xff83./.\xff80.\xffd1.\xffe5.\xff87.\xffc3.\xffb2.\xffdc.\xff9d.\xffd1.E.\xffab.^B.
(garbage continues..)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 8250.nr_uarts broken in 3.7

2013-03-08 Thread Josh Boyer
On Sat, Mar 09, 2013 at 12:14:23AM +0100, Jiri Slaby wrote:
> On 03/09/2013 12:10 AM, Jiri Slaby wrote:
> > On 03/08/2013 11:58 PM, Jiri Slaby wrote:
> >> On 03/08/2013 11:49 PM, Josh Boyer wrote:
> >>> On Fri, Mar 08, 2013 at 11:47:01PM +0100, Jiri Slaby wrote:
>  Yeah, I agree this is ugly. Just re-defining MODULE_PARAM_PREFIX at
>  the end of the file should do the trick (followed by
>  "module_param(nr_uarts, uint, 0644)").
> >>>
> >>> For some reason, I thought I had tried that.  Maybe I didn't.  I'll look
> >>> into it again.
> >>
> >> I see. Because we would re-define some global variables. What if we put
> >> module_param into a function?
> > 
> > Something like this?
> > #ifdef MODULE

I don't think you want this surrounded in #ifdef MODULE, do you?  That
won't let people building the driver into the kernel continue to use
8250. on the kernel command line.

> > static void __unused splat(void) {
> 
> I meant __used. It should make no difference though.
> 
> > #   undef MODULE_PARAM_PREFIX
> > #   define MODULE_PARAM_PREFIX "8250."
> > module_param_cb(nr_uarts, &param_ops_uint, &nr_uarts, 0644);
> > ...
> > }
> > #endif
> > 
> > Not nice, but should work. The other way is to have those in a separate
> > file linked to 8250 (to avoid re-definition errors).

Ew.  I'll try the function first.

josh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
Oh, btw, I'm assuming you're testing current git like you usually do.

If so, just to humor me, can you try plain 3.8 (or 3.8.2 or whatever
the current stable kernel is). Because while the oopses may be due to
your extensions to trinity, let's make sure. Maybe something bad
happened in the merge window, and we're just wasting time trying to
figure it out, when it could be just bisected...

The pipe and symlink BUG_ON() bugs you've found are likely real and
old, so keep the patches for those around, but let's just verify that
the BUG_ON(nd->inode != parent->d_inode);" is old too.

I suspect it's a real and old bug, and that you'll be able to recreate
it with the stable kernels too, but just to make sure..

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] [SCSI] scsi_transport_sas: check for allocation failure

2013-03-08 Thread Douglas Gilbert

On 13-03-08 05:50 PM, James Bottomley wrote:

On Fri, 2013-03-08 at 12:57 -0500, Douglas Gilbert wrote:

On 13-03-08 07:02 AM, Dan Carpenter wrote:

Static checkers complain that this allocation isn't checked.  We
should return zero if the allocation fails.

Signed-off-by: Dan Carpenter 

diff --git a/drivers/scsi/scsi_transport_sas.c 
b/drivers/scsi/scsi_transport_sas.c
index 1b68142..a022997 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -379,9 +379,12 @@ sas_tlr_supported(struct scsi_device *sdev)
   {
const int vpd_len = 32;
struct sas_end_device *rdev = sas_sdev_to_rdev(sdev);
-   char *buffer = kzalloc(vpd_len, GFP_KERNEL);
+   char *buffer;
int ret = 0;

+   buffer = kzalloc(vpd_len, GFP_KERNEL);
+   if (!buffer)
+   goto out;
if (scsi_get_vpd_page(sdev, 0x90, buffer, vpd_len))
goto out;



For 32 bytes, why not use the stack?


Because the buffer is a DMA target.  You can't DMA to stack because of
padding and cacheline issues.


And I went to the definition of scsi_get_vpd_page()
to see if that was called out in the header comments.
Guess what ... and those same header comments talked
about freeing a returned pointer. It needs to be
cleaned up, IMO.

Doug Gilbert

/**
 * scsi_get_vpd_page - Get Vital Product Data from a SCSI device
 * @sdev: The device to ask
 * @page: Which Vital Product Data to return
 * @buf: where to store the VPD
 * @buf_len: number of bytes in the VPD buffer area
 *
 * SCSI devices may optionally supply Vital Product Data.  Each 'page'
 * of VPD is defined in the appropriate SCSI document (eg SPC, SBC).
 * If the device supports this VPD page, this routine returns a pointer
 * to a buffer containing the data from that page.  The caller is
 * responsible for calling kfree() on this pointer when it is no longer
 * needed.  If we cannot retrieve the VPD page this routine returns %NULL.
 */
int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf,
  int buf_len)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf: Enable function tracing for perf core

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

According to Steven R. there is no reason left to not support
function tracing for the perf core. This makes it easier to debug
perf.

Don't remove -pg for the x86 and generic perf core.

Cc: rost...@goodmis.org
Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/Makefile |1 -
 kernel/events/Makefile   |4 
 2 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0e067d..8eb5d28 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,7 +5,6 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 103f5d1..a630994 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,7 +1,3 @@
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = -pg
-endif
-
 obj-y := core.o ring_buffer.o callchain.o
 
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf, x86: Allow setting period 1

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

I had some requests for setting period 1, so that every event of something
is caught.  To my knowledge there is no limit to 1 on Intel hardware.
Just remove the check for minimum 2

If specific CPUs have problems we can black list them.

Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/perf_event.c |5 -
 1 files changed, 0 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bf0f01a..2b394ae 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -946,11 +946,6 @@ int x86_perf_event_set_period(struct perf_event *event)
hwc->last_period = period;
ret = 1;
}
-   /*
-* Quirk: certain CPUs dont like it if just 1 hw_event is left:
-*/
-   if (unlikely(left < 2))
-   left = 2;
 
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf, x86: Add Sandy Bridge constraints for CYCLE_ACTIVITY.*

2013-03-08 Thread Andi Kleen
From: Andi Kleen 

Add CYCLE_ACTIVITY.CYCLES_NO_DISPATCH/CYCLES_L1D_PENDING
These recently documented events have restrictions to counter 0-3
and counter 2 respectively.  The scheduler needs to know that
to schedule them correctly.

IvyBridge already has the necessary constraints.

Signed-off-by: Andi Kleen 
---
 arch/x86/kernel/cpu/perf_event_intel.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c 
b/arch/x86/kernel/cpu/perf_event_intel.c
index 5b59c6c..0d2f9d8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -104,6 +104,8 @@ static struct event_constraint 
intel_snb_event_constraints[] __read_mostly =
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+   INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* 
CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+   INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* 
CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
EVENT_CONSTRAINT_END
 };
 
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: kswapd craziness round 2

2013-03-08 Thread Jiri Slaby
On 03/08/2013 07:42 AM, Hillf Danton wrote:
> On Fri, Mar 8, 2013 at 3:37 AM, Jiri Slaby  wrote:
>> On 03/01/2013 03:02 PM, Hillf Danton wrote:
>>> On Fri, Mar 1, 2013 at 1:02 AM, Jiri Slaby  wrote:

 Ok, no difference, kswap is still crazy. I'm attaching the output of
 "grep -vw '0' /proc/vmstat" if you see something there.

>>> Thanks to you for test and data.
>>>
>>> Lets try to restore the deleted nap, then.
>>
>> Oh, it seems to be nice now:
>> root   579  0.0  0.0  0 0 ?SMar04   0:13 [kswapd0]
>>
> Double thanks.

There is one downside. I'm not sure whether that patch was the culprit.
My Thunderbird is jerky when scrolling and lags while writing this
message. The letters sometimes appear later than typed and in groups. Like
I (kbd): My Thunder
TB: My Thunder
I (kbd): b-i-r-d
TB: is silent
I (kbd): still typing...
TB: bird is

Perhaps it's not only TB.

> But Mel does not like it, probably.
> Lets try nap in another way.

Will try next week.

> --- a/mm/vmscan.c Thu Feb 21 20:01:02 2013
> +++ b/mm/vmscan.c Fri Mar  8 14:36:10 2013
> @@ -2793,6 +2793,10 @@ loop_again:
>* speculatively avoid congestion waits
>*/
>   zone_clear_flag(zone, ZONE_CONGESTED);
> +
> + else if (sc.priority > 2 &&
> +  sc.priority < DEF_PRIORITY - 2)
> + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
>   }
> 
>   /*

-- 
js
suse labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Linus Torvalds
On Fri, Mar 8, 2013 at 3:07 PM, Dave Jones  wrote:
>
> Ok, got something more meaningful out of the lookup_slow trace.
>
> [   66.082984] parent->dname.name  (06b6b6b6b6b6b6b)
> [   66.083637] parent =
>
> At first I thought AH-HA! SLAB POISON!
> But look closer.. it's shifted by 8 bits.

Or just the high byte has been cleared.

But yeah, if the parent has been free'd then that certainly explains
why the "impossible" test of

  nd->inode != parent->d_inode

would trigger. And it would explain any odd crashes at lookup time
too. In particular, the NULL pointer one you reference seems to be
dir->i_op->lookup being NULL, so calling it (understandable) ends up
doing bad things.

I really don't understand how the parent could be free'd early.
Dentries are freed by RCU, and the dentry lookup code is some of the
most well-tested out there. I don't see how /proc could mess that up,
unless it just completely screws up some refcounting thing or other.

 Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/1] kernel/pid.c: Improve flow of a loop inside alloc_pidmap.

2013-03-08 Thread Raphael S.Carvalho
Notes: find_next_offset searches for an available "cleaned bit"
in the respective pid bitmap (page), so returns the offset if found,
otherwise it returns a value equal to BITS_PER_PAGE (invalid offset).

For example, suppose find_next_offset didn't find any available
bit, so there's no purpose to call mk_pid (Wasteful Cpu Cycles)
since it only computes a new PID based on a *valid* offset of 
the current map.

Therefore, I found it could be better to call mk_pid only after
the check (offset < BITS_PER_PAGE) has returned successfully!

Signed-off-by: Raphael S.Carvalho 
---
 kernel/pid.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 047dc62..7ecb09a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -190,8 +190,8 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return pid;
}
offset = find_next_offset(map, offset);
-   pid = mk_pid(pid_ns, map, offset);
-   } while (offset < BITS_PER_PAGE && pid < pid_max);
+   } while (offset < BITS_PER_PAGE &&
+   (pid = mk_pid(pid_ns, map, offset)) < pid_max);
}
if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH wq/for-3.9-fixes] workqueue: fix possible pool stall bug in wq_unbind_fn()

2013-03-08 Thread Tejun Heo
From: Lai Jiangshan 

Since multiple pools per cpu have been introduced, wq_unbind_fn() has
a subtle bug which may theoretically stall work item processing.  The
problem is two-fold.

* wq_unbind_fn() depends on the worker executing wq_unbind_fn() itself
  to start unbound chain execution, which works fine when there was
  only single pool.  With multiple pools, only the pool which is
  running wq_unbind_fn() - the highpri one - is guaranteed to have
  such kick-off.  The other pool could stall when its busy workers
  block.

* The current code is setting WORKER_UNBIND / POOL_DISASSOCIATED of
  the two pools in succession without initiating work execution
  in between.  Because setting the flags requires grabbing assoc_mutex
  which is held while new workers are created, this could lead to
  stalls if a pool's manager is waiting for the previous pool's work
  items to release memory.  This is almost purely theoretical tho.

Update wq_unbind_fn() such that it sets WORKER_UNBIND /
POOL_DISASSOCIATED, goes over schedule() and explicitly kicks off
execution for a pool and then moves on to the next one.

tj: Updated comments and description.

Signed-off-by: Lai Jiangshan 
Signed-off-by: Tejun Heo 
Cc: sta...@vger.kernel.org
---
As you seem to have disappeared, I just fixed up this patch and
applied it to wq/for-3.9-fixes.

Thanks.

 kernel/workqueue.c |   44 +---
 1 file changed, 25 insertions(+), 19 deletions(-)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3446,28 +3446,34 @@ static void wq_unbind_fn(struct work_str
 
spin_unlock_irq(&pool->lock);
mutex_unlock(&pool->assoc_mutex);
-   }
 
-   /*
-* Call schedule() so that we cross rq->lock and thus can guarantee
-* sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
-* as scheduler callbacks may be invoked from other cpus.
-*/
-   schedule();
+   /*
+* Call schedule() so that we cross rq->lock and thus can
+* guarantee sched callbacks see the %WORKER_UNBOUND flag.
+* This is necessary as scheduler callbacks may be invoked
+* from other cpus.
+*/
+   schedule();
 
-   /*
-* Sched callbacks are disabled now.  Zap nr_running.  After this,
-* nr_running stays zero and need_more_worker() and keep_working()
-* are always true as long as the worklist is not empty.  Pools on
-* @cpu now behave as unbound (in terms of concurrency management)
-* pools which are served by workers tied to the CPU.
-*
-* On return from this function, the current worker would trigger
-* unbound chain execution of pending work items if other workers
-* didn't already.
-*/
-   for_each_std_worker_pool(pool, cpu)
+   /*
+* Sched callbacks are disabled now.  Zap nr_running.
+* After this, nr_running stays zero and need_more_worker()
+* and keep_working() are always true as long as the
+* worklist is not empty.  This pool now behaves as an
+* unbound (in terms of concurrency management) pool which
+* are served by workers tied to the pool.
+*/
atomic_set(&pool->nr_running, 0);
+
+   /*
+* With concurrency management just turned off, a busy
+* worker blocking could lead to lengthy stalls.  Kick off
+* unbound chain execution of currently pending work items.
+*/
+   spin_lock_irq(&pool->lock);
+   wake_up_worker(pool);
+   spin_unlock_irq(&pool->lock);
+   }
 }
 
 /*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 06:07:34PM -0500, Dave Jones wrote:
 > parent seems to be a pointer to "\0".

Ignore that last part, it's wrong.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 8250.nr_uarts broken in 3.7

2013-03-08 Thread Jiri Slaby
On 03/09/2013 12:10 AM, Jiri Slaby wrote:
> On 03/08/2013 11:58 PM, Jiri Slaby wrote:
>> On 03/08/2013 11:49 PM, Josh Boyer wrote:
>>> On Fri, Mar 08, 2013 at 11:47:01PM +0100, Jiri Slaby wrote:
 Yeah, I agree this is ugly. Just re-defining MODULE_PARAM_PREFIX at
 the end of the file should do the trick (followed by
 "module_param(nr_uarts, uint, 0644)").
>>>
>>> For some reason, I thought I had tried that.  Maybe I didn't.  I'll look
>>> into it again.
>>
>> I see. Because we would re-define some global variables. What if we put
>> module_param into a function?
> 
> Something like this?
> #ifdef MODULE
> static void __unused splat(void) {

I meant __used. It should make no difference though.

> #   undef MODULE_PARAM_PREFIX
> #   define MODULE_PARAM_PREFIX "8250."
> module_param_cb(nr_uarts, &param_ops_uint, &nr_uarts, 0644);
> ...
> }
> #endif
> 
> Not nice, but should work. The other way is to have those in a separate
> file linked to 8250 (to avoid re-definition errors).
> 
> thanks,
-- 
js
suse labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 8250.nr_uarts broken in 3.7

2013-03-08 Thread Josh Boyer
On Fri, Mar 08, 2013 at 11:58:39PM +0100, Jiri Slaby wrote:
> On 03/08/2013 11:49 PM, Josh Boyer wrote:
> > On Fri, Mar 08, 2013 at 11:47:01PM +0100, Jiri Slaby wrote:
> >> Yeah, I agree this is ugly. Just re-defining MODULE_PARAM_PREFIX at
> >> the end of the file should do the trick (followed by
> >> "module_param(nr_uarts, uint, 0644)").
> > 
> > For some reason, I thought I had tried that.  Maybe I didn't.  I'll look
> > into it again.
> 
> I see. Because we would re-define some global variables. What if we put

Right.  For the peanut gallery, you get this error:

CC  drivers/tty/serial/8250/8250.o
drivers/tty/serial/8250/8250.c:3351:1: error: redefinition of 
‘__check_share_irqs’
drivers/tty/serial/8250/8250.c::1: note: previous definition of 
‘__check_share_irqs’ was here
drivers/tty/serial/8250/8250.c:3351:1: error: redefinition of 
‘__param_perm_check_share_irqs’
drivers/tty/serial/8250/8250.c::1: note: previous definition of 
‘__param_perm_check_share_irqs’ was here
drivers/tty/serial/8250/8250.c:3351:1: error: redefinition of 
‘__param_str_share_irqs’
drivers/tty/serial/8250/8250.c::1: note: previous definition of 
‘__param_str_share_irqs’ was here
drivers/tty/serial/8250/8250.c:3351:1: error: redefinition of 
‘__param_share_irqs’
drivers/tty/serial/8250/8250.c::1: note: previous definition of 
‘__param_share_irqs’ was here

for each variable you redefine like that.

> module_param into a function?

Not sure what you mean by that.

josh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: 8250.nr_uarts broken in 3.7

2013-03-08 Thread Jiri Slaby
On 03/08/2013 11:58 PM, Jiri Slaby wrote:
> On 03/08/2013 11:49 PM, Josh Boyer wrote:
>> On Fri, Mar 08, 2013 at 11:47:01PM +0100, Jiri Slaby wrote:
>>> Yeah, I agree this is ugly. Just re-defining MODULE_PARAM_PREFIX at
>>> the end of the file should do the trick (followed by
>>> "module_param(nr_uarts, uint, 0644)").
>>
>> For some reason, I thought I had tried that.  Maybe I didn't.  I'll look
>> into it again.
> 
> I see. Because we would re-define some global variables. What if we put
> module_param into a function?

Something like this?
#ifdef MODULE
static void __unused splat(void) {
#   undef MODULE_PARAM_PREFIX
#   define MODULE_PARAM_PREFIX "8250."
module_param_cb(nr_uarts, &param_ops_uint, &nr_uarts, 0644);
...
}
#endif

Not nice, but should work. The other way is to have those in a separate
file linked to 8250 (to avoid re-definition errors).

thanks,
-- 
js
suse labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG_ON(nd->inode != parent->d_inode);

2013-03-08 Thread Dave Jones
On Fri, Mar 08, 2013 at 02:41:19PM -0800, Linus Torvalds wrote:
 > On Fri, Mar 8, 2013 at 1:04 PM, Dave Jones  wrote:
 > >
 > > queue up the sad trombone noises.
 > >
 > > One of the things trinity passes syscalls is a page of deformed unicode.
 > > Apparently this page is so fucked up, that it crashes *printk*.
 > 
 > It's probably my debug stuff that is bogus. One of the string pointers
 > passed to printk %s seems bad, and instead of being a proper kernel
 > pointer it's "0xaf0f48ef7bdef7bd". So trying to access it causes a GP
 > fault (it's not a validly formed pointer)
 > 
 > I'm not seeing what the problem is, but I'll mull on it..

Ok, got something more meaningful out of the lookup_slow trace.

[   66.082984] parent->dname.name  (06b6b6b6b6b6b6b)
[   66.083637] parent = 

At first I thought AH-HA! SLAB POISON!
But look closer.. it's shifted by 8 bits.

Also, this isn't a pointer, that's the output of..

printk("parent->dname.name %s (%x%x%x%x%x%x%x%x)\n", 
parent->d_name.name,
parent->d_name.name[0],
parent->d_name.name[1],
parent->d_name.name[2],
parent->d_name.name[3],
parent->d_name.name[4],
parent->d_name.name[5],
parent->d_name.name[6],
parent->d_name.name[7]);

Interestingly, that pattern always seems to be the same across different 
reboots.

parent seems to be a pointer to "\0".


Another oddball crash..



BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [<  (null)>]   (null)
PGD 10f366067 PUD 10f36c067 PMD 0 
Oops: 0010 [#1] PREEMPT SMP 
Modules linked in: can_bcm irda pppoe pppox ppp_generic can slhc af_802154 atm 
rds af_key phonet nfc ipx p8023 p8022 af_rxrpc caif_socket caif crc_ccitt 
decnet netrom appletalk x25 psnap llc rose ax25 lockd sunrpc ip6t_REJECT 
nf_conntrack_ipv6 nf_defrag_ipv6 xt_conntrack nf_conntrack ip6table_filter 
ip6_tables snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_pcm btusb 
bluetooth usb_debug snd_page_alloc microcode snd_timer rfkill snd serio_raw 
pcspkr edac_core soundcore r8169 mii vhost_net tun macvtap macvlan kvm_amd kvm 
radeon backlight drm_kms_helper ttm
CPU 0 
Pid: 822, comm: trinity-child0 Not tainted 3.9.0-rc1+ #85 Gigabyte Technology 
Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H
RIP: 0010:[<>]  [<  (null)>]   (null)
RSP: 0018:88010f033db0  EFLAGS: 00010246
RAX: 8181f540 RBX: 8800cf49c940 RCX: 
RDX: 0600 RSI: 8800cf49c940 RDI: 8800cf492170
RBP: 88010f033dd8 R08: 0001 R09: 
R10: 0001 R11:  R12: 8800cf49f530
R13: 88010f033f28 R14:  R15: a000
FS:  7f29dffed740() GS:88012ae0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 00010f365000 CR4: 07f0
DR0:  DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0400
Process trinity-child0 (pid: 822, threadinfo 88010f032000, task 
8801103ec920)
Stack:
 811c5fad 8800cf49f530 88010f033f28 
 0600 88010f033e08 811c61c8 88010f033e18
 0100   88010f033e18
Call Trace:
 [] ? lookup_real+0x1d/0x60
 [] __lookup_hash+0x38/0x50
 [] lookup_hash+0x19/0x20
 [] kern_path_create+0x95/0x170
 [] ? getname_flags.part.33+0x86/0x150
 [] ? 0x9fff
 [] user_path_create+0x4a/0x70
 [] sys_mknodat+0xac/0x1d0
 [] ? trace_hardirqs_on_thunk+0x3a/0x3f
 [] ? 0x9fff
 [] sys_mknod+0x1d/0x20
 [] system_call_fastpath+0x16/0x1b
 [] ? 0x9fff
Code:  Bad RIP value.
RIP  [<  (null)>]   (null)
 RSP 
CR2: 
---[ end trace f74c72ebf05e8c15 ]---

I'll add some similar printk's to lookup_real and see what falls out.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH V7 -next 00/33] loop: Issue O_DIRECT aio using bio_vec

2013-03-08 Thread Dave Kleikamp
Zach,
I've had to alter some of these patches to build against the linux-next
due to your cleanup of the aio code. Could you please look at patches
14, 15 and 19 in particular to check if I did anything insane?

This patch series adds a kernel interface to fs/aio.c so that kernel code can
issue concurrent asynchronous IO to file systems.  It adds an aio command and
file system methods which specify io memory with pages instead of userspace
addresses.

This series was written to reduce the current overhead loop imposes by
performing synchronous buffered file system IO from a kernel thread.  These
patches turn loop into a light weight layer that translates bios into iocbs.

It introduces new file ops, read_iter() and write_iter(), that replace the
aio_read() and aio_write() operations. The iov_iter structure can now contain
either a user-space iovec or a kernel-space bio_vec. Since it would be
overly complicated to replace every instance of aio_read() and aio_write(),
the old operations are not removed, but file systems implementing the new
ones need not keep the old ones.

Asias He (1):
  block_dev: add support for read_iter, write_iter

Dave Kleikamp (21):
  iov_iter: iov_iter_copy_from_user() should use non-atomic copy
  iov_iter: add __iovec_copy_to_user()
  fuse: convert fuse to use iov_iter_copy_[to|from]_user
  iov_iter: ii_iovec_copy_to_user should pre-fault user pages
  dio: Convert direct_IO to use iov_iter
  dio: add bio_vec support to __blockdev_direct_IO()
  aio: add aio_kernel_() interface
  aio: add aio support for iov_iter arguments
  fs: create file_readable() and file_writable() functions
  fs: use read_iter and write_iter rather than aio_read and aio_write
  fs: add read_iter and write_iter to several file systems
  ext4: add support for read_iter and write_iter
  nfs: add support for read_iter, write_iter
  nfs: simplify swap
  btrfs: add support for read_iter and write_iter
  xfs: add support for read_iter and write_iter
  gfs2: Convert aio_read/write ops to read/write_iter
  udf: convert file ops from aio_read/write to read/write_iter
  afs: add support for read_iter and write_iter
  ecryptfs: Convert aio_read/write ops to read/write_iter
  ubifs: convert file ops from aio_read/write to read/write_iter

Hugh Dickins (1):
  tmpfs: add support for read_iter and write_iter

Zach Brown (10):
  iov_iter: move into its own file
  iov_iter: add copy_to_user support
  iov_iter: hide iovec details behind ops function pointers
  iov_iter: add bvec support
  iov_iter: add a shorten call
  iov_iter: let callers extract iovecs and bio_vecs
  fs: pull iov_iter use higher up the stack
  bio: add bvec_length(), like iov_length()
  loop: use aio to perform io on the underlying file
  ocfs2: add support for read_iter, write_iter, and direct_IO_bvec

 Documentation/filesystems/Locking|   6 +-
 Documentation/filesystems/vfs.txt|  12 +-
 drivers/block/loop.c | 148 
 drivers/char/raw.c   |   4 +-
 drivers/mtd/nand/nandsim.c   |   4 +-
 drivers/staging/ccg/storage_common.c |   4 +-
 drivers/usb/gadget/storage_common.c  |   4 +-
 fs/9p/vfs_addr.c |  12 +-
 fs/9p/vfs_file.c |   8 +-
 fs/Makefile  |   2 +-
 fs/adfs/file.c   |   4 +-
 fs/affs/file.c   |   4 +-
 fs/afs/file.c|   4 +-
 fs/afs/internal.h|   3 +-
 fs/afs/write.c   |   9 +-
 fs/aio.c | 153 -
 fs/bad_inode.c   |  14 ++
 fs/bfs/file.c|   4 +-
 fs/block_dev.c   |  27 ++-
 fs/btrfs/file.c  |  42 ++--
 fs/btrfs/inode.c |  63 +++--
 fs/ceph/addr.c   |   3 +-
 fs/cifs/file.c   |   4 +-
 fs/compat.c  |  10 +-
 fs/direct-io.c   | 223 --
 fs/ecryptfs/file.c   |  15 +-
 fs/exofs/file.c  |   4 +-
 fs/ext2/file.c   |   4 +-
 fs/ext2/inode.c  |   8 +-
 fs/ext3/file.c   |   4 +-
 fs/ext3/inode.c  |  15 +-
 fs/ext4/ext4.h   |   3 +-
 fs/ext4/file.c   |  34 +--
 fs/ext4/indirect.c   |  16 +-
 fs/ext4/inode.c  |  23 +-
 fs/f2fs/data.c   |   4 +-
 fs/f2fs/file.c   |   4 +-
 fs/fat/file.c|   4 +-
 fs/fat/inode.c   |  10 +-
 fs/fuse/cuse.c   |  10 +-
 fs/fuse/file.c   |  82 +++
 fs/fuse/fuse_i.h |   5 +-
 fs/gfs2/aops.c   |   7 +-
 fs/gfs2/file.c   |  21 +-
 fs/hfs/inode.c   |  11 +-
 fs/hfsplus/inode.c   

[PATCH V7 -next 14/33] aio: add aio_kernel_() interface

2013-03-08 Thread Dave Kleikamp
This adds an interface that lets kernel callers submit aio iocbs without
going through the user space syscalls.  This lets kernel callers avoid
the management limits and overhead of the context.  It will also let us
integrate aio operations with other kernel apis that the user space
interface doesn't have access to.

Signed-off-by: Dave Kleikamp 
Cc: Zach Brown 
---
 fs/aio.c| 80 +
 include/linux/aio.h | 17 +++-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index eb99ac1..6dd3a4e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -822,6 +822,9 @@ void aio_complete_batch(struct kiocb *req, long res, long 
res2,
req->ki_user_data = req->ki_res;
atomic_set(&req->ki_users, 0);
wake_up_process(req->ki_obj.tsk);
+   } else if (is_kernel_kiocb(req)) {
+   req->ki_obj.complete(req->ki_user_data, res);
+   aio_kernel_free(req);
} else if (batch) {
int res;
struct kiocb *t;
@@ -1225,6 +1228,83 @@ rw_common:
return 0;
 }
 
+/*
+ * This allocates an iocb that will be used to submit and track completion of
+ * an IO that is issued from kernel space.
+ *
+ * The caller is expected to call the appropriate aio_kernel_init_() functions
+ * and then call aio_kernel_submit().  From that point forward progress is
+ * guaranteed by the file system aio method.  Eventually the caller's
+ * completion callback will be called.
+ *
+ * These iocbs are special.  They don't have a context, we don't limit the
+ * number pending, and they can't be canceled.
+ */
+struct kiocb *aio_kernel_alloc(gfp_t gfp)
+{
+   return kzalloc(sizeof(struct kiocb), gfp);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_alloc);
+
+void aio_kernel_free(struct kiocb *iocb)
+{
+   kfree(iocb);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_free);
+
+/*
+ * ptr and count can be a buff and bytes or an iov and segs.
+ */
+void aio_kernel_init_rw(struct kiocb *iocb, struct file *filp,
+   unsigned short op, void *ptr, size_t nr, loff_t off)
+{
+   iocb->ki_filp = filp;
+   iocb->ki_opcode = op;
+   iocb->ki_buf = (char __user *)(unsigned long)ptr;
+   iocb->ki_left = nr;
+   iocb->ki_nbytes = nr;
+   iocb->ki_pos = off;
+   iocb->ki_ctx = (void *)-1;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_rw);
+
+void aio_kernel_init_callback(struct kiocb *iocb,
+ void (*complete)(u64 user_data, long res),
+ u64 user_data)
+{
+   iocb->ki_obj.complete = complete;
+   iocb->ki_user_data = user_data;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_callback);
+
+/*
+ * The iocb is our responsibility once this is called.  The caller must not
+ * reference it.
+ *
+ * Callers must be prepared for their iocb completion callback to be called the
+ * moment they enter this function.  The completion callback may be called from
+ * any context.
+ *
+ * Returns: 0: the iocb completion callback will be called with the op result
+ * negative errno: the operation was not submitted and the iocb was freed
+ */
+int aio_kernel_submit(struct kiocb *iocb)
+{
+   int ret;
+
+   BUG_ON(!is_kernel_kiocb(iocb));
+   BUG_ON(!iocb->ki_obj.complete);
+   BUG_ON(!iocb->ki_filp);
+
+   ret = aio_run_iocb(iocb, 0);
+
+   if (ret)
+   aio_kernel_free(iocb);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_submit);
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 struct iocb *iocb, bool compat)
 {
diff --git a/include/linux/aio.h b/include/linux/aio.h
index a7e4c59..eccf646 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -36,13 +36,15 @@ struct kiocb {
atomic_tki_users;
 
struct file *ki_filp;
-   struct kioctx   *ki_ctx;/* NULL for sync ops */
+   struct kioctx   *ki_ctx;/* NULL for sync ops,
+  -1 for kernel caller 
*/
kiocb_cancel_fn *ki_cancel;
void(*ki_dtor)(struct kiocb *);
 
union {
void __user *user;
struct task_struct  *tsk;
+   void(*complete)(u64 user_data, long res);
} ki_obj;
 
__u64   ki_user_data;   /* user's data for completion */
@@ -77,6 +79,11 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
return kiocb->ki_ctx == NULL;
 }
 
+static inline bool is_kernel_kiocb(struct kiocb *kiocb)
+{
+   return kiocb->ki_ctx == (void *)-1;
+}
+
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
*kiocb = (struct kiocb) {
@@ -99,6 +106,14 @@ extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,

  1   2   3   4   5   >