I just committed the following patch, which adds some initial support
for detecting and reporting catastrophic errors reported by Mellanox
HCAs. We start a periodic timer which polls the catastrophic error
reporting buffer in device memory. If an error is detected, we dump
the contents of the buffer for post-mortem debugging, and report a
fatal asynchronous error to higher levels.
In the future we can try to recover from these errors by resetting the
device, but this will require some work in higher-level code as well.
Let's get this in now, so that we at least get catastrophic errors
reported in logs.
Comments and criticisms gratefully accepted.
- R.
--- infiniband/hw/mthca/mthca_provider.c(revision 3852)
+++ infiniband/hw/mthca/mthca_provider.c(working copy)
@@ -1175,10 +1175,13 @@ int mthca_register_device(struct mthca_d
}
}
+ mthca_start_catas_poll(dev);
+
return 0;
}
void mthca_unregister_device(struct mthca_dev *dev)
{
+ mthca_stop_catas_poll(dev);
ib_unregister_device(dev-ib_dev);
}
--- infiniband/hw/mthca/mthca_catas.c (revision 0)
+++ infiniband/hw/mthca/mthca_catas.c (revision 0)
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include mthca_dev.h
+
+enum {
+ MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, /* added to jiffies when poll_catas() re-arms the timer */
+
+ MTHCA_CATAS_TYPE_INTERNAL = 0, /* handle_catas() logs this as an internal error */
+ MTHCA_CATAS_TYPE_UPLINK = 3, /* handle_catas() logs this as an uplink bus error */
+ MTHCA_CATAS_TYPE_DDR= 4, /* handle_catas() logs this as a DDR data error */
+ MTHCA_CATAS_TYPE_PARITY = 5, /* handle_catas() logs this as an internal parity error */
+};
+
+static DEFINE_SPINLOCK(catas_lock); /* held by poll_catas() around the catas_err.stop check and timer re-arm */
+
+/*
+ * Report a catastrophic error on @dev.
+ *
+ * Dispatches an IB_EVENT_DEVICE_FATAL event for the device, then logs
+ * the decoded error type followed by a dump of every 32-bit word of
+ * the catastrophic error buffer (dev->catas_err.map, catas_err.size
+ * words long).
+ */
+static void handle_catas(struct mthca_dev *dev)
+{
+	struct ib_event event;
+	const char *type;
+	int i;
+
+	event.device           = &dev->ib_dev;
+	event.event            = IB_EVENT_DEVICE_FATAL;
+	event.element.port_num = 0;
+
+	ib_dispatch_event(&event);
+
+	/* The error type code is the top byte of the byte-swapped first word. */
+	switch (swab32(readl(dev->catas_err.map)) >> 24) {
+	case MTHCA_CATAS_TYPE_INTERNAL:
+		type = "internal error";
+		break;
+	case MTHCA_CATAS_TYPE_UPLINK:
+		type = "uplink bus error";
+		break;
+	case MTHCA_CATAS_TYPE_DDR:
+		type = "DDR data error";
+		break;
+	case MTHCA_CATAS_TYPE_PARITY:
+		type = "internal parity error";
+		break;
+	default:
+		type = "unknown error";
+		break;
+	}
+
+	mthca_err(dev, "Catastrophic error detected: %s\n", type);
+	for (i = 0; i < dev->catas_err.size; ++i)
+		mthca_err(dev, "buf[%02x]: %08x\n",
+			  i, swab32(readl(dev->catas_err.map + i)));
+}
+
+/*
+ * Timer callback: scan the catastrophic error buffer once.
+ *
+ * @dev_ptr is the struct mthca_dev pointer cast to unsigned long.
+ *
+ * Any non-zero word in the buffer is treated as a catastrophic error:
+ * it is passed to handle_catas() and the timer is NOT re-armed.  If the
+ * whole buffer reads as zero, the timer is re-armed to fire again
+ * MTHCA_CATAS_POLL_INTERVAL jiffies from now, unless catas_err.stop
+ * has been set.
+ */
+static void poll_catas(unsigned long dev_ptr)
+{
+	struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < dev->catas_err.size; ++i)
+		if (readl(dev->catas_err.map + i)) {
+			handle_catas(dev);
+			return;
+		}
+
+	/*
+	 * mthca_start_catas_poll() clears catas_err.stop, so polling has
+	 * to continue while the flag is zero; re-arming only when it is
+	 * non-zero would end polling after the first expiry.  The check
+	 * and the re-arm are done together under catas_lock (presumably
+	 * the stop path sets the flag under the same lock -- confirm
+	 * against mthca_stop_catas_poll()).
+	 */
+	spin_lock_irqsave(&catas_lock, flags);
+	if (!dev->catas_err.stop)
+		mod_timer(&dev->catas_err.timer,
+			  jiffies + MTHCA_CATAS_POLL_INTERVAL);
+	spin_unlock_irqrestore(&catas_lock, flags);
+}
+
+void mthca_start_catas_poll(struct mthca_dev *dev)
+{
+ init_timer(dev-catas_err.timer);
+ dev-catas_err.stop = 0;
+ dev-catas_err.map = NULL;
+
+ if (!request_mem_region(dev-catas_err.addr,
+ dev-catas_err.size * 4,
+ DRV_NAME)) {