On Thu, Aug 28, 2014 at 01:53:29PM -0700, Dave Jiang wrote: > A hardware errata causes the NTB to hang when heavy bi-directional traffic > in addition to the usage of BAR0/1 (where the registers reside, including > the doorbell registers to trigger interrupts). > > This workaround is only available on Haswell platform. > The workaround is to enable split BAR in the BIOS to allow the 64bit BAR4 to > be split into two 32bit BAR4 and BAR5. The BAR4 shall be pointed to LAPIC > region of the remote host. We will bypass the doorbell mechanism and directly > trigger the MSIX interrupts. The offsets and vectors are exchanged during > transport scratch pad negotiation. The scratch pads are now overloaded > in order to allow the exchange of the information. This gets around using > the doorbell and prevents the lockup with additional pcode changes in BIOS.
I REALLY don't like a driver mucking with the MSI-X table to work around hardware issues. I don't see any presidence for this kind of behavior in other drivers. Also, this patch is fairly invasive. I realize that there isn't much alternative, other than polling. In fact, I think I'd rather see polling. Thanks, Jon > > Signed-off-by: Dave Jiang <dave.ji...@intel.com> > --- > drivers/ntb/ntb_hw.c | 177 > +++++++++++++++++++++++++++++++++++++------ > drivers/ntb/ntb_hw.h | 9 ++ > drivers/ntb/ntb_regs.h | 1 > drivers/ntb/ntb_transport.c | 126 ++++++++++++++++++------------- > 4 files changed, 237 insertions(+), 76 deletions(-) > > diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c > index cef9d8a..97e18c3 100644 > --- a/drivers/ntb/ntb_hw.c > +++ b/drivers/ntb/ntb_hw.c > @@ -53,6 +53,8 @@ > #include <linux/pci.h> > #include <linux/random.h> > #include <linux/slab.h> > +#include <linux/msi.h> > +#include <linux/interrupt.h> > #include "ntb_hw.h" > #include "ntb_regs.h" > > @@ -150,17 +152,19 @@ static void ntb_set_errata_flags(struct ntb_device > *ndev) > case PCI_DEVICE_ID_INTEL_NTB_SS_JSF: > case PCI_DEVICE_ID_INTEL_NTB_SS_SNB: > case PCI_DEVICE_ID_INTEL_NTB_SS_IVT: > - case PCI_DEVICE_ID_INTEL_NTB_SS_HSX: > case PCI_DEVICE_ID_INTEL_NTB_PS_JSF: > case PCI_DEVICE_ID_INTEL_NTB_PS_SNB: > case PCI_DEVICE_ID_INTEL_NTB_PS_IVT: > - case PCI_DEVICE_ID_INTEL_NTB_PS_HSX: > case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF: > case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB: > case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT: > - case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX: > ndev->wa_flags |= WA_SNB_ERR; > break; > + case PCI_DEVICE_ID_INTEL_NTB_SS_HSX: > + case PCI_DEVICE_ID_INTEL_NTB_PS_HSX: > + case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX: > + ndev->wa_flags |= WA_HSX_ERR; > + break; > } > } > > @@ -209,9 +213,13 @@ static void ntb_irq_work(unsigned long data) > struct ntb_device *ndev = db_cb->ndev; > unsigned long mask; > > - mask = readw(ndev->reg_ofs.ldb_mask); > - clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask); > - writew(mask, ndev->reg_ofs.ldb_mask); > + if (ndev->wa_flags & WA_HSX_ERR) > + enable_irq(db_cb->irq); > + else { > + mask = readw(ndev->reg_ofs.ldb_mask); > + clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask); > + writew(mask, ndev->reg_ofs.ldb_mask); > + } > } > } > > @@ -246,9 +254,12 @@ int ntb_register_db_callback(struct ntb_device *ndev, > unsigned int idx, > (unsigned long) &ndev->db_cb[idx]); > > /* unmask interrupt */ > - mask = readw(ndev->reg_ofs.ldb_mask); > - clear_bit(idx * ndev->bits_per_vector, &mask); > - writew(mask, ndev->reg_ofs.ldb_mask); > + if (!(ndev->wa_flags & WA_HSX_ERR)) { > + /* unmask interrupt */ > + mask = readw(ndev->reg_ofs.ldb_mask); > + clear_bit(idx * ndev->bits_per_vector, &mask); > + writew(mask, ndev->reg_ofs.ldb_mask); > + } > > return 0; > } > @@ -268,9 +279,11 @@ void ntb_unregister_db_callback(struct ntb_device *ndev, > unsigned int idx) > if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback) > return; > > - mask = readw(ndev->reg_ofs.ldb_mask); > - set_bit(idx * ndev->bits_per_vector, &mask); > - writew(mask, ndev->reg_ofs.ldb_mask); > + if (!(ndev->wa_flags & WA_HSX_ERR)) { > + mask = readw(ndev->reg_ofs.ldb_mask); > + set_bit(idx * ndev->bits_per_vector, &mask); > + writew(mask, ndev->reg_ofs.ldb_mask); > + } > > tasklet_disable(&ndev->db_cb[idx].irq_work); > > @@ -518,6 +531,17 @@ void ntb_set_mw_addr(struct ntb_device *ndev, unsigned > int mw, u64 addr) > } > } > > +static void ntb_generate_rirq(struct ntb_device *ndev, int vec) > +{ > + if (vec > 2) { > + dev_err(&ndev->pdev->dev, "%s: vec %d out of bounds\n", > + __func__, vec); > + return; > + } > + > + writel(ndev->rirq[vec].data, ndev->mw[1].vbase + ndev->rirq[vec].ofs); > +} > + > /** > * ntb_ring_doorbell() - Set the doorbell on the secondary/external side > * @ndev: pointer to ntb_device instance > @@ -532,7 +556,9 @@ void ntb_ring_doorbell(struct ntb_device *ndev, unsigned > int db) > { > dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db); > > - if (ndev->hw_type == BWD_HW) > + if (ndev->wa_flags & WA_HSX_ERR) > + ntb_generate_rirq(ndev, db); > + else if (ndev->hw_type == BWD_HW) > writeq((u64) 1 << db, ndev->reg_ofs.rdb); > else > writew(((1 << ndev->bits_per_vector) - 1) << > @@ -794,7 +820,26 @@ static int ntb_xeon_setup(struct ntb_device *ndev) > * the driver defaults, but write the Limit registers > * first just in case. > */ > - if (ndev->split_bar) > + if (ndev->wa_flags & WA_HSX_ERR) { > + /* using BAR4, must be set to 1M */ > + if (ndev->mw[1].bar_sz != 0x100000) { > + dev_err(&ndev->pdev->dev, > + "BAR4 must be 1M\n"); > + return -EINVAL; > + } > + > + /* set limit to 1M according to spec */ > + writel(pci_resource_start(ndev->pdev, 1) + > 0x100000, > + ndev->reg_base + SNB_PBAR4LMT_OFFSET); > + /* > + * need to point SBAR4XLAT to remote > + * interrupt region > + */ > + writel(0xfee00000, > + ndev->reg_base + SNB_SBAR4XLAT_OFFSET); > + > + ndev->limits.max_mw = HSX_ERRATA_MAX_MW; > + } else if (ndev->split_bar) > ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW; > else > ndev->limits.max_mw = SNB_MAX_MW; > @@ -911,7 +956,22 @@ static int ntb_xeon_setup(struct ntb_device *ndev) > if (ndev->split_bar) { > ndev->reg_ofs.bar5_xlat = > ndev->reg_base + SNB_SBAR5XLAT_OFFSET; > - ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW; > + > + if (ndev->wa_flags & WA_HSX_ERR) { > + /* using BAR4, must be set to 1M */ > + if (ndev->mw[1].bar_sz != 0x100000) { > + dev_err(&ndev->pdev->dev, > + "BAR4 must be 1M\n"); > + return -EINVAL; > + } > + > + /* set limit to 1M according to spec */ > + writel(pci_resource_start(ndev->pdev, 1) + > 0x100000, > + ndev->reg_base + SNB_PBAR4LMT_OFFSET); > + writel(0xfee00000, ndev->reg_ofs.bar4_xlat); > + ndev->limits.max_mw = HSX_ERRATA_MAX_MW; > + } else > + ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW; > } else > ndev->limits.max_mw = SNB_MAX_MW; > break; > @@ -943,7 +1003,22 @@ static int ntb_xeon_setup(struct ntb_device *ndev) > if (ndev->split_bar) { > ndev->reg_ofs.bar5_xlat = > ndev->reg_base + SNB_PBAR5XLAT_OFFSET; > - ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW; > + > + if (ndev->wa_flags & WA_HSX_ERR) { > + /* using BAR4, must be set to 1M */ > + if (ndev->mw[1].bar_sz != 0x100000) { > + dev_err(&ndev->pdev->dev, > + "BAR4 must be 1M\n"); > + return -EINVAL; > + } > + > + /* set limit to 1M according to spec */ > + writel(pci_resource_start(ndev->pdev, 1) + > 0x100000, > + ndev->reg_base + SNB_PBAR4LMT_OFFSET); > + writel(0xfee00000, ndev->reg_ofs.bar4_xlat); > + ndev->limits.max_mw = HSX_ERRATA_MAX_MW; > + } else > + ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW; > } else > ndev->limits.max_mw = SNB_MAX_MW; > break; > @@ -1085,9 +1160,14 @@ static irqreturn_t xeon_callback_msix_irq(int irq, > void *data) > dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq, > db_cb->db_num); > > - mask = readw(ndev->reg_ofs.ldb_mask); > - set_bit(db_cb->db_num * ndev->bits_per_vector, &mask); > - writew(mask, ndev->reg_ofs.ldb_mask); > + if (ndev->wa_flags & WA_HSX_ERR) { > + disable_irq_nosync(irq); > + db_cb->irq = irq; > + } else { > + mask = readw(ndev->reg_ofs.ldb_mask); > + set_bit(db_cb->db_num * ndev->bits_per_vector, &mask); > + writew(mask, ndev->reg_ofs.ldb_mask); > + } > > tasklet_schedule(&db_cb->irq_work); > > @@ -1096,8 +1176,11 @@ static irqreturn_t xeon_callback_msix_irq(int irq, > void *data) > * vectors, with the 4th having a single bit for link > * interrupts. > */ > - writew(((1 << ndev->bits_per_vector) - 1) << > - (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb); > + if (!(ndev->wa_flags & WA_HSX_ERR)) { > + writew(((1 << ndev->bits_per_vector) - 1) << > + (db_cb->db_num * ndev->bits_per_vector), > + ndev->reg_ofs.ldb); > + } > > return IRQ_HANDLED; > } > @@ -1160,6 +1243,9 @@ static int ntb_setup_snb_msix(struct ntb_device *ndev, > int msix_entries) > struct pci_dev *pdev = ndev->pdev; > struct msix_entry *msix; > int rc, i; > + struct msi_desc *entry; > + u32 laddr = 0; > + u32 data = 0; > > if (msix_entries < ndev->limits.msix_cnt) > return -ENOSPC; > @@ -1191,6 +1277,31 @@ static int ntb_setup_snb_msix(struct ntb_device *ndev, > int msix_entries) > ndev->num_msix = msix_entries; > ndev->max_cbs = msix_entries - 1; > > + if (ndev->wa_flags & WA_HSX_ERR) { > + i = 0; > + > + /* > + * acquire the interrupt region in the LAPIC for the > + * MSIX vectors > + */ > + list_for_each_entry(entry, &pdev->msi_list, list) { > + unsigned int offset = ndev->msix_entries[i].entry * > + PCI_MSIX_ENTRY_SIZE; > + > + laddr = readl(entry->mask_base + offset + > + PCI_MSIX_ENTRY_LOWER_ADDR); > + dev_dbg(&pdev->dev, "local lower MSIX addr(%d): %#x\n", > + i, laddr); > + ndev->lirq[i].ofs = 0x1fffff & laddr; > + data = readl(entry->mask_base + offset + > + PCI_MSIX_ENTRY_DATA); > + dev_dbg(&pdev->dev, "local MSIX data(%d): %#x\n", > + i, data); > + ndev->lirq[i].data = data; > + i++; > + } > + } > + > return 0; > > err: > @@ -1288,6 +1399,11 @@ static int ntb_setup_msi(struct ntb_device *ndev) > struct pci_dev *pdev = ndev->pdev; > int rc; > > + if (ndev->wa_flags & WA_HSX_ERR) { > + dev_err(&pdev->dev, "Platform errata does not support MSI\n"); > + return -EINVAL; > + } > + > rc = pci_enable_msi(pdev); > if (rc) > return rc; > @@ -1307,6 +1423,11 @@ static int ntb_setup_intx(struct ntb_device *ndev) > struct pci_dev *pdev = ndev->pdev; > int rc; > > + if (ndev->wa_flags & WA_HSX_ERR) { > + dev_err(&pdev->dev, "Platform errata does not support INTX\n"); > + return -EINVAL; > + } > + > pci_msi_off(pdev); > > /* Verify intx is enabled */ > @@ -1720,6 +1841,17 @@ static int ntb_pci_probe(struct pci_dev *pdev, const > struct pci_device_id *id) > if (rc) > goto err; > > + if (!ndev->split_bar && (ndev->wa_flags & WA_HSX_ERR)) { > + dev_warn(&pdev->dev, > + "Please config NTB split BAR for errata workaround\n"); > + return -EINVAL; > + } > + > + /* > + * From this point on we will assume that split BAR is set when > + * WA_HSX_ERR is set > + */ > + > ndev->mw = kcalloc(sizeof(struct ntb_mw), ndev->limits.max_mw, > GFP_KERNEL); > if (!ndev->mw) { > @@ -1751,8 +1883,7 @@ static int ntb_pci_probe(struct pci_dev *pdev, const > struct pci_device_id *id) > * with the errata we need to steal last of the memory > * windows for workarounds and they point to MMIO registers. > */ > - if ((ndev->wa_flags & WA_SNB_ERR) && > - (i == (ndev->limits.max_mw - 1))) { > + if ((ndev->wa_flags & (WA_SNB_ERR | WA_HSX_ERR)) && (i > 0)) { > ndev->mw[i].vbase = > ioremap_nocache(pci_resource_start(pdev, > MW_TO_BAR(i)), > diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h > index 96de5fc..d59650e 100644 > --- a/drivers/ntb/ntb_hw.h > +++ b/drivers/ntb/ntb_hw.h > @@ -109,9 +109,16 @@ struct ntb_db_cb { > void *data; > struct ntb_device *ndev; > struct tasklet_struct irq_work; > + unsigned int irq; > +}; > + > +struct msix_info { > + u32 ofs; > + u32 data; > }; > > #define WA_SNB_ERR 0x00000001 > +#define WA_HSX_ERR 0x00000002 > > struct ntb_device { > struct pci_dev *pdev; > @@ -161,6 +168,8 @@ struct ntb_device { > struct dentry *debugfs_info; > > unsigned int wa_flags; > + struct msix_info lirq[4]; > + struct msix_info rirq[4]; > }; > > /** > diff --git a/drivers/ntb/ntb_regs.h b/drivers/ntb/ntb_regs.h > index f028ff8..7eb1440 100644 > --- a/drivers/ntb/ntb_regs.h > +++ b/drivers/ntb/ntb_regs.h > @@ -60,6 +60,7 @@ > #define HSX_SPLITBAR_MAX_MW 3 > #define SNB_MAX_MW 2 > #define SNB_ERRATA_MAX_MW 1 > +#define HSX_ERRATA_MAX_MW 1 > > #define SNB_DB_HW_LINK 0x8000 > > diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c > index 24f0ac1..7ab4d63 100644 > --- a/drivers/ntb/ntb_transport.c > +++ b/drivers/ntb/ntb_transport.c > @@ -56,6 +56,7 @@ > #include <linux/pci.h> > #include <linux/slab.h> > #include <linux/types.h> > +#include <linux/sched.h> > #include "ntb_hw.h" > > #define NTB_TRANSPORT_VERSION 3 > @@ -188,15 +189,24 @@ struct ntb_payload_header { > unsigned int flags; > }; > > +/* > + * Using 7 scratch pads, 1 left > + * VERSION > + * QP_LINKS > + * NUM_QPS NUM_MWS > + * MW0_SZ MW1_SZ MW2_SZ MW3_SZ > + * DATA0 MSIX_OFS0 > + * DATA1 MSIX_OFS1 > + * DATA2 MSIX_OFS2 > + */ > enum { > VERSION = 0, > QP_LINKS, > - NUM_QPS, > NUM_MWS, > - MW0_SZ_HIGH, > - MW0_SZ_LOW, > - MW1_SZ_HIGH, > - MW1_SZ_LOW, > + MW_SZ, > + MSIX_OFS0, > + MSIX_OFS1, > + MSIX_OFS2, > MAX_SPAD, > }; > > @@ -687,37 +697,45 @@ static void ntb_transport_link_work(struct work_struct > *work) > int rc, i; > > /* send the local info, in the opposite order of the way we read it */ > - for (i = 0; i < ntb_max_mw(ndev); i++) { > - rc = ntb_write_remote_spad(ndev, MW0_SZ_HIGH + (i * 2), > - ntb_get_mw_size(ndev, i) >> 32); > - if (rc) { > - dev_err(&pdev->dev, "Error writing %u to remote spad > %d\n", > - (u32)(ntb_get_mw_size(ndev, i) >> 32), > - MW0_SZ_HIGH + (i * 2)); > - goto out; > - } > > - rc = ntb_write_remote_spad(ndev, MW0_SZ_LOW + (i * 2), > - (u32) ntb_get_mw_size(ndev, i)); > + for (i = 0; i < 3; i++) { > + val = (ndev->lirq[i].ofs & 0x1fffff) | > + (ndev->lirq[i].data & 0xff) << 24; > + > + rc = ntb_write_remote_spad(ndev, MSIX_OFS0 + i, val); > if (rc) { > - dev_err(&pdev->dev, "Error writing %u to remote spad > %d\n", > - (u32) ntb_get_mw_size(ndev, i), > - MW0_SZ_LOW + (i * 2)); > + dev_err(&pdev->dev, "Error writing %x to remote spad > %d\n", > + val, MSIX_OFS0 + i); > goto out; > } > } > > - rc = ntb_write_remote_spad(ndev, NUM_MWS, ntb_max_mw(ndev)); > + if (ntb_max_mw(ndev) > 4) { > + dev_err(&pdev->dev, > + "Greater than 4 memory window unsupported!\n"); > + goto out; > + } > + > + val = 0; > + for (i = 0; i < ntb_max_mw(ndev); i++) { > + u32 size; > + > + size = ilog2(rounddown_pow_of_two(ntb_get_mw_size(ndev, i))); > + val |= (size & 0xff) << (8 * i); > + } > + > + rc = ntb_write_remote_spad(ndev, MW_SZ, val); > if (rc) { > - dev_err(&pdev->dev, "Error writing %x to remote spad %d\n", > - ntb_max_mw(ndev), NUM_MWS); > + dev_err(&pdev->dev, "Error writing %#x to remote spad %d\n", > + val, MW_SZ); > goto out; > } > > - rc = ntb_write_remote_spad(ndev, NUM_QPS, nt->max_qps); > + val = (ntb_max_mw(ndev) & 0xffff) | (nt->max_qps & 0xffff) << 16; > + rc = ntb_write_remote_spad(ndev, NUM_MWS, val); > if (rc) { > - dev_err(&pdev->dev, "Error writing %x to remote spad %d\n", > - nt->max_qps, NUM_QPS); > + dev_err(&pdev->dev, "Error writing %#x to remote spad %d\n", > + val, NUM_MWS); > goto out; > } > > @@ -739,46 +757,31 @@ static void ntb_transport_link_work(struct work_struct > *work) > goto out; > dev_dbg(&pdev->dev, "Remote version = %d\n", val); > > - rc = ntb_read_remote_spad(ndev, NUM_QPS, &val); > - if (rc) { > - dev_err(&pdev->dev, "Error reading remote spad %d\n", NUM_QPS); > - goto out; > - } > - > - if (val != nt->max_qps) > - goto out; > - dev_dbg(&pdev->dev, "Remote max number of qps = %d\n", val); > - > rc = ntb_read_remote_spad(ndev, NUM_MWS, &val); > if (rc) { > dev_err(&pdev->dev, "Error reading remote spad %d\n", NUM_MWS); > goto out; > } > > - if (val != ntb_max_mw(ndev)) > + if (((val >> 16) & 0xffff) != nt->max_qps) > goto out; > - dev_dbg(&pdev->dev, "Remote number of mws = %d\n", val); > > - for (i = 0; i < ntb_max_mw(ndev); i++) { > - u64 val64; > + dev_dbg(&pdev->dev, "Remote max number of qps = %d\n", > + (val >> 16) & 0xffff); > > - rc = ntb_read_remote_spad(ndev, MW0_SZ_HIGH + (i * 2), &val); > - if (rc) { > - dev_err(&pdev->dev, "Error reading remote spad %d\n", > - MW0_SZ_HIGH + (i * 2)); > - goto out1; > - } > + if ((val & 0xffff) != ntb_max_mw(ndev)) > + goto out; > > - val64 = (u64) val << 32; > + dev_dbg(&pdev->dev, "Remote number of mws = %d\n", val & 0xffff); > > - rc = ntb_read_remote_spad(ndev, MW0_SZ_LOW + (i * 2), &val); > - if (rc) { > - dev_err(&pdev->dev, "Error reading remote spad %d\n", > - MW0_SZ_LOW + (i * 2)); > - goto out1; > - } > + rc = ntb_read_remote_spad(ndev, MW_SZ, &val); > + if (rc) { > + dev_err(&pdev->dev, "Error reading remote spad %d\n", MW_SZ); > + goto out1; > + } > > - val64 |= val; > + for (i = 0; i < ntb_max_mw(ndev); i++) { > + u64 val64 = 1 << ((val >> (i * 8)) & 0xff); > > dev_dbg(&pdev->dev, "Remote MW%d size = %llu\n", i, val64); > > @@ -787,6 +790,23 @@ static void ntb_transport_link_work(struct work_struct > *work) > goto out1; > } > > + for (i = 0; i < 3; i++) { > + rc = ntb_read_remote_spad(ndev, MSIX_OFS0 + i, &val); > + if (rc) { > + dev_err(&pdev->dev, > + "Error reading remote spad %d\n", > + MSIX_OFS0 + i); > + goto out; > + } > + > + ndev->rirq[i].ofs = 0x1ffff & val; > + ndev->rirq[i].data = (val >> 24) & 0xff; > + dev_dbg(&pdev->dev, "received MSIX_OFS%d: %#x\n", > + i, ndev->rirq[i].ofs); > + dev_dbg(&pdev->dev, "received MSIX_DATA%d: %#x\n", > + i, ndev->rirq[i].data); > + } > + > nt->transport_link = NTB_LINK_UP; > > for (i = 0; i < nt->max_qps; i++) { > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/