Re: commit 16ecba59 breaks 82574L under heavy load.
> On Jul 20, 2017, at 5:44 PM, Benjamin Poirier wrote: > > [snip] > Could you please test the following patch and let me know if it: > 1) reduces the interrupt rate of the Other msi-x vector > 2) avoids the link flaps > or > 3) logs some dmesg warnings of the form "Other interrupt with unhandled [...]" > In this case, please paste icr values printed. > > Thanks > > diff --git a/drivers/net/ethernet/intel/e1000e/defines.h > b/drivers/net/ethernet/intel/e1000e/defines.h > index 0641c0098738..afb7ebe20b24 100644 > --- a/drivers/net/ethernet/intel/e1000e/defines.h > +++ b/drivers/net/ethernet/intel/e1000e/defines.h > @@ -398,6 +398,7 @@ > #define E1000_ICR_LSC 0x0004 /* Link Status Change */ > #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ > #define E1000_ICR_RXDMT00x0010 /* Rx desc min. threshold (0) */ > +#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ > #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ > #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ > /* If this bit asserted, the driver should claim the interrupt */ > diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h > b/drivers/net/ethernet/intel/e1000e/e1000.h > index c7c994eb410e..f7b46eba3efb 100644 > --- a/drivers/net/ethernet/intel/e1000e/e1000.h > +++ b/drivers/net/ethernet/intel/e1000e/e1000.h > @@ -351,6 +351,10 @@ struct e1000_adapter { > s32 ptp_delta; > > u16 eee_advert; > + > + unsigned int uh_count; > + u32 uh_values[16]; > + unsigned int uh_values_nb; > }; > > struct e1000_info { > diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c > b/drivers/net/ethernet/intel/e1000e/netdev.c > index b3679728caac..46697338c0e1 100644 > --- a/drivers/net/ethernet/intel/e1000e/netdev.c > +++ b/drivers/net/ethernet/intel/e1000e/netdev.c > @@ -46,6 +46,8 @@ > > #include "e1000.h" > > +DEFINE_RATELIMIT_STATE(other_uh_ratelimit_state, HZ, 1); > + > #define DRV_EXTRAVERSION "-k" > > #define DRV_VERSION "3.2.6" DRV_EXTRAVERSION > @@ -1904,12 +1906,60 @@ static irqreturn_t e1000_msix_other(int > __always_unused irq, void *data) > struct net_device *netdev = data; > struct e1000_adapter *adapter = netdev_priv(netdev); > struct e1000_hw *hw = &adapter->hw; > + u32 icr; > + bool enable = true; > + bool handled = false; > + unsigned int i; > > - hw->mac.get_link_status = true; > + icr = er32(ICR); > + if (icr & E1000_ICR_RXO) { > + ew32(ICR, E1000_ICR_RXO); > + enable = false; > + /* napi poll will re-enable Other, make sure it runs */ > + if (napi_schedule_prep(&adapter->napi)) { > + adapter->total_rx_bytes = 0; > + adapter->total_rx_packets = 0; > + __napi_schedule(&adapter->napi); > + } > + handled = true; > + } > + if (icr & E1000_ICR_LSC) { > + ew32(ICR, E1000_ICR_LSC); > + hw->mac.get_link_status = true; > + /* guard against interrupt when we're going down */ > + if (!test_bit(__E1000_DOWN, &adapter->state)) { > + mod_timer(&adapter->watchdog_timer, jiffies + 1); > + } > + handled = true; > + } > > - /* guard against interrupt when we're going down */ > - if (!test_bit(__E1000_DOWN, &adapter->state)) { > - mod_timer(&adapter->watchdog_timer, jiffies + 1); > + if (!handled) { > + adapter->uh_count++; > + /* only print unseen icr values */ > + if (adapter->uh_values_nb < ARRAY_SIZE(adapter->uh_values)) { > + for (i = 0; i < ARRAY_SIZE(adapter->uh_values); i++) { > + if (adapter->uh_values[i] == icr) { > + break; > + } > + } > + if (i == ARRAY_SIZE(adapter->uh_values)) { > + adapter->uh_values[adapter->uh_values_nb] = > + icr; > + adapter->uh_values_nb++; > + netdev_warn(netdev, > + "Other interrupt with unhandled icr > 0x%08x\n", > + icr); > + } > + } > + } > + if (adapter->uh_count && __ratelimit(&other_uh_ratelimit_state)) { > + netdev_warn(netdev, > + "Other interrupt with unhandled cause, count %u\n", > + adapter->uh_count); > + adapter->uh_count = 0; > + } > + > + if (enable && !test_bit(__E1000_DOWN, &adapter->state)) { > ew32(IMS, E1000_IMS_OTHER); > } > > @@ -2681,7 +2731,8 @@ static int e1000e_poll(struct napi_struct *napi, int > weight) > napi_complete_done(napi
Re: commit 16ecba59 breaks 82574L under heavy load.
On Thu, Jul 20, 2017 at 04:44:55PM -0700, Benjamin Poirier wrote: > Could you please test the following patch and let me know if it: > 1) reduces the interrupt rate of the Other msi-x vector > 2) avoids the link flaps > or > 3) logs some dmesg warnings of the form "Other interrupt with unhandled [...]" > In this case, please paste icr values printed. By the way, while at fixing the e1000e, I just noticed that if you are blasting the port with traffic when it comes up, you risk getting a transmit queue time out, because the queue is started before the carrier is up. ixgbe already fixed that in cdc04dcce0598fead6029a2f95e95a4d2ea419c2. igb has the same problem (which goes away by moving the queue start to the watchdog after carrier_on, I just haven't got around to sending that patch yet). I am going to try moving the queue start to the watchdog and try it again. Trace looked like this: [ cut here ] WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:316 dev_watchdog+0x1f9/0x200 NETDEV WATCHDOG: eth0 (e1000e): transmit queue 0 timed out Modules linked in: dpi_drv(PO) ccu_util(PO) ipv4_mb(PO) l2bridge_config_util(PO) l2_config_util(PO) route_config_util(PO) qos_config_util(PO) sysapp_common(PO) chantry_fwd_eng_2800_config(PO) shim_module(PO) sadb_cc(PO) ipsecXformer(PO) libeCrypto(PO) ipmatch_cc(PO) l2h_cc(PO) ndproxy_cc(PO) arpint_cc(PO) portinfo_cc(PO) chantryqos_cc(PO) redirector_cc(PO) ix_ph(PO) fpm_core_cc(PO) pulse_cc(PO) vnstt_cc(PO) vnsap_cc(PO) fm_cc(PO) rutm_cc(PO) mutm_cc(PO) ethernet_tx_cc(PO) stkdrv_cc(PO) l2bridge_cc(PO) events_util(PO) sched_cc(PO) qm_cc(PO) ipv4_cc(PO) wred_cc(PO) tc_meter_cc(PO) dscp_classifier_cc(PO) classifier_6t_cc(PO) ent586_cc(PO) dev_cc_arp(PO) chantry_fwd_eng_2800_tables(PO) ether_arp_lib(PO) rtmv4_lib(PO) lkup_lib(PO) l2tm_lib(PO) fragmentation_lib(PO) properties_lib(PO) msg_support_lib(PO) utilities_lib(PO) cci_lib(PO) rm_lib(PO) libossl(O) vip(O) productSpec_x86_dp(PO) e1000e CPU: 0 PID: 0 Comm: swapper/0 Tainted: P O4.9.24 #20 Hardware name: Supermicro X7SPA-HF/X7SPA-HF, BIOS 1.2a 06/23/12 811cef1b 88007fc03e88 81037ade 88007fc03ed8 0001 0082 0001 81037b4c Call Trace: [] ? dump_stack+0x46/0x5b [] ? __warn+0xbe/0xe0 [] ? warn_slowpath_fmt+0x4c/0x50 [] ? mod_timer+0xf2/0x150 [] ? dev_watchdog+0x1f9/0x200 [] ? dev_graft_qdisc+0x70/0x70 [] ? call_timer_fn.isra.26+0x11/0x80 [] ? run_timer_softirq+0x128/0x150 [] ? __do_softirq+0xeb/0x1f0 [] ? irq_exit+0x55/0x60 [] ? smp_apic_timer_interrupt+0x39/0x50 [] ? apic_timer_interrupt+0x7c/0x90 [] ? mwait_idle+0x51/0x80 [] ? cpu_startup_entry+0xa7/0x130 [] ? start_kernel+0x306/0x30e ---[ end trace ee759b7a56e1110b ]--- -- Len Sorensen
Re: commit 16ecba59 breaks 82574L under heavy load.
On Fri, Jul 21, 2017 at 11:27:09AM -0400, wrote: > On Thu, Jul 20, 2017 at 04:44:55PM -0700, Benjamin Poirier wrote: > > Could you please test the following patch and let me know if it: > > 1) reduces the interrupt rate of the Other msi-x vector > > 2) avoids the link flaps > > or > > 3) logs some dmesg warnings of the form "Other interrupt with unhandled > > [...]" > > In this case, please paste icr values printed. > > I will give it a try. So test looks excellent. Seems to only get interrupts when link state actually changes now. > Another odd behaviour I see is that the driver will hang in > napi_synchronize on shutdown if there is traffic at the time (at least > I think that's the trigger, maybe the trigger is if there has been an > overload of traffic and the backlog in napi was used). > > From doing some searching, this seems to be a problem that has plagued > some people for years with this driver. > > I am having trouble figuring out exactly what napi_synchronize is waiting > for and who is supposed to toggle the flag it is waiting on. The flag > appears to work backwards from what I would have expected it to do. > I see lots of places that can set the bit, but only napi_enable seems > to clear it again, and I don't see how that would get called for all > the places that potentially set the bit. I just realized NAPI_STATE_SCHED and NAPIF_STATE_SCHED are the same thing and I need to look at both of those. Still something seems odd in some corner case where napi gets stuck and you can't close the port anymore due to napi_synchronize never being able to finish. Some traffic pattern causes that SCHED state bit to get into the wrong state and nothing ever clears it. Even managed to see it get stuck so it never passed traffic again and hung on shutdown. The napi poll was never called again. -- Len Sorensen
Re: commit 16ecba59 breaks 82574L under heavy load.
On Thu, Jul 20, 2017 at 04:44:55PM -0700, Benjamin Poirier wrote: > Could you please test the following patch and let me know if it: > 1) reduces the interrupt rate of the Other msi-x vector > 2) avoids the link flaps > or > 3) logs some dmesg warnings of the form "Other interrupt with unhandled [...]" > In this case, please paste icr values printed. I will give it a try. Another odd behaviour I see is that the driver will hang in napi_synchronize on shutdown if there is traffic at the time (at least I think that's the trigger, maybe the trigger is if there has been an overload of traffic and the backlog in napi was used). >From doing some searching, this seems to be a problem that has plagued some people for years with this driver. I am having trouble figuring out exactly what napi_synchronize is waiting for and who is supposed to toggle the flag it is waiting on. The flag appears to work backwards from what I would have expected it to do. I see lots of places that can set the bit, but only napi_enable seems to clear it again, and I don't see how that would get called for all the places that potentially set the bit. -- Len Sorensen
Re: commit 16ecba59 breaks 82574L under heavy load.
On 2017/07/20 10:00, Lennart Sorensen wrote: > On Wed, Jul 19, 2017 at 05:07:47PM -0700, Benjamin Poirier wrote: > > Are you sure about this? In my testing, while triggering the overrun > > with the msleep, I read ICR when entering e1000_msix_other() and RXO is > > consistently set. > > I had thousands of calls to e1000_msix_other where the only bit set > was OTHER. > > I don't know if the cause is overruns, it just seems plausible. > > > I'm working on a patch that uses that fact to handle the situation and > > limit the interrupt. > > Excellent. > Could you please test the following patch and let me know if it: 1) reduces the interrupt rate of the Other msi-x vector 2) avoids the link flaps or 3) logs some dmesg warnings of the form "Other interrupt with unhandled [...]" In this case, please paste icr values printed. Thanks diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index 0641c0098738..afb7ebe20b24 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -398,6 +398,7 @@ #define E1000_ICR_LSC 0x0004 /* Link Status Change */ #define E1000_ICR_RXSEQ 0x0008 /* Rx sequence error */ #define E1000_ICR_RXDMT00x0010 /* Rx desc min. threshold (0) */ +#define E1000_ICR_RXO 0x0040 /* Receiver Overrun */ #define E1000_ICR_RXT0 0x0080 /* Rx timer intr (ring 0) */ #define E1000_ICR_ECCER 0x0040 /* Uncorrectable ECC Error */ /* If this bit asserted, the driver should claim the interrupt */ diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index c7c994eb410e..f7b46eba3efb 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -351,6 +351,10 @@ struct e1000_adapter { s32 ptp_delta; u16 eee_advert; + + unsigned int uh_count; + u32 uh_values[16]; + unsigned int uh_values_nb; }; struct e1000_info { diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index b3679728caac..46697338c0e1 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -46,6 +46,8 @@ #include "e1000.h" +DEFINE_RATELIMIT_STATE(other_uh_ratelimit_state, HZ, 1); + #define DRV_EXTRAVERSION "-k" #define DRV_VERSION "3.2.6" DRV_EXTRAVERSION @@ -1904,12 +1906,60 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct net_device *netdev = data; struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; + u32 icr; + bool enable = true; + bool handled = false; + unsigned int i; - hw->mac.get_link_status = true; + icr = er32(ICR); + if (icr & E1000_ICR_RXO) { + ew32(ICR, E1000_ICR_RXO); + enable = false; + /* napi poll will re-enable Other, make sure it runs */ + if (napi_schedule_prep(&adapter->napi)) { + adapter->total_rx_bytes = 0; + adapter->total_rx_packets = 0; + __napi_schedule(&adapter->napi); + } + handled = true; + } + if (icr & E1000_ICR_LSC) { + ew32(ICR, E1000_ICR_LSC); + hw->mac.get_link_status = true; + /* guard against interrupt when we're going down */ + if (!test_bit(__E1000_DOWN, &adapter->state)) { + mod_timer(&adapter->watchdog_timer, jiffies + 1); + } + handled = true; + } - /* guard against interrupt when we're going down */ - if (!test_bit(__E1000_DOWN, &adapter->state)) { - mod_timer(&adapter->watchdog_timer, jiffies + 1); + if (!handled) { + adapter->uh_count++; + /* only print unseen icr values */ + if (adapter->uh_values_nb < ARRAY_SIZE(adapter->uh_values)) { + for (i = 0; i < ARRAY_SIZE(adapter->uh_values); i++) { + if (adapter->uh_values[i] == icr) { + break; + } + } + if (i == ARRAY_SIZE(adapter->uh_values)) { + adapter->uh_values[adapter->uh_values_nb] = + icr; + adapter->uh_values_nb++; + netdev_warn(netdev, + "Other interrupt with unhandled icr 0x%08x\n", + icr); + } + } + } + if (adapter->uh_count && __ratelimit(&other_uh_ratelimit_state)) { + netdev_warn(netdev, +
Re: commit 16ecba59 breaks 82574L under heavy load.
On Wed, Jul 19, 2017 at 05:07:47PM -0700, Benjamin Poirier wrote: > Are you sure about this? In my testing, while triggering the overrun > with the msleep, I read ICR when entering e1000_msix_other() and RXO is > consistently set. I had thousands of calls to e1000_msix_other where the only bit set was OTHER. I don't know if the cause is overruns, it just seems plausible. > I'm working on a patch that uses that fact to handle the situation and > limit the interrupt. Excellent. Running in MSI mode rather than MSI-X seems to not have the problem of unexpected interrupts, but has other issues (such as loosing the IRQ affinity setting if you do ifconfig down;ifconfig up on the interface, which does not happen in MSI-X's case.) That's rather annoying as you can't set the affinity before bringing up the interface which is rather undesirable. -- Len Sorensen
Re: commit 16ecba59 breaks 82574L under heavy load.
On 2017/07/19 10:19, Lennart Sorensen wrote: > On Tue, Jul 18, 2017 at 04:14:35PM -0700, Benjamin Poirier wrote: > > Thanks for the detailed analysis. > > > > Refering to the original discussion around this patch series, it seemed like > > the IMS bit for a condition had to be set for the Other interrupt to be > > raised > > for that condition. > > > > https://lkml.org/lkml/2015/11/4/683 > > > > In this case however, E1000_ICR_RXT0 is not set in IMS so Other shouldn't be > > raised for Receiver Overrun. Apparently something is going on... > > > > I can reproduce the spurious Other interrupts with a simple mdelay() > > With the debugging patch at the end of the mail I see stuff like this > > while blasting with udp frames: > > -0 [086] d.h1 15338.742675: e1000_msix_other: got Other > > interrupt, count 15127 > ><...>-54504 [086] d.h. 15338.742724: e1000_msix_other: got Other > > interrupt, count 1 > ><...>-54504 [086] d.h. 15338.742774: e1000_msix_other: got Other > > interrupt, count 1 > ><...>-54504 [086] d.h. 15338.742824: e1000_msix_other: got Other > > interrupt, count 1 > > -0 [086] d.h1 15340.745123: e1000_msix_other: got Other > > interrupt, count 27584 > ><...>-54504 [086] d.h. 15340.745172: e1000_msix_other: got Other > > interrupt, count 1 > ><...>-54504 [086] d.h. 15340.745222: e1000_msix_other: got Other > > interrupt, count 1 > ><...>-54504 [086] d.h. 15340.745272: e1000_msix_other: got Other > > interrupt, count 1 > > > > > hence sets the flag that (unfortunately) means both link is down and link > > > state should be checked. Since this now happens 3000 times per second, > > > the chances of it happening while the watchdog_task is checking the link > > > state becomes pretty high, and it if does happen to coincice, then the > > > watchdog_task will reset the adapter, which causes a real loss of link. > > > > Through which path does watchdog_task reset the adapter? I didn't > > reproduce that. > > The other interrupt happens and sets get_link_status to true. At some > point the watchdog_task runs on some core and calls e1000e_has_link, > which then calls check_for_link to find out the current link status. > While e1000e_check_for_copper_link is checking the link state and > after updating get_link_status to false to indicate link is up, another > interrupt occurs and another core handles it and changes get_link_status > to true again. So by the time e1000e_has_link goes to determine the > return value, get_link_state has changed back again so now it returns > link down, and as a result the watchdog_task calls reset, because we > have packets in the transmit queue (we were busy forwarding over 10 > packets per second when it happened). Ah I see. Thanks again. In your previous mail, On 2017/07/18 10:21, Lennart Sorensen wrote: [...] > I tried checking what the bits in the ICR actually were under these > conditions, and it would appear that the only bit set is 24 (the Other > Causes interrupt bit). So I don't know what the real cause is although Are you sure about this? In my testing, while triggering the overrun with the msleep, I read ICR when entering e1000_msix_other() and RXO is consistently set. I'm working on a patch that uses that fact to handle the situation and limit the interrupt.
Re: commit 16ecba59 breaks 82574L under heavy load.
On Tue, Jul 18, 2017 at 04:14:35PM -0700, Benjamin Poirier wrote: > Thanks for the detailed analysis. > > Refering to the original discussion around this patch series, it seemed like > the IMS bit for a condition had to be set for the Other interrupt to be raised > for that condition. > > https://lkml.org/lkml/2015/11/4/683 > > In this case however, E1000_ICR_RXT0 is not set in IMS so Other shouldn't be > raised for Receiver Overrun. Apparently something is going on... > > I can reproduce the spurious Other interrupts with a simple mdelay() > With the debugging patch at the end of the mail I see stuff like this > while blasting with udp frames: > -0 [086] d.h1 15338.742675: e1000_msix_other: got Other > interrupt, count 15127 ><...>-54504 [086] d.h. 15338.742724: e1000_msix_other: got Other > interrupt, count 1 ><...>-54504 [086] d.h. 15338.742774: e1000_msix_other: got Other > interrupt, count 1 ><...>-54504 [086] d.h. 15338.742824: e1000_msix_other: got Other > interrupt, count 1 > -0 [086] d.h1 15340.745123: e1000_msix_other: got Other > interrupt, count 27584 ><...>-54504 [086] d.h. 15340.745172: e1000_msix_other: got Other > interrupt, count 1 ><...>-54504 [086] d.h. 15340.745222: e1000_msix_other: got Other > interrupt, count 1 ><...>-54504 [086] d.h. 15340.745272: e1000_msix_other: got Other > interrupt, count 1 > > > hence sets the flag that (unfortunately) means both link is down and link > > state should be checked. Since this now happens 3000 times per second, > > the chances of it happening while the watchdog_task is checking the link > > state becomes pretty high, and it if does happen to coincice, then the > > watchdog_task will reset the adapter, which causes a real loss of link. > > Through which path does watchdog_task reset the adapter? I didn't > reproduce that. The other interrupt happens and sets get_link_status to true. At some point the watchdog_task runs on some core and calls e1000e_has_link, which then calls check_for_link to find out the current link status. While e1000e_check_for_copper_link is checking the link state and after updating get_link_status to false to indicate link is up, another interrupt occurs and another core handles it and changes get_link_status to true again. So by the time e1000e_has_link goes to determine the return value, get_link_state has changed back again so now it returns link down, and as a result the watchdog_task calls reset, because we have packets in the transmit queue (we were busy forwarding over 10 packets per second when it happened). Running on an Atom D525 which isn't very fast and uses hyperthreading might have something to do with how the scheduling manages to trigger this race condition. On a faster CPU you very likely would be done checking the link state quickly enough that the interrupt handler rarely gets a chance to interfere. Also we have the irq affinity set so the RX/TX of one port is handled by one CPU, the RX/TX of the other port by a different CPU and the Other interrupts and other tasks (like the watchdog) are handled by the last two CPUs. Either making the current link state its own bool and keeping it's meaning away from get_link_state, or making the interrupt handler only change get_link_state when LSC is actually present makes the problem go away. Having two meanings to get_link_state (both link state needs checking and what the link state is) causes issues. After all it is using a bool to store 3 values: Link is up, link needs checking but is up and link needs checking but is down. Of course the last two states are rather quantum, in that you don't know which it is until you check. > diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c > b/drivers/net/ethernet/intel/e1000e/netdev.c > index b3679728caac..689ad76d0d12 100644 > --- a/drivers/net/ethernet/intel/e1000e/netdev.c > +++ b/drivers/net/ethernet/intel/e1000e/netdev.c > @@ -46,6 +46,8 @@ > > #include "e1000.h" > > +DEFINE_RATELIMIT_STATE(e1000e_ratelimit_state, 2 * HZ, 4); > + > #define DRV_EXTRAVERSION "-k" > > #define DRV_VERSION "3.2.6" DRV_EXTRAVERSION > @@ -937,6 +939,8 @@ static bool e1000_clean_rx_irq(struct e1000_ring > *rx_ring, int *work_done, > bool cleaned = false; > unsigned int total_rx_bytes = 0, total_rx_packets = 0; > > + mdelay(10); > + > i = rx_ring->next_to_clean; > rx_desc = E1000_RX_DESC_EXT(*rx_ring, i); > staterr = le32_to_cpu(rx_desc->wb.upper.status_error); > @@ -1067,6 +1071,13 @@ static bool e1000_clean_rx_irq(struct e1000_ring > *rx_ring, int *work_done, > > adapter->total_rx_bytes += total_rx_bytes; > adapter->total_rx_packets += total_rx_packets; > + > + if (__ratelimit(&e1000e_ratelimit_state)) { > + static unsigned int max; > + max = max(max, total_rx_packets); > + trace_printk("received %u max %u\n", to
Re: commit 16ecba59 breaks 82574L under heavy load.
On 2017/07/18 10:21, Lennart Sorensen wrote: > Commit 16ecba59bc333d6282ee057fb02339f77a880beb has apparently broken > at least the 82574L under heavy load (as in load heavy enough to cause > packet drops). In this case, when running in MSI-X mode, the Other > Causes interrupt fires about 3000 times per second, but not due to link > state changes. Unfortunately this commit changed the driver to assume > that the Other Causes interrupt can only mean link state change and Thanks for the detailed analysis. Refering to the original discussion around this patch series, it seemed like the IMS bit for a condition had to be set for the Other interrupt to be raised for that condition. https://lkml.org/lkml/2015/11/4/683 In this case however, E1000_ICR_RXT0 is not set in IMS so Other shouldn't be raised for Receiver Overrun. Apparently something is going on... I can reproduce the spurious Other interrupts with a simple mdelay() With the debugging patch at the end of the mail I see stuff like this while blasting with udp frames: -0 [086] d.h1 15338.742675: e1000_msix_other: got Other interrupt, count 15127 <...>-54504 [086] d.h. 15338.742724: e1000_msix_other: got Other interrupt, count 1 <...>-54504 [086] d.h. 15338.742774: e1000_msix_other: got Other interrupt, count 1 <...>-54504 [086] d.h. 15338.742824: e1000_msix_other: got Other interrupt, count 1 -0 [086] d.h1 15340.745123: e1000_msix_other: got Other interrupt, count 27584 <...>-54504 [086] d.h. 15340.745172: e1000_msix_other: got Other interrupt, count 1 <...>-54504 [086] d.h. 15340.745222: e1000_msix_other: got Other interrupt, count 1 <...>-54504 [086] d.h. 15340.745272: e1000_msix_other: got Other interrupt, count 1 > hence sets the flag that (unfortunately) means both link is down and link > state should be checked. Since this now happens 3000 times per second, > the chances of it happening while the watchdog_task is checking the link > state becomes pretty high, and it if does happen to coincice, then the > watchdog_task will reset the adapter, which causes a real loss of link. Through which path does watchdog_task reset the adapter? I didn't reproduce that. diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index b3679728caac..689ad76d0d12 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -46,6 +46,8 @@ #include "e1000.h" +DEFINE_RATELIMIT_STATE(e1000e_ratelimit_state, 2 * HZ, 4); + #define DRV_EXTRAVERSION "-k" #define DRV_VERSION "3.2.6" DRV_EXTRAVERSION @@ -937,6 +939,8 @@ static bool e1000_clean_rx_irq(struct e1000_ring *rx_ring, int *work_done, bool cleaned = false; unsigned int total_rx_bytes = 0, total_rx_packets = 0; + mdelay(10); + i = rx_ring->next_to_clean; rx_desc = E1000_RX_DESC_EXT(*rx_ring, i); staterr = le32_to_cpu(rx_desc->wb.upper.status_error); @@ -1067,6 +1071,13 @@ static bool e1000_clean_rx_irq(struct e1000_ring *rx_ring, int *work_done, adapter->total_rx_bytes += total_rx_bytes; adapter->total_rx_packets += total_rx_packets; + + if (__ratelimit(&e1000e_ratelimit_state)) { + static unsigned int max; + max = max(max, total_rx_packets); + trace_printk("received %u max %u\n", total_rx_packets, max); + } + return cleaned; } @@ -1904,9 +1915,16 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct net_device *netdev = data; struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; + static unsigned int count; hw->mac.get_link_status = true; + count++; + if (__ratelimit(&e1000e_ratelimit_state)) { + trace_printk("got Other interrupt, count %u\n", count); + count = 0; + } + /* guard against interrupt when we're going down */ if (!test_bit(__E1000_DOWN, &adapter->state)) { mod_timer(&adapter->watchdog_timer, jiffies + 1); @@ -7121,7 +7139,7 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->netdev_ops = &e1000e_netdev_ops; e1000e_set_ethtool_ops(netdev); netdev->watchdog_timeo = 5 * HZ; - netif_napi_add(netdev, &adapter->napi, e1000e_poll, 64); + netif_napi_add(netdev, &adapter->napi, e1000e_poll, 500); strlcpy(netdev->name, pci_name(pdev), sizeof(netdev->name)); netdev->mem_start = mmio_start; @@ -7327,6 +7345,8 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_register; + ratelimit_set_flags(&e1000e_ratelimit_state, RATELIMIT_MSG_ON_RELEASE); + /* carrier off reporting is important to ethtool even BEFORE open */ ne