[PATCH RFC net-next v2] net: vlan/macvlan: count packets properly with gso

2018-11-29 Thread Debabrata Banerjee
Fix packet count when using vlan/macvlan drivers with gso. Without this it
is not possible to reconcile packet counts between underlying devices and
these virtual devices. Additionally, the output looks wrong in a standalone
way, i.e. device MTU of 1500, 1 packet sent, 31856 bytes sent.

There are many other drivers that likely have a similar problem, although
it is not clear how many of those could be used with gso. Perhaps all
packet counting should be wrapped in a helper fn.

v2: bytes were also incorrect for gso skb's, fix tx as that is readily
available. Fix rx packets for macvlan.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  4 ++--
 drivers/net/macvlan.c | 13 -
 drivers/net/macvtap.c |  2 +-
 include/linux/if_macvlan.h|  6 +++---
 net/8021q/vlan_core.c |  3 ++-
 net/8021q/vlan_dev.c  |  4 ++--
 7 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c 
b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index 6fd15a734324..e39fad2d888f 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -431,8 +431,8 @@ static void fm10k_type_trans(struct fm10k_ring *rx_ring,
if (!l2_accel)
skb_record_rx_queue(skb, rx_ring->queue_index);
else
-   macvlan_count_rx(netdev_priv(dev), skb->len + ETH_HLEN, true,
-false);
+   macvlan_count_rx(netdev_priv(dev), skb_shinfo(skb)->gso_segs ?: 
1,
+skb->len + ETH_HLEN, true, false);
 
skb->protocol = eth_type_trans(skb, dev);
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 49a4ea38eb07..474e72ec68b0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1704,8 +1704,8 @@ void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring,
if (netif_is_ixgbe(dev))
skb_record_rx_queue(skb, rx_ring->queue_index);
else
-   macvlan_count_rx(netdev_priv(dev), skb->len + ETH_HLEN, true,
-false);
+   macvlan_count_rx(netdev_priv(dev), skb_shinfo(skb)->gso_segs ?: 
1,
+skb->len + ETH_HLEN, true, false);
 
skb->protocol = eth_type_trans(skb, dev);
 }
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index fc8d5f1ee1ad..ab8743777897 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -289,7 +289,8 @@ static void macvlan_broadcast(struct sk_buff *skb,
nskb, vlan, eth,
mode == MACVLAN_MODE_BRIDGE) ?:
  netif_rx_ni(nskb);
-   macvlan_count_rx(vlan, skb->len + ETH_HLEN,
+   macvlan_count_rx(vlan, skb_shinfo(skb)->gso_segs ?: 1,
+skb->len + ETH_HLEN,
 err == NET_RX_SUCCESS, true);
}
}
@@ -418,7 +419,8 @@ static void macvlan_forward_source_one(struct sk_buff *skb,
nskb->pkt_type = PACKET_HOST;
 
ret = netif_rx(nskb);
-   macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
+   macvlan_count_rx(vlan, skb_shinfo(skb)->gso_segs ?: 1, len,
+ret == NET_RX_SUCCESS, false);
 }
 
 static void macvlan_forward_source(struct sk_buff *skb,
@@ -505,7 +507,8 @@ static rx_handler_result_t macvlan_handle_frame(struct 
sk_buff **pskb)
ret = NET_RX_SUCCESS;
handle_res = RX_HANDLER_ANOTHER;
 out:
-   macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
+   macvlan_count_rx(vlan, skb_shinfo(skb)->gso_segs ?: 1, len,
+ret == NET_RX_SUCCESS, false);
return handle_res;
 }
 
@@ -553,7 +556,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
  struct net_device *dev)
 {
struct macvlan_dev *vlan = netdev_priv(dev);
-   unsigned int len = skb->len;
+   unsigned int len = qdisc_skb_cb(skb)->pkt_len;
int ret;
 
if (unlikely(netpoll_tx_running(dev)))
@@ -566,7 +569,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 
pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
u64_stats_update_begin(&pcpu_stats->syncp);
-   pcpu_stats->tx_packets++;
+   pcpu_stats->tx_packets += skb_shinfo(skb)->gso_segs ?: 1;
pcpu_stats->tx_bytes += len;
u64_stats_update_end(&pcpu_stats->s

[PATCH RFC net-next] net: vlan/macvlan: count packets properly with gso

2018-11-29 Thread Debabrata Banerjee
Fix packet count when using vlan/macvlan drivers with gso. Without this it
is not possible to reconcile packet counts between underlying devices and
these virtual devices. Additionally, the output looks wrong in a standalone
way, i.e. device MTU of 1500, 1 packet sent, 31856 bytes sent.

There are many other drivers that likely have a similar problem, although
it is not clear how many of those could be used with gso. Perhaps all
packet counting should be wrapped in a helper fn.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/macvlan.c | 2 +-
 net/8021q/vlan_core.c | 2 +-
 net/8021q/vlan_dev.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index fc8d5f1ee1ad..15e67a87f202 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -566,7 +566,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 
pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
u64_stats_update_begin(&pcpu_stats->syncp);
-   pcpu_stats->tx_packets++;
+   pcpu_stats->tx_packets += max_t(u16, 1, 
skb_shinfo(skb)->gso_segs);
pcpu_stats->tx_bytes += len;
u64_stats_update_end(&pcpu_stats->syncp);
} else {
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index a313165e7a67..e85f6665d0ed 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -62,7 +62,7 @@ bool vlan_do_receive(struct sk_buff **skbp)
rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
 
u64_stats_update_begin(&rx_stats->syncp);
-   rx_stats->rx_packets++;
+   rx_stats->rx_packets += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
rx_stats->rx_bytes += skb->len;
if (skb->pkt_type == PACKET_MULTICAST)
rx_stats->rx_multicast++;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index b2d9c8f27cd7..b28e7535a0b9 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -135,7 +135,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff 
*skb,
 
stats = this_cpu_ptr(vlan->vlan_pcpu_stats);
u64_stats_update_begin(&stats->syncp);
-   stats->tx_packets++;
+   stats->tx_packets += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
stats->tx_bytes += len;
u64_stats_update_end(&stats->syncp);
} else {
-- 
2.19.2



[PATCH net-next v2] netpoll: allow cleanup to be synchronous

2018-10-18 Thread Debabrata Banerjee
This fixes a problem introduced by:
commit 2cde6acd49da ("netpoll: Fix __netpoll_rcu_free so that it can hold the 
rtnl lock")

When using netconsole on a bond, __netpoll_cleanup can asynchronously
recurse multiple times, each __netpoll_free_async call can result in
more __netpoll_free_async's. This means there is now a race between
cleanup_work queues on multiple netpoll_info's on multiple devices and
the configuration of a new netpoll. For example if a netconsole is set
to enable 0, reconfigured, and enable 1 immediately, this netconsole
will likely not work.

Given that the reason for __netpoll_free_async is that it can be called
when rtnl is not locked, we should be able to execute synchronously when
it is locked. It appears to be locked everywhere it's called from.

Generalize the design pattern from the teaming driver for current
callers of __netpoll_free_async.

CC: Neil Horman 
CC: "David S. Miller" 
Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_main.c |  3 ++-
 drivers/net/macvlan.c   |  2 +-
 drivers/net/team/team.c |  5 +
 include/linux/netpoll.h |  4 +---
 net/8021q/vlan_dev.c|  3 +--
 net/bridge/br_device.c  |  2 +-
 net/core/netpoll.c  | 20 +---
 net/dsa/slave.c |  2 +-
 8 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ee28ec9e0aba..ffa37adb7681 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -963,7 +963,8 @@ static inline void slave_disable_netpoll(struct slave 
*slave)
return;
 
slave->np = NULL;
-   __netpoll_free_async(np);
+
+   __netpoll_free(np);
 }
 
 static void bond_poll_controller(struct net_device *bond_dev)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index cfda146f3b3b..fc8d5f1ee1ad 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1077,7 +1077,7 @@ static void macvlan_dev_netpoll_cleanup(struct net_device 
*dev)
 
vlan->netpoll = NULL;
 
-   __netpoll_free_async(netpoll);
+   __netpoll_free(netpoll);
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index d887016e54b6..db633ae9f784 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1104,10 +1104,7 @@ static void team_port_disable_netpoll(struct team_port 
*port)
return;
port->np = NULL;
 
-   /* Wait for transmitting packets to finish before freeing. */
-   synchronize_rcu_bh();
-   __netpoll_cleanup(np);
-   kfree(np);
+   __netpoll_free(np);
 }
 #else
 static int team_port_enable_netpoll(struct team_port *port)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 3ef82d3a78db..676f1ff161a9 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -31,8 +31,6 @@ struct netpoll {
bool ipv6;
u16 local_port, remote_port;
u8 remote_mac[ETH_ALEN];
-
-   struct work_struct cleanup_work;
 };
 
 struct netpoll_info {
@@ -63,7 +61,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
 int netpoll_setup(struct netpoll *np);
 void __netpoll_cleanup(struct netpoll *np);
-void __netpoll_free_async(struct netpoll *np);
+void __netpoll_free(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
 void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 struct net_device *dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 546af0e73ac3..ff720f1ebf73 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev)
return;
 
vlan->netpoll = NULL;
-
-   __netpoll_free_async(netpoll);
+   __netpoll_free(netpoll);
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e053a4e43758..c6abf927f0c9 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p)
 
p->np = NULL;
 
-   __netpoll_free_async(np);
+   __netpoll_free(np);
 }
 
 #endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index de1d1ba92f2d..6ac71624ead4 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device 
*ndev)
 
np->dev = ndev;
strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
-   INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
 
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
@@ -790,10 +789,6 @@ void __netpoll_cleanup(struct netpoll *np)
 {
   

[PATCH net-next] netpoll: allow cleanup to be synchronous

2018-10-12 Thread Debabrata Banerjee
This fixes a problem introduced by:
commit 2cde6acd49da ("netpoll: Fix __netpoll_rcu_free so that it can hold the 
rtnl lock")

When using netconsole on a bond, __netpoll_cleanup can asynchronously
recurse multiple times, each __netpoll_free_async call can result in
more __netpoll_free_async's. This means there is now a race between
cleanup_work queues on multiple netpoll_info's on multiple devices and
the configuration of a new netpoll. For example if a netconsole is set
to enable 0, reconfigured, and enable 1 immediately, this netconsole
will likely not work.

Given that the reason for __netpoll_free_async is that it can be called
when rtnl is not locked, we should be able to execute synchronously when
it is locked.

CC: Neil Horman 
CC: "David S. Miller" 
Signed-off-by: Debabrata Banerjee 
---
 net/core/netpoll.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index de1d1ba92f2d..b899cbfbe639 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -826,7 +826,10 @@ static void netpoll_async_cleanup(struct work_struct *work)
 
 void __netpoll_free_async(struct netpoll *np)
 {
-   schedule_work(&np->cleanup_work);
+   if (rtnl_is_locked())
+   __netpoll_cleanup(np);
+   else
+   schedule_work(&np->cleanup_work);
 }
 EXPORT_SYMBOL_GPL(__netpoll_free_async);
 
-- 
2.19.1



[PATCH RFC net-next 1/1] tcp: close socket without reset on incoming data

2018-05-18 Thread Debabrata Banerjee
When TCP_CLOSE_NORST is set before a close(), offload sinking of
unwanted data to the kernel with low resource usage, with a timeout of
TCP_LINGER2. The socket will transition to FIN_WAIT1 and then FIN_WAIT2
where it will ack data until either the timeout is hit, or a RST or FIN
is received.

Signed-off-by: Debabrata Banerjee 
---
 include/linux/tcp.h  |  4 +++-
 include/uapi/linux/tcp.h |  2 +-
 net/ipv4/tcp.c   | 23 +--
 net/ipv4/tcp_input.c | 16 
 net/ipv4/tcp_minisocks.c | 15 +++
 5 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 72705eaf4b84..bd44bc99b480 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -226,7 +226,8 @@ struct tcp_sock {
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a 
cookie */
is_sack_reneg:1,/* in recovery from loss with SACK reneg? */
-   unused:2;
+   norst:1,/* Don't send RST on shutdown() socket */
+   unused:1;
u8  nonagle : 4,/* Disable Nagle algorithm? */
thin_lto: 1,/* Use linear timeouts for thin streams */
recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
@@ -429,6 +430,7 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
 #endif
+   int   tw_norst;
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 29eb659aa77a..369f3402b669 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -124,8 +124,8 @@ enum {
 #define TCP_FASTOPEN_NO_COOKIE 34  /* Enable TFO without a TFO cookie */
 #define TCP_ZEROCOPY_RECEIVE   35
 #define TCP_INQ36  /* Notify bytes available to 
read as a cmsg on read */
-
 #define TCP_CM_INQ TCP_INQ
+#define TCP_CLOSE_NORST37  /* Don't send RST on close()'d 
socket */
 
 struct tcp_repair_opt {
__u32   opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a2ea0bbf867..29fe763002e5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2318,8 +2318,10 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
int state;
+   struct tcp_sock *tp;
 
lock_sock(sk);
+   tp = tcp_sk(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
 
if (sk->sk_state == TCP_LISTEN) {
@@ -2362,8 +2364,19 @@ void tcp_close(struct sock *sk, long timeout)
} else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
-   tcp_set_state(sk, TCP_CLOSE);
-   tcp_send_active_reset(sk, sk->sk_allocation);
+
+   if (unlikely(tp->norst)) {
+   if (tcp_close_state(sk)) {
+   /* We will discard all new incoming data
+* set window to max of current or init.
+*/
+   tp->rcv_wnd = max(tp->rcv_wnd, MAX_TCP_WINDOW);
+   tcp_send_fin(sk);
+   }
+   } else {
+   tcp_set_state(sk, TCP_CLOSE);
+   tcp_send_active_reset(sk, sk->sk_allocation);
+   }
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -3040,6 +3053,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+   case TCP_CLOSE_NORST:
+   tp->norst = !!val;
+   break;
default:
err = -ENOPROTOOPT;
break;
@@ -3523,6 +3539,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return err;
}
 #endif
+   case TCP_CLOSE_NORST:
+   val = tp->norst;
+   break;
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf..e0aa6e126700 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6054,7 +6054,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
break;
}
 
-   if (tp->linger2 < 0) {
+   if (likely(!tp->norst) && tp->linger2 < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LI

[PATCH RFC net-next 0/1] tcp: close socket without reset on incoming data

2018-05-18 Thread Debabrata Banerjee

There is a basic problem with TCP sockets, where sending and closing of
data is unreliable. One good example of this is a web server that wants
to send an error back on a HTTP POST and close the socket, however
assuming the POST was of any significant size what really happens is
that the browser gets a broken socket while it is trying to post, and
never reads the error, possibly retrying the whole POST a number of
times. This has been well documented by other people, for example this
blog post:

https://blog.netherlabs.nl/articles/2009/01/18/the-ultimate-so_linger-page-or-why-is-my-tcp-not-reliable

Without this patch, our server application has to hang on to a socket
sink all of the POST data, eating up memory and cpu. With this patch
the task is offloaded to the kernel, which uses only a timewait socket
to efficiently ack and discard any incoming data. We've been using a
similar patch internally for years, I think it has applications for
everyone.

Debabrata Banerjee (1):
  tcp: close socket without reset on incoming data

 include/linux/tcp.h  |  4 +++-
 include/uapi/linux/tcp.h |  2 +-
 net/ipv4/tcp.c   | 23 +--
 net/ipv4/tcp_input.c | 16 
 net/ipv4/tcp_minisocks.c | 15 +++
 5 files changed, 52 insertions(+), 8 deletions(-)

-- 
2.17.0



[PATCH net-next] Revert "bonding: allow carrier and link status to determine link state"

2018-05-16 Thread Debabrata Banerjee
This reverts commit 1386c36b30388f46a95100924bfcae75160db715.

We don't want to encourage drivers to not report carrier status
correctly, therefore remove this commit.

Signed-off-by: Debabrata Banerjee 
---
 Documentation/networking/bonding.txt |  4 ++--
 drivers/net/bonding/bond_main.c  | 12 
 drivers/net/bonding/bond_options.c   |  7 +++
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/Documentation/networking/bonding.txt 
b/Documentation/networking/bonding.txt
index 86d07fbb592d..c13214d073a4 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -828,8 +828,8 @@ use_carrier
MII / ETHTOOL ioctl method to determine the link state.
 
A value of 1 enables the use of netif_carrier_ok(), a value of
-   0 will use the deprecated MII / ETHTOOL ioctls. A value of 2
-   will check both.  The default value is 1.
+   0 will use the deprecated MII / ETHTOOL ioctls.  The default
+   value is 1.
 
 xmit_hash_policy
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e4c253dc7dfb..a4cd7f6bfd4d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -132,7 +132,7 @@ MODULE_PARM_DESC(downdelay, "Delay before considering link 
down, "
"in milliseconds");
 module_param(use_carrier, int, 0);
 MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; 
"
- "0 for off, 1 for on (default), 2 for carrier 
then legacy checks");
+ "0 for off, 1 for on (default)");
 module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
   "1 for active-backup, 2 for balance-xor, "
@@ -434,16 +434,12 @@ static int bond_check_dev_link(struct bonding *bond,
int (*ioctl)(struct net_device *, struct ifreq *, int);
struct ifreq ifr;
struct mii_ioctl_data *mii;
-   bool carrier = true;
 
if (!reporting && !netif_running(slave_dev))
return 0;
 
if (bond->params.use_carrier)
-   carrier = netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
-
-   if (!carrier)
-   return carrier;
+   return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
 
/* Try to get link status using Ethtool first. */
if (slave_dev->ethtool_ops->get_link)
@@ -4407,8 +4403,8 @@ static int bond_check_params(struct bond_params *params)
downdelay = 0;
}
 
-   if (use_carrier < 0 || use_carrier > 2) {
-   pr_warn("Warning: use_carrier module parameter (%d), not of 
valid value (0-2), so it was set to 1\n",
+   if ((use_carrier != 0) && (use_carrier != 1)) {
+   pr_warn("Warning: use_carrier module parameter (%d), not of 
valid value (0/1), so it was set to 1\n",
use_carrier);
use_carrier = 1;
}
diff --git a/drivers/net/bonding/bond_options.c 
b/drivers/net/bonding/bond_options.c
index dba6cef05134..8a945c9341d6 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -164,10 +164,9 @@ static const struct bond_opt_value 
bond_primary_reselect_tbl[] = {
 };
 
 static const struct bond_opt_value bond_use_carrier_tbl[] = {
-   { "off",  0,  0},
-   { "on",   1,  BOND_VALFLAG_DEFAULT},
-   { "both", 2,  0},
-   { NULL,  -1,  0}
+   { "off", 0,  0},
+   { "on",  1,  BOND_VALFLAG_DEFAULT},
+   { NULL,  -1, 0}
 };
 
 static const struct bond_opt_value bond_all_slaves_active_tbl[] = {
-- 
2.17.0



[PATCH net-next v2 1/4] bonding: don't queue up extraneous rlb updates

2018-05-14 Thread Debabrata Banerjee
ARPs for incomplete entries can't be sent anyway.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 5eb0df2e5464..c2f6c58e4e6a 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -421,7 +421,8 @@ static void rlb_clear_slave(struct bonding *bond, struct 
slave *slave)
if (assigned_slave) {
rx_hash_table[index].slave = assigned_slave;
if 
(!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-mac_bcast)) {
+mac_bcast) &&
+   
!is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
bond_info->rx_hashtbl[index].ntt = 1;
bond_info->rx_ntt = 1;
/* A slave has been removed from the
@@ -524,7 +525,8 @@ static void rlb_req_update_slave_clients(struct bonding 
*bond, struct slave *sla
client_info = &(bond_info->rx_hashtbl[hash_index]);
 
if ((client_info->slave == slave) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
ntt = 1;
}
@@ -565,7 +567,8 @@ static void rlb_req_update_subnet_clients(struct bonding 
*bond, __be32 src_ip)
if ((client_info->ip_src == src_ip) &&
!ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 bond->dev->dev_addr) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond_info->rx_ntt = 1;
}
@@ -641,7 +644,8 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
ether_addr_copy(client_info->mac_src, arp->mac_src);
client_info->slave = assigned_slave;
 
-   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond->alb_info.rx_ntt = 1;
} else {
@@ -733,8 +737,10 @@ static void rlb_rebalance(struct bonding *bond)
assigned_slave = __rlb_next_rx_slave(bond);
if (assigned_slave && (client_info->slave != assigned_slave)) {
client_info->slave = assigned_slave;
-   client_info->ntt = 1;
-   ntt = 1;
+   if (!is_zero_ether_addr(client_info->mac_dst)) {
+   client_info->ntt = 1;
+   ntt = 1;
+   }
}
}
 
-- 
2.17.0



[PATCH net-next v2 3/4] bonding: allow use of tx hashing in balance-alb

2018-05-14 Thread Debabrata Banerjee
The rx load balancing provided by balance-alb is not mutually
exclusive with using hashing for tx selection, and should provide a decent
speed increase because this eliminates spinlocks and cache contention.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 20 ++--
 drivers/net/bonding/bond_main.c| 25 +++--
 drivers/net/bonding/bond_options.c |  2 +-
 include/net/bonding.h  | 11 +--
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 180e50f7806f..6228635880d5 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1478,8 +1478,24 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
}
 
if (do_tx_balance) {
-   hash_index = _simple_hash(hash_start, hash_size);
-   tx_slave = tlb_choose_channel(bond, hash_index, skb->len);
+   if (bond->params.tlb_dynamic_lb) {
+   hash_index = _simple_hash(hash_start, hash_size);
+   tx_slave = tlb_choose_channel(bond, hash_index, 
skb->len);
+   } else {
+   /*
+* do_tx_balance means we are free to select the 
tx_slave
+* So we do exactly what tlb would do for hash selection
+*/
+
+   struct bond_up_slave *slaves;
+   unsigned int count;
+
+   slaves = rcu_dereference(bond->slave_arr);
+   count = slaves ? READ_ONCE(slaves->count) : 0;
+   if (likely(count))
+   tx_slave = slaves->arr[bond_xmit_hash(bond, 
skb) %
+  count];
+   }
}
 
return bond_do_alb_xmit(skb, bond, tx_slave);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1f1e97b26f95..f7f8a49cb32b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -159,7 +159,7 @@ module_param(min_links, int, 0);
 MODULE_PARM_DESC(min_links, "Minimum number of available links before turning 
on carrier");
 
 module_param(xmit_hash_policy, charp, 0);
-MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; "
+MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 
802.3ad hashing method; "
   "0 for layer 2 (default), 1 for layer 3+4, "
   "2 for layer 2+3, 3 for encap layer 2+3, "
   "4 for encap layer 3+4");
@@ -1735,7 +1735,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev,
unblock_netpoll_tx();
}
 
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
 
bond->nest_level = dev_get_nest_level(bond_dev);
@@ -1870,7 +1870,7 @@ static int __bond_release_one(struct net_device *bond_dev,
if (BOND_MODE(bond) == BOND_MODE_8023AD)
bond_3ad_unbind_slave(slave);
 
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, slave);
 
netdev_info(bond_dev, "Releasing %s interface %s\n",
@@ -3102,7 +3102,7 @@ static int bond_slave_netdev_event(unsigned long event,
 * events. If these (miimon/arpmon) parameters are configured
 * then array gets refreshed twice and that should be fine!
 */
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
break;
case NETDEV_CHANGEMTU:
@@ -3322,7 +3322,7 @@ static int bond_open(struct net_device *bond_dev)
 */
if (bond_alb_initialize(bond, (BOND_MODE(bond) == 
BOND_MODE_ALB)))
return -ENOMEM;
-   if (bond->params.tlb_dynamic_lb)
+   if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == 
BOND_MODE_ALB)
queue_delayed_work(bond->wq, &bond->alb_work, 0);
}
 
@@ -3341,7 +3341,7 @@ static int bond_open(struct net_device *bond_dev)
bond_3ad_initiate_agg_selection(bond, 1);
}
 
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
 
return 0;
@@ -3892,7 +3892,7 @@ static void bond_slave_arr_handler(struct work_struct 
*work)
  * to determine the slave interface -
  * (a) BOND_MODE_8023AD
  * (b) BOND_MODE_XOR
- * (c) BOND_MODE_TLB &

[PATCH net-next v2 0/4] bonding: performance and reliability

2018-05-14 Thread Debabrata Banerjee
Series of fixes to how rlb updates are handled, code cleanup, allowing
higher performance tx hashing in balance-alb mode, and reliability of
link up/down monitoring.

v2: refactor bond_is_nondyn_tlb with inline fn, update log comment to
point out that multicast addresses will not get rlb updates.

Debabrata Banerjee (4):
  bonding: don't queue up extraneous rlb updates
  bonding: use common mac addr checks
  bonding: allow use of tx hashing in balance-alb
  bonding: allow carrier and link status to determine link state

 Documentation/networking/bonding.txt |  4 +--
 drivers/net/bonding/bond_alb.c   | 50 +---
 drivers/net/bonding/bond_main.c  | 37 
 drivers/net/bonding/bond_options.c   |  9 ++---
 include/net/bonding.h| 11 --
 5 files changed, 70 insertions(+), 41 deletions(-)

-- 
2.17.0



[PATCH net-next v2 4/4] bonding: allow carrier and link status to determine link state

2018-05-14 Thread Debabrata Banerjee
In a mixed environment it may be difficult to tell whether your hardware
supports carrier; if it does not, it can always report true. With a new
use_carrier option of 2, we can check both carrier and link status
sequentially, instead of one or the other.

Signed-off-by: Debabrata Banerjee 
---
 Documentation/networking/bonding.txt |  4 ++--
 drivers/net/bonding/bond_main.c  | 12 
 drivers/net/bonding/bond_options.c   |  7 ---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/Documentation/networking/bonding.txt 
b/Documentation/networking/bonding.txt
index 9ba04c0bab8d..f063730e7e73 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -828,8 +828,8 @@ use_carrier
MII / ETHTOOL ioctl method to determine the link state.
 
A value of 1 enables the use of netif_carrier_ok(), a value of
-   0 will use the deprecated MII / ETHTOOL ioctls.  The default
-   value is 1.
+   0 will use the deprecated MII / ETHTOOL ioctls. A value of 2
+   will check both.  The default value is 1.
 
 xmit_hash_policy
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f7f8a49cb32b..7e9652c4b35c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -132,7 +132,7 @@ MODULE_PARM_DESC(downdelay, "Delay before considering link 
down, "
"in milliseconds");
 module_param(use_carrier, int, 0);
 MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; 
"
- "0 for off, 1 for on (default)");
+ "0 for off, 1 for on (default), 2 for carrier 
then legacy checks");
 module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
   "1 for active-backup, 2 for balance-xor, "
@@ -434,12 +434,16 @@ static int bond_check_dev_link(struct bonding *bond,
int (*ioctl)(struct net_device *, struct ifreq *, int);
struct ifreq ifr;
struct mii_ioctl_data *mii;
+   bool carrier = true;
 
if (!reporting && !netif_running(slave_dev))
return 0;
 
if (bond->params.use_carrier)
-   return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
+   carrier = netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
+
+   if (!carrier)
+   return carrier;
 
/* Try to get link status using Ethtool first. */
if (slave_dev->ethtool_ops->get_link)
@@ -4399,8 +4403,8 @@ static int bond_check_params(struct bond_params *params)
downdelay = 0;
}
 
-   if ((use_carrier != 0) && (use_carrier != 1)) {
-   pr_warn("Warning: use_carrier module parameter (%d), not of 
valid value (0/1), so it was set to 1\n",
+   if (use_carrier < 0 || use_carrier > 2) {
+   pr_warn("Warning: use_carrier module parameter (%d), not of 
valid value (0-2), so it was set to 1\n",
use_carrier);
use_carrier = 1;
}
diff --git a/drivers/net/bonding/bond_options.c 
b/drivers/net/bonding/bond_options.c
index 8a945c9341d6..dba6cef05134 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -164,9 +164,10 @@ static const struct bond_opt_value 
bond_primary_reselect_tbl[] = {
 };
 
 static const struct bond_opt_value bond_use_carrier_tbl[] = {
-   { "off", 0,  0},
-   { "on",  1,  BOND_VALFLAG_DEFAULT},
-   { NULL,  -1, 0}
+   { "off",  0,  0},
+   { "on",   1,  BOND_VALFLAG_DEFAULT},
+   { "both", 2,  0},
+   { NULL,  -1,  0}
 };
 
 static const struct bond_opt_value bond_all_slaves_active_tbl[] = {
-- 
2.17.0



[PATCH net-next v2 2/4] bonding: use common mac addr checks

2018-05-14 Thread Debabrata Banerjee
Replace homegrown mac addr checks with faster defs from etherdevice.h

Note that this will also prevent any rlb arp updates for multicast
addresses, however this should have been forbidden anyway.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 28 +---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c2f6c58e4e6a..180e50f7806f 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -40,11 +40,6 @@
 #include 
 #include 
 
-
-
-static const u8 mac_bcast[ETH_ALEN + 2] __long_aligned = {
-   0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
 static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = {
0x33, 0x33, 0x00, 0x00, 0x00, 0x01
 };
@@ -420,9 +415,7 @@ static void rlb_clear_slave(struct bonding *bond, struct 
slave *slave)
 
if (assigned_slave) {
rx_hash_table[index].slave = assigned_slave;
-   if 
(!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-mac_bcast) &&
-   
!is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
+   if 
(is_valid_ether_addr(rx_hash_table[index].mac_dst)) {
bond_info->rx_hashtbl[index].ntt = 1;
bond_info->rx_ntt = 1;
/* A slave has been removed from the
@@ -525,8 +518,7 @@ static void rlb_req_update_slave_clients(struct bonding 
*bond, struct slave *sla
client_info = &(bond_info->rx_hashtbl[hash_index]);
 
if ((client_info->slave == slave) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-   !is_zero_ether_addr(client_info->mac_dst)) {
+   is_valid_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
ntt = 1;
}
@@ -567,8 +559,7 @@ static void rlb_req_update_subnet_clients(struct bonding 
*bond, __be32 src_ip)
if ((client_info->ip_src == src_ip) &&
!ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 bond->dev->dev_addr) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-   !is_zero_ether_addr(client_info->mac_dst)) {
+   is_valid_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond_info->rx_ntt = 1;
}
@@ -596,7 +587,7 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
if ((client_info->ip_src == arp->ip_src) &&
(client_info->ip_dst == arp->ip_dst)) {
/* the entry is already assigned to this client */
-   if (!ether_addr_equal_64bits(arp->mac_dst, mac_bcast)) {
+   if (!is_broadcast_ether_addr(arp->mac_dst)) {
/* update mac address from arp */
ether_addr_copy(client_info->mac_dst, 
arp->mac_dst);
}
@@ -644,8 +635,7 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
ether_addr_copy(client_info->mac_src, arp->mac_src);
client_info->slave = assigned_slave;
 
-   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-   !is_zero_ether_addr(client_info->mac_dst)) {
+   if (is_valid_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond->alb_info.rx_ntt = 1;
} else {
@@ -1418,9 +1408,9 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
case ETH_P_IP: {
const struct iphdr *iph = ip_hdr(skb);
 
-   if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast) ||
-   (iph->daddr == ip_bcast) ||
-   (iph->protocol == IPPROTO_IGMP)) {
+   if (is_broadcast_ether_addr(eth_data->h_dest) ||
+   iph->daddr == ip_bcast ||
+   iph->protocol == IPPROTO_IGMP) {
do_tx_balance = false;
break;
}
@@ -1432,7 +1422,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
/* IPv6 doesn't really use broadcast mac address, but leave
 * that here just in case.
 */
-   if (e

[PATCH net-next 0/4] bonding: performance and reliability

2018-05-11 Thread Debabrata Banerjee
Series of fixes to how rlb updates are handled, code cleanup, allowing
higher performance tx hashing in balance-alb mode, and reliability of
link up/down monitoring.

Debabrata Banerjee (4):
  bonding: don't queue up extraneous rlb updates
  bonding: use common mac addr checks
  bonding: allow use of tx hashing in balance-alb
  bonding: allow carrier and link status to determine link state

 Documentation/networking/bonding.txt |  4 +--
 drivers/net/bonding/bond_alb.c   | 50 +---
 drivers/net/bonding/bond_main.c  | 37 
 drivers/net/bonding/bond_options.c   |  9 ++---
 include/net/bonding.h| 10 +-
 5 files changed, 70 insertions(+), 40 deletions(-)

-- 
2.17.0



[PATCH net-next 4/4] bonding: allow carrier and link status to determine link state

2018-05-11 Thread Debabrata Banerjee
In a mixed environment it may be difficult to tell if your hardware
supports carrier; if it does not, it can always report true. With a new
use_carrier option of 2, we can check both carrier and link status
sequentially, instead of one or the other.

Signed-off-by: Debabrata Banerjee 
---
 Documentation/networking/bonding.txt |  4 ++--
 drivers/net/bonding/bond_main.c  | 12 
 drivers/net/bonding/bond_options.c   |  7 ---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/Documentation/networking/bonding.txt 
b/Documentation/networking/bonding.txt
index 9ba04c0bab8d..f063730e7e73 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -828,8 +828,8 @@ use_carrier
MII / ETHTOOL ioctl method to determine the link state.
 
A value of 1 enables the use of netif_carrier_ok(), a value of
-   0 will use the deprecated MII / ETHTOOL ioctls.  The default
-   value is 1.
+   0 will use the deprecated MII / ETHTOOL ioctls. A value of 2
+   will check both.  The default value is 1.
 
 xmit_hash_policy
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f7f8a49cb32b..7e9652c4b35c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -132,7 +132,7 @@ MODULE_PARM_DESC(downdelay, "Delay before considering link 
down, "
"in milliseconds");
 module_param(use_carrier, int, 0);
 MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; 
"
- "0 for off, 1 for on (default)");
+ "0 for off, 1 for on (default), 2 for carrier 
then legacy checks");
 module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
   "1 for active-backup, 2 for balance-xor, "
@@ -434,12 +434,16 @@ static int bond_check_dev_link(struct bonding *bond,
int (*ioctl)(struct net_device *, struct ifreq *, int);
struct ifreq ifr;
struct mii_ioctl_data *mii;
+   bool carrier = true;
 
if (!reporting && !netif_running(slave_dev))
return 0;
 
if (bond->params.use_carrier)
-   return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
+   carrier = netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
+
+   if (!carrier)
+   return carrier;
 
/* Try to get link status using Ethtool first. */
if (slave_dev->ethtool_ops->get_link)
@@ -4399,8 +4403,8 @@ static int bond_check_params(struct bond_params *params)
downdelay = 0;
}
 
-   if ((use_carrier != 0) && (use_carrier != 1)) {
-   pr_warn("Warning: use_carrier module parameter (%d), not of 
valid value (0/1), so it was set to 1\n",
+   if (use_carrier < 0 || use_carrier > 2) {
+   pr_warn("Warning: use_carrier module parameter (%d), not of 
valid value (0-2), so it was set to 1\n",
use_carrier);
use_carrier = 1;
}
diff --git a/drivers/net/bonding/bond_options.c 
b/drivers/net/bonding/bond_options.c
index 8a945c9341d6..dba6cef05134 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -164,9 +164,10 @@ static const struct bond_opt_value 
bond_primary_reselect_tbl[] = {
 };
 
 static const struct bond_opt_value bond_use_carrier_tbl[] = {
-   { "off", 0,  0},
-   { "on",  1,  BOND_VALFLAG_DEFAULT},
-   { NULL,  -1, 0}
+   { "off",  0,  0},
+   { "on",   1,  BOND_VALFLAG_DEFAULT},
+   { "both", 2,  0},
+   { NULL,  -1,  0}
 };
 
 static const struct bond_opt_value bond_all_slaves_active_tbl[] = {
-- 
2.17.0



[PATCH net-next 1/4] bonding: don't queue up extraneous rlb updates

2018-05-11 Thread Debabrata Banerjee
ARPs for incomplete entries can't be sent anyway.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 5eb0df2e5464..c2f6c58e4e6a 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -421,7 +421,8 @@ static void rlb_clear_slave(struct bonding *bond, struct 
slave *slave)
if (assigned_slave) {
rx_hash_table[index].slave = assigned_slave;
if 
(!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-mac_bcast)) {
+mac_bcast) &&
+   
!is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
bond_info->rx_hashtbl[index].ntt = 1;
bond_info->rx_ntt = 1;
/* A slave has been removed from the
@@ -524,7 +525,8 @@ static void rlb_req_update_slave_clients(struct bonding 
*bond, struct slave *sla
client_info = &(bond_info->rx_hashtbl[hash_index]);
 
if ((client_info->slave == slave) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
ntt = 1;
}
@@ -565,7 +567,8 @@ static void rlb_req_update_subnet_clients(struct bonding 
*bond, __be32 src_ip)
if ((client_info->ip_src == src_ip) &&
!ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 bond->dev->dev_addr) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond_info->rx_ntt = 1;
}
@@ -641,7 +644,8 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
ether_addr_copy(client_info->mac_src, arp->mac_src);
client_info->slave = assigned_slave;
 
-   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond->alb_info.rx_ntt = 1;
} else {
@@ -733,8 +737,10 @@ static void rlb_rebalance(struct bonding *bond)
assigned_slave = __rlb_next_rx_slave(bond);
if (assigned_slave && (client_info->slave != assigned_slave)) {
client_info->slave = assigned_slave;
-   client_info->ntt = 1;
-   ntt = 1;
+   if (!is_zero_ether_addr(client_info->mac_dst)) {
+   client_info->ntt = 1;
+   ntt = 1;
+   }
}
}
 
-- 
2.17.0



[PATCH net-next 3/4] bonding: allow use of tx hashing in balance-alb

2018-05-11 Thread Debabrata Banerjee
The rx load balancing provided by balance-alb is not mutually
exclusive with using hashing for tx selection, and should provide a decent
speed increase because this eliminates spinlocks and cache contention.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 20 ++--
 drivers/net/bonding/bond_main.c| 25 +++--
 drivers/net/bonding/bond_options.c |  2 +-
 include/net/bonding.h  | 10 +-
 4 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 180e50f7806f..6228635880d5 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1478,8 +1478,24 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
}
 
if (do_tx_balance) {
-   hash_index = _simple_hash(hash_start, hash_size);
-   tx_slave = tlb_choose_channel(bond, hash_index, skb->len);
+   if (bond->params.tlb_dynamic_lb) {
+   hash_index = _simple_hash(hash_start, hash_size);
+   tx_slave = tlb_choose_channel(bond, hash_index, 
skb->len);
+   } else {
+   /*
+* do_tx_balance means we are free to select the 
tx_slave
+* So we do exactly what tlb would do for hash selection
+*/
+
+   struct bond_up_slave *slaves;
+   unsigned int count;
+
+   slaves = rcu_dereference(bond->slave_arr);
+   count = slaves ? READ_ONCE(slaves->count) : 0;
+   if (likely(count))
+   tx_slave = slaves->arr[bond_xmit_hash(bond, 
skb) %
+  count];
+   }
}
 
return bond_do_alb_xmit(skb, bond, tx_slave);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1f1e97b26f95..f7f8a49cb32b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -159,7 +159,7 @@ module_param(min_links, int, 0);
 MODULE_PARM_DESC(min_links, "Minimum number of available links before turning 
on carrier");
 
 module_param(xmit_hash_policy, charp, 0);
-MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; "
+MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 
802.3ad hashing method; "
   "0 for layer 2 (default), 1 for layer 3+4, "
   "2 for layer 2+3, 3 for encap layer 2+3, "
   "4 for encap layer 3+4");
@@ -1735,7 +1735,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev,
unblock_netpoll_tx();
}
 
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
 
bond->nest_level = dev_get_nest_level(bond_dev);
@@ -1870,7 +1870,7 @@ static int __bond_release_one(struct net_device *bond_dev,
if (BOND_MODE(bond) == BOND_MODE_8023AD)
bond_3ad_unbind_slave(slave);
 
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, slave);
 
netdev_info(bond_dev, "Releasing %s interface %s\n",
@@ -3102,7 +3102,7 @@ static int bond_slave_netdev_event(unsigned long event,
 * events. If these (miimon/arpmon) parameters are configured
 * then array gets refreshed twice and that should be fine!
 */
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
break;
case NETDEV_CHANGEMTU:
@@ -3322,7 +3322,7 @@ static int bond_open(struct net_device *bond_dev)
 */
if (bond_alb_initialize(bond, (BOND_MODE(bond) == 
BOND_MODE_ALB)))
return -ENOMEM;
-   if (bond->params.tlb_dynamic_lb)
+   if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == 
BOND_MODE_ALB)
queue_delayed_work(bond->wq, &bond->alb_work, 0);
}
 
@@ -3341,7 +3341,7 @@ static int bond_open(struct net_device *bond_dev)
bond_3ad_initiate_agg_selection(bond, 1);
}
 
-   if (bond_mode_uses_xmit_hash(bond))
+   if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
 
return 0;
@@ -3892,7 +3892,7 @@ static void bond_slave_arr_handler(struct work_struct 
*work)
  * to determine the slave interface -
  * (a) BOND_MODE_8023AD
  * (b) BOND_MODE_XOR
- * (c) BOND_MODE_TLB &

[PATCH net-next 2/4] bonding: use common mac addr checks

2018-05-11 Thread Debabrata Banerjee
Replace homegrown mac addr checks with faster defs from etherdevice.h

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 28 +---
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c2f6c58e4e6a..180e50f7806f 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -40,11 +40,6 @@
 #include 
 #include 
 
-
-
-static const u8 mac_bcast[ETH_ALEN + 2] __long_aligned = {
-   0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
 static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = {
0x33, 0x33, 0x00, 0x00, 0x00, 0x01
 };
@@ -420,9 +415,7 @@ static void rlb_clear_slave(struct bonding *bond, struct 
slave *slave)
 
if (assigned_slave) {
rx_hash_table[index].slave = assigned_slave;
-   if 
(!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-mac_bcast) &&
-   
!is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
+   if 
(is_valid_ether_addr(rx_hash_table[index].mac_dst)) {
bond_info->rx_hashtbl[index].ntt = 1;
bond_info->rx_ntt = 1;
/* A slave has been removed from the
@@ -525,8 +518,7 @@ static void rlb_req_update_slave_clients(struct bonding 
*bond, struct slave *sla
client_info = &(bond_info->rx_hashtbl[hash_index]);
 
if ((client_info->slave == slave) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-   !is_zero_ether_addr(client_info->mac_dst)) {
+   is_valid_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
ntt = 1;
}
@@ -567,8 +559,7 @@ static void rlb_req_update_subnet_clients(struct bonding 
*bond, __be32 src_ip)
if ((client_info->ip_src == src_ip) &&
!ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 bond->dev->dev_addr) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-   !is_zero_ether_addr(client_info->mac_dst)) {
+   is_valid_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond_info->rx_ntt = 1;
}
@@ -596,7 +587,7 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
if ((client_info->ip_src == arp->ip_src) &&
(client_info->ip_dst == arp->ip_dst)) {
/* the entry is already assigned to this client */
-   if (!ether_addr_equal_64bits(arp->mac_dst, mac_bcast)) {
+   if (!is_broadcast_ether_addr(arp->mac_dst)) {
/* update mac address from arp */
ether_addr_copy(client_info->mac_dst, 
arp->mac_dst);
}
@@ -644,8 +635,7 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
ether_addr_copy(client_info->mac_src, arp->mac_src);
client_info->slave = assigned_slave;
 
-   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-   !is_zero_ether_addr(client_info->mac_dst)) {
+   if (is_valid_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond->alb_info.rx_ntt = 1;
} else {
@@ -1418,9 +1408,9 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
case ETH_P_IP: {
const struct iphdr *iph = ip_hdr(skb);
 
-   if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast) ||
-   (iph->daddr == ip_bcast) ||
-   (iph->protocol == IPPROTO_IGMP)) {
+   if (is_broadcast_ether_addr(eth_data->h_dest) ||
+   iph->daddr == ip_bcast ||
+   iph->protocol == IPPROTO_IGMP) {
do_tx_balance = false;
break;
}
@@ -1432,7 +1422,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
/* IPv6 doesn't really use broadcast mac address, but leave
 * that here just in case.
 */
-   if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast)) {
+   if (is_broadcast_ether_addr(eth_data->h_dest)) {
do_tx_balance = false;
break;
}
-- 
2.17.0



[PATCH net 2/2] bonding: send learning packets for vlans on slave

2018-05-09 Thread Debabrata Banerjee
There was a regression at some point from the intended functionality of
commit f60c3704e87d ("bonding: Fix alb mode to only use first level
vlans.")

Given the return value of vlan_get_encap_level() we need to store the nest
level of the bond device, and then compare the vlan's encap level to
this. Without this, this check always fails and learning packets are
never sent.

In addition, this same commit caused a regression in the behavior of
balance_alb, which requires learning packets be sent for all interfaces
using the slave's mac in order to load balance properly. For vlans
that have not set a user mac, we can send after checking one bit.
Otherwise we need to send the set mac, albeit defeating rx load balancing
for that vlan.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c  | 13 -
 drivers/net/bonding/bond_main.c |  2 ++
 include/net/bonding.h   |  1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 3f6faa657360..5eb0df2e5464 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -943,6 +943,10 @@ static void alb_send_lp_vid(struct slave *slave, u8 
mac_addr[],
skb->priority = TC_PRIO_CONTROL;
skb->dev = slave->dev;
 
+   netdev_dbg(slave->bond->dev,
+  "Send learning packet: dev %s mac %pM vlan %d\n",
+  slave->dev->name, mac_addr, vid);
+
if (vid)
__vlan_hwaccel_put_tag(skb, vlan_proto, vid);
 
@@ -965,14 +969,13 @@ static int alb_upper_dev_walk(struct net_device *upper, 
void *_data)
u8 *mac_addr = data->mac_addr;
struct bond_vlan_tag *tags;
 
-   if (is_vlan_dev(upper) && vlan_get_encap_level(upper) == 0) {
-   if (strict_match &&
-   ether_addr_equal_64bits(mac_addr,
-   upper->dev_addr)) {
+   if (is_vlan_dev(upper) &&
+   bond->nest_level == vlan_get_encap_level(upper) - 1) {
+   if (upper->addr_assign_type == NET_ADDR_STOLEN) {
alb_send_lp_vid(slave, mac_addr,
vlan_dev_vlan_proto(upper),
vlan_dev_vlan_id(upper));
-   } else if (!strict_match) {
+   } else {
alb_send_lp_vid(slave, upper->dev_addr,
vlan_dev_vlan_proto(upper),
vlan_dev_vlan_id(upper));
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e4914e3a0..1f1e97b26f95 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1738,6 +1738,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev,
if (bond_mode_uses_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
 
+   bond->nest_level = dev_get_nest_level(bond_dev);
+
netdev_info(bond_dev, "Enslaving %s as %s interface with %s link\n",
slave_dev->name,
bond_is_active_slave(new_slave) ? "an active" : "a backup",
diff --git a/include/net/bonding.h b/include/net/bonding.h
index f801fc940b29..b52235158836 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -198,6 +198,7 @@ struct bonding {
struct   slave __rcu *primary_slave;
struct   bond_up_slave __rcu *slave_arr; /* Array of usable slaves */
bool force_primary;
+   u32  nest_level;
s32  slave_cnt; /* never change this value outside the 
attach/detach wrappers */
int (*recv_probe)(const struct sk_buff *, struct bonding *,
  struct slave *);
-- 
2.17.0



[PATCH net 1/2] bonding: do not allow rlb updates to invalid mac

2018-05-09 Thread Debabrata Banerjee
Make sure multicast, broadcast, and zero mac's cannot be the output of rlb
updates, which should all be directed arps. Receive load balancing will be
collapsed if any of these happen, as the switch will broadcast.

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/bonding/bond_alb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529e7bd1..3f6faa657360 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -450,7 +450,7 @@ static void rlb_update_client(struct rlb_client_info 
*client_info)
 {
int i;
 
-   if (!client_info->slave)
+   if (!client_info->slave || !is_valid_ether_addr(client_info->mac_dst))
return;
 
for (i = 0; i < RLB_ARP_BURST_SIZE; i++) {
-- 
2.17.0



[PATCH net 0/2] bonding: bug fixes and regressions

2018-05-09 Thread Debabrata Banerjee
Fixes to bonding driver for balance-alb mode, suitable for stable.

Debabrata Banerjee (2):
  bonding: do not allow rlb updates to invalid mac
  bonding: send learning packets for vlans on slave

 drivers/net/bonding/bond_alb.c  | 15 +--
 drivers/net/bonding/bond_main.c |  2 ++
 include/net/bonding.h   |  1 +
 3 files changed, 12 insertions(+), 6 deletions(-)

-- 
2.17.0



[PATCH] Fix handling of verdicts after NF_QUEUE

2017-12-13 Thread Debabrata Banerjee
A verdict of NF_STOLEN after NF_QUEUE will cause an incorrect return value
and a potential kernel panic via double free of skb's

This was broken by commit 7034b566a4e7 ("netfilter: fix nf_queue handling")
and subsequently fixed in v4.10 by commit c63cbc460419 ("netfilter:
use switch() to handle verdict cases from nf_hook_slow()"). However that
commit cannot be cleanly cherry-picked to v4.9

Signed-off-by: Debabrata Banerjee 

---

This fix is only needed for v4.9 stable since v4.10+ does not have the
issue
---
 net/netfilter/core.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 004af030ef1a..d869ea50623e 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -364,6 +364,11 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state 
*state)
ret = nf_queue(skb, state, &entry, verdict);
if (ret == 1 && entry)
goto next_hook;
+   } else {
+   /* Implicit handling for NF_STOLEN, as well as any other
+* non conventional verdicts.
+*/
+   ret = 0;
}
return ret;
 }
-- 
2.15.1



[PATCH] Fix handling of verdicts after NF_QUEUE

2017-12-11 Thread Debabrata Banerjee
A verdict of NF_STOLEN after NF_QUEUE will cause an incorrect return value
and a potential kernel panic via double free of skb's

This was broken by commit 7034b566a4e7 ("netfilter: fix nf_queue handling")
and subsequently fixed in v4.10 by commit c63cbc460419 ("netfilter:
use switch() to handle verdict cases from nf_hook_slow()"). However that
commit cannot be cleanly cherry-picked to v4.9

Signed-off-by: Debabrata Banerjee 

---

This fix is only needed for v4.9 stable since v4.10+ does not have the
issue
---
 net/netfilter/core.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 004af030ef1a..d869ea50623e 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -364,6 +364,11 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state 
*state)
ret = nf_queue(skb, state, &entry, verdict);
if (ret == 1 && entry)
goto next_hook;
+   } else {
+   /* Implicit handling for NF_STOLEN, as well as any other
+* non conventional verdicts.
+*/
+   ret = 0;
}
return ret;
 }
-- 
2.15.1



[PATCH net-next] macvlan: Set nocarrier when lowerdev admin down

2016-04-07 Thread Debabrata Banerjee
When the lowerdev is set administratively down, disable carrier on the
macvlan interface. This means operstate gets set properly instead of
still being "up".

Signed-off-by: Debabrata Banerjee 
---
 drivers/net/macvlan.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..16d0e56 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1525,10 +1525,14 @@ static int macvlan_device_event(struct notifier_block 
*unused,
 
switch (event) {
case NETDEV_UP:
+   case NETDEV_DOWN:
case NETDEV_CHANGE:
-   list_for_each_entry(vlan, &port->vlans, list)
+   list_for_each_entry(vlan, &port->vlans, list) {
netif_stacked_transfer_operstate(vlan->lowerdev,
 vlan->dev);
+   if (!(vlan->lowerdev->flags & IFF_UP))
+   netif_carrier_off(vlan->dev);
+   }
break;
case NETDEV_FEAT_CHANGE:
list_for_each_entry(vlan, &port->vlans, list) {
-- 
2.8.0



[PATCH net-next v2] macvlan: Support interface operstate properly

2016-04-06 Thread Debabrata Banerjee
Set appropriate macvlan interface status based on lower device and our
status. Can be up, down, or lowerlayerdown.

de7d244d0 improved operstate by setting it from unknown to up, however
it did not handle transferring down or lowerlayerdown.

Signed-off-by: Debabrata Banerjee 
---
v2: Fix locking and update commit message

 drivers/net/macvlan.c | 47 +--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..306124ba 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -91,6 +91,7 @@ static struct macvlan_port *macvlan_port_get_rtnl(const 
struct net_device *dev)
 }
 
 #define macvlan_port_exists(dev) (dev->priv_flags & IFF_MACVLAN_PORT)
+#define is_macvlan(dev) (dev->priv_flags & IFF_MACVLAN)
 
 static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
   const unsigned char *addr)
@@ -1242,6 +1243,28 @@ static int macvlan_changelink_sources(struct macvlan_dev 
*vlan, u32 mode,
return 0;
 }
 
+static void macvlan_set_operstate(struct net_device *lowerdev,
+ struct net_device *dev)
+{
+   unsigned char newstate = dev->operstate;
+
+   if (!(dev->flags & IFF_UP))
+   newstate = IF_OPER_DOWN;
+   else if ((lowerdev->flags & IFF_UP) && netif_oper_up(lowerdev))
+   newstate = IF_OPER_UP;
+   else
+   newstate = IF_OPER_LOWERLAYERDOWN;
+
+   write_lock_bh(&dev_base_lock);
+   if (dev->operstate != newstate) {
+   dev->operstate = newstate;
+   write_unlock_bh(&dev_base_lock);
+   netdev_state_change(dev);
+   } else {
+   write_unlock_bh(&dev_base_lock);
+   }
+}
+
 int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
   struct nlattr *tb[], struct nlattr *data[])
 {
@@ -1324,6 +1347,7 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
 
list_add_tail_rcu(&vlan->list, &port->vlans);
netif_stacked_transfer_operstate(lowerdev, dev);
+   macvlan_set_operstate(lowerdev, dev);
linkwatch_fire_event(dev);
 
return 0;
@@ -1518,17 +1542,36 @@ static int macvlan_device_event(struct notifier_block 
*unused,
struct macvlan_port *port;
LIST_HEAD(list_kill);
 
-   if (!macvlan_port_exists(dev))
+   if (!macvlan_port_exists(dev) && !is_macvlan(dev))
+   return NOTIFY_DONE;
+
+   if (is_macvlan(dev)) {
+   vlan = netdev_priv(dev);
+
+   switch (event) {
+   case NETDEV_UP:
+   case NETDEV_DOWN:
+   case NETDEV_CHANGE:
+   netif_stacked_transfer_operstate(vlan->lowerdev,
+vlan->dev);
+   macvlan_set_operstate(vlan->lowerdev, vlan->dev);
+   break;
+   }
+
return NOTIFY_DONE;
+   }
 
port = macvlan_port_get_rtnl(dev);
 
switch (event) {
case NETDEV_UP:
+   case NETDEV_DOWN:
case NETDEV_CHANGE:
-   list_for_each_entry(vlan, &port->vlans, list)
+   list_for_each_entry(vlan, &port->vlans, list) {
netif_stacked_transfer_operstate(vlan->lowerdev,
 vlan->dev);
+   macvlan_set_operstate(vlan->lowerdev, vlan->dev);
+   }
break;
case NETDEV_FEAT_CHANGE:
list_for_each_entry(vlan, &port->vlans, list) {
-- 
2.8.0



[PATCH] macvlan: Support interface operstate properly

2016-04-06 Thread Debabrata Banerjee
Set appropriate macvlan interface status based on lower device and our
status. Can be up, down, or lowerlayerdown.

Signed-off-by: Debabrata Banerjee 

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..0f4b000 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -91,6 +91,7 @@ static struct macvlan_port *macvlan_port_get_rtnl(const 
struct net_device *dev)
 }
 
 #define macvlan_port_exists(dev) (dev->priv_flags & IFF_MACVLAN_PORT)
+#define is_macvlan(dev) (dev->priv_flags & IFF_MACVLAN)
 
 static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
   const unsigned char *addr)
@@ -1242,6 +1243,26 @@ static int macvlan_changelink_sources(struct macvlan_dev 
*vlan, u32 mode,
return 0;
 }
 
+static void macvlan_set_operstate(struct net_device *lowerdev,
+ struct net_device *dev)
+{
+   unsigned char newstate = dev->operstate;
+
+   if (!(dev->flags & IFF_UP))
+   newstate = IF_OPER_DOWN;
+   else if ((lowerdev->flags & IFF_UP) && netif_oper_up(lowerdev))
+   newstate = IF_OPER_UP;
+   else
+   newstate = IF_OPER_LOWERLAYERDOWN;
+
+   if (dev->operstate != newstate) {
+   write_lock_bh(&dev_base_lock);
+   dev->operstate = newstate;
+   netdev_state_change(dev);
+   write_unlock_bh(&dev_base_lock);
+   }
+}
+
 int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
   struct nlattr *tb[], struct nlattr *data[])
 {
@@ -1324,6 +1345,7 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
 
list_add_tail_rcu(&vlan->list, &port->vlans);
netif_stacked_transfer_operstate(lowerdev, dev);
+   macvlan_set_operstate(lowerdev, dev);
linkwatch_fire_event(dev);
 
return 0;
@@ -1518,17 +1540,36 @@ static int macvlan_device_event(struct notifier_block 
*unused,
struct macvlan_port *port;
LIST_HEAD(list_kill);
 
-   if (!macvlan_port_exists(dev))
+   if (!macvlan_port_exists(dev) && !is_macvlan(dev))
return NOTIFY_DONE;
 
+   if (is_macvlan(dev)) {
+   vlan = netdev_priv(dev);
+
+   switch (event) {
+   case NETDEV_UP:
+   case NETDEV_DOWN:
+   case NETDEV_CHANGE:
+   netif_stacked_transfer_operstate(vlan->lowerdev,
+vlan->dev);
+   macvlan_set_operstate(vlan->lowerdev, vlan->dev);
+   break;
+   }
+
+   return NOTIFY_DONE;
+   }
+
port = macvlan_port_get_rtnl(dev);
 
switch (event) {
case NETDEV_UP:
+   case NETDEV_DOWN:
case NETDEV_CHANGE:
-   list_for_each_entry(vlan, &port->vlans, list)
+   list_for_each_entry(vlan, &port->vlans, list) {
netif_stacked_transfer_operstate(vlan->lowerdev,
 vlan->dev);
+   macvlan_set_operstate(vlan->lowerdev, vlan->dev);
+   }
break;
case NETDEV_FEAT_CHANGE:
list_for_each_entry(vlan, &port->vlans, list) {
-- 
2.8.0



[PATCH 2/3] bonding: don't request extraneous rlb updates

2016-04-06 Thread Debabrata Banerjee
Don't attempt to send rlb updates for incomplete entries, which can't be
sent anyway.

Signed-off-by: Debabrata Banerjee 

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1b45378..b7c7027 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -424,7 +424,8 @@ static void rlb_clear_slave(struct bonding *bond, struct 
slave *slave)
if (assigned_slave) {
rx_hash_table[index].slave = assigned_slave;
if 
(!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-mac_bcast)) {
+mac_bcast) &&
+   
!is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
bond_info->rx_hashtbl[index].ntt = 1;
bond_info->rx_ntt = 1;
/* A slave has been removed from the
@@ -527,7 +528,8 @@ static void rlb_req_update_slave_clients(struct bonding 
*bond, struct slave *sla
client_info = &(bond_info->rx_hashtbl[hash_index]);
 
if ((client_info->slave == slave) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
ntt = 1;
}
@@ -568,7 +570,8 @@ static void rlb_req_update_subnet_clients(struct bonding 
*bond, __be32 src_ip)
if ((client_info->ip_src == src_ip) &&
!ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 bond->dev->dev_addr) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond_info->rx_ntt = 1;
}
@@ -644,7 +647,8 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
ether_addr_copy(client_info->mac_src, arp->mac_src);
client_info->slave = assigned_slave;
 
-   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond->alb_info.rx_ntt = 1;
} else {
@@ -735,8 +739,10 @@ static void rlb_rebalance(struct bonding *bond)
assigned_slave = __rlb_next_rx_slave(bond);
if (assigned_slave && (client_info->slave != assigned_slave)) {
client_info->slave = assigned_slave;
-   client_info->ntt = 1;
-   ntt = 1;
+   if (!is_zero_ether_addr(client_info->mac_dst)) {
+   client_info->ntt = 1;
+   ntt = 1;
+   }
}
}
 
-- 
2.8.0



[PATCH 3/3] bonding: use common broadcast addr checks

2016-04-06 Thread Debabrata Banerjee
Replace homegrown broadcast checks with faster defs from etherdevice.h

Signed-off-by: Debabrata Banerjee 

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index b7c7027..27238f3 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -45,9 +45,6 @@
 #ifndef __long_aligned
 #define __long_aligned __attribute__((aligned((sizeof(long)
 #endif
-static const u8 mac_bcast[ETH_ALEN] __long_aligned = {
-   0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
 static const u8 mac_v6_allmcast[ETH_ALEN] __long_aligned = {
0x33, 0x33, 0x00, 0x00, 0x00, 0x01
 };
@@ -423,8 +420,8 @@ static void rlb_clear_slave(struct bonding *bond, struct 
slave *slave)
 
if (assigned_slave) {
rx_hash_table[index].slave = assigned_slave;
-   if 
(!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-mac_bcast) &&
+
+   if 
(!is_broadcast_ether_addr(rx_hash_table[index].mac_dst) &&

!is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
bond_info->rx_hashtbl[index].ntt = 1;
bond_info->rx_ntt = 1;
@@ -528,7 +525,7 @@ static void rlb_req_update_slave_clients(struct bonding 
*bond, struct slave *sla
client_info = &(bond_info->rx_hashtbl[hash_index]);
 
if ((client_info->slave == slave) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_broadcast_ether_addr(client_info->mac_dst) &&
!is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
ntt = 1;
@@ -570,7 +567,7 @@ static void rlb_req_update_subnet_clients(struct bonding 
*bond, __be32 src_ip)
if ((client_info->ip_src == src_ip) &&
!ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 bond->dev->dev_addr) &&
-   !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   !is_broadcast_ether_addr(client_info->mac_dst) &&
!is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond_info->rx_ntt = 1;
@@ -599,7 +596,7 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
if ((client_info->ip_src == arp->ip_src) &&
(client_info->ip_dst == arp->ip_dst)) {
/* the entry is already assigned to this client */
-   if (!ether_addr_equal_64bits(arp->mac_dst, mac_bcast)) {
+   if (!is_broadcast_ether_addr(arp->mac_dst)) {
/* update mac address from arp */
ether_addr_copy(client_info->mac_dst, 
arp->mac_dst);
}
@@ -647,7 +644,7 @@ static struct slave *rlb_choose_channel(struct sk_buff 
*skb, struct bonding *bon
ether_addr_copy(client_info->mac_src, arp->mac_src);
client_info->slave = assigned_slave;
 
-   if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+   if (!is_broadcast_ether_addr(client_info->mac_dst) &&
!is_zero_ether_addr(client_info->mac_dst)) {
client_info->ntt = 1;
bond->alb_info.rx_ntt = 1;
@@ -1386,7 +1383,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
case ETH_P_IP: {
const struct iphdr *iph = ip_hdr(skb);
 
-   if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast) ||
+   if (is_broadcast_ether_addr(eth_data->h_dest) ||
(iph->daddr == ip_bcast) ||
(iph->protocol == IPPROTO_IGMP)) {
do_tx_balance = false;
@@ -1400,7 +1397,7 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device 
*bond_dev)
/* IPv6 doesn't really use broadcast mac address, but leave
 * that here just in case.
 */
-   if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast)) {
+   if (is_broadcast_ether_addr(eth_data->h_dest)) {
do_tx_balance = false;
break;
}
-- 
2.8.0



[PATCH 1/3] bonding: do not allow rlb updates to invalid mac

2016-04-06 Thread Debabrata Banerjee
Make sure multicast, broadcast, and zero MAC addresses cannot be the output
of rlb updates, which should all be directed ARPs. Receive load balancing
will collapse if any of these happen, as the switch will broadcast.

Signed-off-by: Debabrata Banerjee 

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c5ac160..1b45378 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -453,7 +453,7 @@ static void rlb_update_client(struct rlb_client_info 
*client_info)
 {
int i;
 
-   if (!client_info->slave)
+   if (!client_info->slave || !is_valid_ether_addr(client_info->mac_dst))
return;
 
for (i = 0; i < RLB_ARP_BURST_SIZE; i++) {
-- 
2.8.0



Re: [PATCH net-next v4 1/2] fix return of iptunnel_xmit

2015-10-09 Thread Debabrata Banerjee
Andreas, I think we need to use the net_xmit defines so the errors are
masked properly, how about:

-   if (unlikely(net_xmit_eval(err)))
-   pkt_len = 0;
-   return pkt_len;
+   if (likely(net_xmit_eval(err) == 0))
+   return pkt_len;
+   else
+   return net_xmit_errno(err);
+
+   return 0;

On Fri, Oct 9, 2015 at 5:27 AM, Andreas Schultz  wrote:
> All users of iptunnel_xmit expect the return value to be the packet
> length on success (>0), negative for a tx error and zero for a tx
> dropped error. In cset 0e6fbc5b6c6218987c93b8c7ca60cf786062899d the
> negative return case was lost.
>
> This bug was introduced when the ip_tunnel_core code was refactored.
>
> Fixes: 0e6fbc5b6c6218987c93b8c7ca60cf786062899d
> Signed-off-by: Andreas Schultz 
> Acked-by: Jiri Benc 
> Acked-by: Pravin B Shelar 
> ---
> Change in v2:
>  - remove unused variable pkt_len
>
> Change in v3:
>  - reworked based on comment from Jiri Benc
>
> Change in v4:
>  - rebased to net-next to avoid merge conflicts
>  - added Acked-By from Jiri Benc and Pravin B Shelar
>
> ---
>  net/ipv4/ip_tunnel_core.c | 9 ++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index 6cb9009..453d569 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -80,9 +80,12 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, 
> struct sk_buff *skb,
> __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
>
> err = ip_local_out(net, sk, skb);
> -   if (unlikely(net_xmit_eval(err)))
> -   pkt_len = 0;
> -   return pkt_len;
> +   if (likely(net_xmit_eval(err) == 0))
> +   return pkt_len;
> +   if (err < 0)
> +   return err;
> +
> +   return 0;
>  }
>  EXPORT_SYMBOL_GPL(iptunnel_xmit);
>
> --
> 2.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] net: use atomic allocation for order-3 page allocation

2015-06-11 Thread Debabrata Banerjee
There is no "background"; it doesn't matter whether this activity happens
synchronously or asynchronously, unless you're sensitive to the
latency of that single operation. If you're driving all your CPUs and
memory hard, then this is work that still takes resources. If there's a
kernel thread running compaction, then obviously your process is
not.

Your patch should help in that not every atomic allocation failure
should mean yet another run at compaction/reclaim.

-Deb

On Thu, Jun 11, 2015 at 5:16 PM, Chris Mason  wrote:

> networking is asking for 32KB, and the MM layer is doing what it can to
> provide it.  Are the gains from getting 32KB contig bigger than the cost
> of moving pages around if the MM has to actually go into compaction?
> Should we start disk IO to give back 32KB contig?
>
> I think we want to tell the MM to compact in the background and give
> networking 32KB if it happens to have it available.  If not, fall back
> to smaller allocations without doing anything expensive.
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] net: use atomic allocation for order-3 page allocation

2015-06-11 Thread Debabrata Banerjee
Resend in plaintext, thanks gmail:

It's a somewhat intractable problem to know if compaction will succeed
without trying it, and you can certainly end up in a state where memory is
heavily fragmented, even with compaction running. You can't compact kernel
pages, for example, so you can end up in a state where compaction does
nothing through no fault of its own.

In this case you waste time in compaction routines, then end up reclaiming
precious page cache pages or swapping out memory from whatever your machine
was trying to do, in order to satisfy these order-3 allocations, after which
all those pages need to be restored from disk almost immediately. This is
not a happy server. Any mm fix may be years away. The only simple solution I can
think of is specifically caching these allocations, in any other case under
memory pressure they will be split by other smaller allocations.

We've been forcing these allocations to order-0 internally until we can
think of something else.

-Deb


> On Thu, Jun 11, 2015 at 4:48 PM, Eric Dumazet 
> wrote:
>>
>> On Thu, 2015-06-11 at 13:24 -0700, Shaohua Li wrote:
>> > We saw excessive memory compaction triggered by skb_page_frag_refill.
>> > This causes performance issues. Commit 5640f7685831e0 introduces the
>> > order-3 allocation to improve performance. But memory compaction has
>> > high overhead. The benefit of order-3 allocation can't compensate the
>> > overhead of memory compaction.
>> >
>> > This patch makes the order-3 page allocation atomic. If there is no
>> > memory pressure and memory isn't fragmented, the alloction will still
>> > success, so we don't sacrifice the order-3 benefit here. If the atomic
>> > allocation fails, compaction will not be triggered and we will fallback
>> > to order-0 immediately.
>> >
>> > The mellanox driver does similar thing, if this is accepted, we must fix
>> > the driver too.
>> >
>> > Cc: Eric Dumazet 
>> > Signed-off-by: Shaohua Li 
>> > ---
>> >  net/core/sock.c | 2 +-
>> >  1 file changed, 1 insertion(+), 1 deletion(-)
>> >
>> > diff --git a/net/core/sock.c b/net/core/sock.c
>> > index 292f422..e9855a4 100644
>> > --- a/net/core/sock.c
>> > +++ b/net/core/sock.c
>> > @@ -1883,7 +1883,7 @@ bool skb_page_frag_refill(unsigned int sz, struct
>> > page_frag *pfrag, gfp_t gfp)
>> >
>> >   pfrag->offset = 0;
>> >   if (SKB_FRAG_PAGE_ORDER) {
>> > - pfrag->page = alloc_pages(gfp | __GFP_COMP |
>> > + pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP
>> > |
>> > __GFP_NOWARN | __GFP_NORETRY,
>> > SKB_FRAG_PAGE_ORDER);
>> >   if (likely(pfrag->page)) {
>>
>> This is not a specific networking issue, but mm one.
>>
>> You really need to start a discussion with mm experts.
>>
>> Your changelog does not exactly explains what _is_ the problem.
>>
>> If the problem lies in mm layer, it might be time to fix it, instead of
>> work around the bug by never triggering it from this particular point,
>> which is a safe point where a process is willing to wait a bit.
>>
>> Memory compaction is either working as intending, or not.
>>
>> If we enabled it but never run it because it hurts, what is the point
>> enabling it ?
>>
>>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html