Re: [PATCHv2 net-next 4/6] sctp: add the sctp_diag.c file

2016-04-08 Thread Eric Dumazet
On Sat, 2016-04-09 at 12:53 +0800, Xin Long wrote:
> This one will implement all the interface of inet_diag, inet_diag_handler.
> which includes sctp_diag_dump, sctp_diag_dump_one and sctp_diag_get_info.


> +static int inet_assoc_diag_fill(struct sock *sk,
> + struct sctp_association *asoc,
> + struct sk_buff *skb,
> + const struct inet_diag_req_v2 *req,
> + struct user_namespace *user_ns,
> + int portid, u32 seq, u16 nlmsg_flags,
> + const struct nlmsghdr *unlh)
> +{
> + const struct inet_sock *inet = inet_sk(sk);
> + const struct inet_diag_handler *handler;
> + int ext = req->idiag_ext;
> + struct inet_diag_msg *r;
> + struct nlmsghdr  *nlh;
> + struct nlattr *attr;
> + void *info = NULL;
> + union sctp_addr laddr, paddr;
> + struct dst_entry *dst;
> + struct sctp_infox infox;
> +
> + handler = inet_diag_get_handler(req->sdiag_protocol);
> + BUG_ON(!handler);
> +
> + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
> + nlmsg_flags);
> + if (!nlh)
> + return -EMSGSIZE;
> +
> + r = nlmsg_data(nlh);
> + BUG_ON(!sk_fullsock(sk));
> +
> + laddr = list_entry(asoc->base.bind_addr.address_list.next,
> +struct sctp_sockaddr_entry, list)->a;
> + paddr = asoc->peer.primary_path->ipaddr;
> + dst = asoc->peer.primary_path->dst;
> +
> + r->idiag_family = sk->sk_family;
> + r->id.idiag_sport = htons(asoc->base.bind_addr.port);
> + r->id.idiag_dport = htons(asoc->peer.port);
> + r->id.idiag_if = dst ? dst->dev->ifindex : 0;
> + sock_diag_save_cookie(sk, r->id.idiag_cookie);
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> + if (sk->sk_family == AF_INET6) {
> + *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr;
> + *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr;
> + } else
> +#endif
> + {
> + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
> + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
> +
> + r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr;
> + r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr;
> + }
> +
> + r->idiag_state = asoc->state;
> + r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
> + r->idiag_retrans = asoc->rtx_data_chunks;
> +#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
> + r->idiag_expires =
> + EXPIRES_IN_MS(asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX]);
> +#undef EXPIRES_IN_MS
> +
> + if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
> + goto errout;
> +
> + /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
> +  * hence this needs to be included regardless of socket family.
> +  */
> + if (ext & (1 << (INET_DIAG_TOS - 1)))
> + if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
> + goto errout;
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> + if (r->idiag_family == AF_INET6) {
> + if (ext & (1 << (INET_DIAG_TCLASS - 1)))
> + if (nla_put_u8(skb, INET_DIAG_TCLASS,
> +inet6_sk(sk)->tclass) < 0)
> + goto errout;
> +
> + if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
> + nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
> + goto errout;
> + }
> +#endif
> +
> + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
> + r->idiag_inode = sock_i_ino(sk);
> +
> + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
> + struct inet_diag_meminfo minfo = {
> + .idiag_rmem = sk_rmem_alloc_get(sk),
> + .idiag_wmem = sk->sk_wmem_queued,
> + .idiag_fmem = sk->sk_forward_alloc,
> + .idiag_tmem = sk_wmem_alloc_get(sk),
> + };
> +

All this code looks familiar.

Why inet_sk_diag_fill() is not used instead ?

> + if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
> + goto errout;
> + }
> +
> + if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
> + if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
> + goto errout;
> +
> + if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
> + attr = nla_reserve(skb, INET_DIAG_INFO,
> +handler->idiag_info_size);
> + if (!attr)
> + goto errout;
> +
> + info = nla_data(attr);
> + }
> + infox.sctpinfo = (struct sctp_info *)info;
> + infox.asoc = asoc;
> + handler->idiag_get_info(sk, r, &infox);
> +
> + if (ext & (1 << (INET_DIAG_CONG - 1)))
> + if (nla_put_string(skb, 

[PATCH net-next] net: bcmgenet: use __napi_schedule_irqoff()

2016-04-08 Thread Eric Dumazet
From: Florian Fainelli 

bcmgenet_isr1() and bcmgenet_isr0() run in hard irq context,
we do not need to block irq again.

Signed-off-by: Florian Fainelli 
Signed-off-by: Eric Dumazet 
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c 
b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index f7b42b9fc979..4367d561a12e 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2493,7 +2493,7 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id)
 
if (likely(napi_schedule_prep(&rx_ring->napi))) {
rx_ring->int_disable(rx_ring);
-   __napi_schedule(&rx_ring->napi);
+   __napi_schedule_irqoff(&rx_ring->napi);
}
}
 
@@ -2506,7 +2506,7 @@ static irqreturn_t bcmgenet_isr1(int irq, void *dev_id)
 
if (likely(napi_schedule_prep(&tx_ring->napi))) {
tx_ring->int_disable(tx_ring);
-   __napi_schedule(&tx_ring->napi);
+   __napi_schedule_irqoff(&tx_ring->napi);
}
}
 
@@ -2536,7 +2536,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 
if (likely(napi_schedule_prep(&rx_ring->napi))) {
rx_ring->int_disable(rx_ring);
-   __napi_schedule(&rx_ring->napi);
+   __napi_schedule_irqoff(&rx_ring->napi);
}
}
 
@@ -2545,7 +2545,7 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 
if (likely(napi_schedule_prep(&tx_ring->napi))) {
tx_ring->int_disable(tx_ring);
-   __napi_schedule(&tx_ring->napi);
+   __napi_schedule_irqoff(&tx_ring->napi);
}
}
 




Re: [PATCH net-next] net: bcmgenet: use napi_complete_done()

2016-04-08 Thread Eric Dumazet
On Fri, 2016-04-08 at 22:19 -0700, Florian Fainelli wrote:

> Along the same line of changes, we could use napi_schedule_irqoff since NAPI 
> is always scheduled from ISR context.

Good point, I'll cook the patch ;)

Thanks !




Re: [PATCH net-next] net: bcmgenet: use napi_complete_done()

2016-04-08 Thread Florian Fainelli
On April 8, 2016 10:06:40 PM PDT, Eric Dumazet  wrote:
>From: Eric Dumazet 
>
>By using napi_complete_done(), we allow fine tuning
>of /sys/class/net/ethX/gro_flush_timeout for higher GRO aggregation
>efficiency for a Gbit NIC.
>
>Check commit 24d2e4a50737 ("tg3: use napi_complete_done()") for
>details.
>
>Signed-off-by: Eric Dumazet 
>Cc: Petri Gynther 
>Cc: Florian Fainelli 

Acked-by: Florian Fainelli 

Along the same line of changes, we could use napi_schedule_irqoff since NAPI is 
always scheduled from ISR context.


>---
> drivers/net/ethernet/broadcom/genet/bcmgenet.c |2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
>b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
>index f7b42b9fc979..e823013d3125 100644
>--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
>+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
>@@ -1735,7 +1735,7 @@ static int bcmgenet_rx_poll(struct napi_struct
>*napi, int budget)
>   work_done = bcmgenet_desc_rx(ring, budget);
> 
>   if (work_done < budget) {
>-  napi_complete(napi);
>+  napi_complete_done(napi, work_done);
>   ring->int_enable(ring);
>   }
> 


-- 
Florian


Re: [PATCHv2 net-next 1/6] sctp: add sctp_info dump api for sctp_diag

2016-04-08 Thread Eric Dumazet
On Sat, 2016-04-09 at 12:53 +0800, Xin Long wrote:
> sctp_diag will dump some important details of sctp's assoc or ep, we use
> sctp_info to describe them,  sctp_get_sctp_info to get them, and export
> it to sctp_diag.ko.
> 


> +int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
> +struct sctp_info *info)
> +{
> + struct sctp_transport *prim;
> + struct list_head *pos, *temp;
> + int mask;
> +
> + memset(info, 0, sizeof(*info));
> + if (!asoc) {
> + struct sctp_sock *sp = sctp_sk(sk);
> +
> + info->sctpi_s_autoclose = sp->autoclose;
> + info->sctpi_s_adaptation_ind = sp->adaptation_ind;
> + info->sctpi_s_pd_point = sp->pd_point;
> + info->sctpi_s_nodelay = sp->nodelay;
> + info->sctpi_s_disable_fragments = sp->disable_fragments;
> + info->sctpi_s_v4mapped = sp->v4mapped;
> + info->sctpi_s_frag_interleave = sp->frag_interleave;
> +
> + return 0;
> + }
> +
> + info->sctpi_tag = asoc->c.my_vtag;
> + info->sctpi_state = asoc->state;
> + info->sctpi_rwnd = asoc->a_rwnd;
> + info->sctpi_unackdata = asoc->unack_data;
> + info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
> + info->sctpi_instrms = asoc->c.sinit_max_instreams;
> + info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
> + list_for_each_safe(pos, temp, &asoc->base.inqueue.in_chunk_list)
> + info->sctpi_inqueue++;
> + list_for_each_safe(pos, temp, &asoc->outqueue.out_chunk_list)
> + info->sctpi_outqueue++;

Is this safe ?

Do you own the lock on socket or whatever lock protecting this list ?


> + info->sctpi_overall_error = asoc->overall_error_count;
> + info->sctpi_max_burst = asoc->max_burst;
> + info->sctpi_maxseg = asoc->frag_point;
> + info->sctpi_peer_rwnd = asoc->peer.rwnd;
> + info->sctpi_peer_tag = asoc->c.peer_vtag;
> +
> + mask = asoc->peer.ecn_capable << 1;
> + mask = (mask | asoc->peer.ipv4_address) << 1;
> + mask = (mask | asoc->peer.ipv6_address) << 1;
> + mask = (mask | asoc->peer.hostname_address) << 1;
> + mask = (mask | asoc->peer.asconf_capable) << 1;
> + mask = (mask | asoc->peer.prsctp_capable) << 1;
> + mask = (mask | asoc->peer.auth_capable);
> + info->sctpi_peer_capable = mask;
> + mask = asoc->peer.sack_needed << 1;
> + mask = (mask | asoc->peer.sack_generation) << 1;
> + mask = (mask | asoc->peer.zero_window_announced);
> + info->sctpi_peer_sack = mask;
> +
> + info->sctpi_isacks = asoc->stats.isacks;
> + info->sctpi_osacks = asoc->stats.osacks;
> + info->sctpi_opackets = asoc->stats.opackets;
> + info->sctpi_ipackets = asoc->stats.ipackets;
> + info->sctpi_rtxchunks = asoc->stats.rtxchunks;
> + info->sctpi_outofseqtsns = asoc->stats.outofseqtsns;
> + info->sctpi_idupchunks = asoc->stats.idupchunks;
> + info->sctpi_gapcnt = asoc->stats.gapcnt;
> + info->sctpi_ouodchunks = asoc->stats.ouodchunks;
> + info->sctpi_iuodchunks = asoc->stats.iuodchunks;
> + info->sctpi_oodchunks = asoc->stats.oodchunks;
> + info->sctpi_iodchunks = asoc->stats.iodchunks;
> + info->sctpi_octrlchunks = asoc->stats.octrlchunks;
> + info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
> +
> + prim = asoc->peer.primary_path;
> + memcpy(&info->sctpi_p_address, &prim->ipaddr,
> +sizeof(struct sockaddr_storage));
> + info->sctpi_p_state = prim->state;
> + info->sctpi_p_cwnd = prim->cwnd;
> + info->sctpi_p_srtt = prim->srtt;
> + info->sctpi_p_rto = jiffies_to_msecs(prim->rto);
> + info->sctpi_p_hbinterval = prim->hbinterval;
> + info->sctpi_p_pathmaxrxt = prim->pathmaxrxt;
> + info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay);
> + info->sctpi_p_ssthresh = prim->ssthresh;
> + info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked;
> + info->sctpi_p_flight_size = prim->flight_size;
> + info->sctpi_p_error = prim->error_count;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(sctp_get_sctp_info);

info is not guaranteed to be aligned on 8 bytes.

You need to use put_unaligned()

Check commit ff5d749772018 ("tcp: beware of alignments in
tcp_get_info()") for details.





Re: [PATCHv2 net-next 1/6] sctp: add sctp_info dump api for sctp_diag

2016-04-08 Thread Eric Dumazet
On Sat, 2016-04-09 at 12:53 +0800, Xin Long wrote:
> sctp_diag will dump some important details of sctp's assoc or ep, we use
> sctp_info to describe them,  sctp_get_sctp_info to get them, and export
> it to sctp_diag.ko.
> 
> Signed-off-by: Xin Long 
> ---
>  include/linux/sctp.h| 65 +
>  include/net/sctp/sctp.h |  3 ++
>  net/sctp/socket.c   | 86 
> +
>  3 files changed, 154 insertions(+)
> 
> diff --git a/include/linux/sctp.h b/include/linux/sctp.h
> index a9414fd..a448ebc 100644
> --- a/include/linux/sctp.h
> +++ b/include/linux/sctp.h
> @@ -705,4 +705,69 @@ typedef struct sctp_auth_chunk {
>   sctp_authhdr_t auth_hdr;
>  } __packed sctp_auth_chunk_t;
>  
> +struct sctp_info {
> + __u32   sctpi_tag;
> + __u32   sctpi_state;
> + __u32   sctpi_rwnd;
> + __u16   sctpi_unackdata;
> + __u16   sctpi_penddata;
> + __u16   sctpi_instrms;
> + __u16   sctpi_outstrms;
> + __u32   sctpi_fragmentation_point;
> + __u32   sctpi_inqueue;
> + __u32   sctpi_outqueue;
> + __u32   sctpi_overall_error;
> + __u32   sctpi_max_burst;
> + __u32   sctpi_maxseg;
> + __u32   sctpi_peer_rwnd;
> + __u32   sctpi_peer_tag;
> + __u8sctpi_peer_capable;
> + __u8sctpi_peer_sack;
> +
> + /* assoc status info */
> + __u64   sctpi_isacks;
> + __u64   sctpi_osacks;
> + __u64   sctpi_opackets;
> + __u64   sctpi_ipackets;
> + __u64   sctpi_rtxchunks;
> + __u64   sctpi_outofseqtsns;
> + __u64   sctpi_idupchunks;
> + __u64   sctpi_gapcnt;
> + __u64   sctpi_ouodchunks;
> + __u64   sctpi_iuodchunks;
> + __u64   sctpi_oodchunks;
> + __u64   sctpi_iodchunks;
> + __u64   sctpi_octrlchunks;
> + __u64   sctpi_ictrlchunks;
> +
> + /* primary transport info */
> + struct sockaddr_storage sctpi_p_address;
> + __s32   sctpi_p_state;
> + __u32   sctpi_p_cwnd;
> + __u32   sctpi_p_srtt;
> + __u32   sctpi_p_rto;
> + __u32   sctpi_p_hbinterval;
> + __u32   sctpi_p_pathmaxrxt;
> + __u32   sctpi_p_sackdelay;
> + __u32   sctpi_p_sackfreq;
> + __u32   sctpi_p_ssthresh;
> + __u32   sctpi_p_partial_bytes_acked;
> + __u32   sctpi_p_flight_size;
> + __u16   sctpi_p_error;
> +
> + /* sctp sock info */
> + __u32   sctpi_s_autoclose;
> + __u32   sctpi_s_adaptation_ind;
> + __u32   sctpi_s_pd_point;
> + __u8sctpi_s_nodelay;
> + __u8sctpi_s_disable_fragments;
> + __u8sctpi_s_v4mapped;
> + __u8sctpi_s_frag_interleave;
> +};
> +

Lots of holes in this structure...




[GIT] Networking

2016-04-08 Thread David Miller

1) Stale SKB data pointer access across pskb_may_pull() calls in L2TP,
   from Haishuang Yan.

2) Fix multicast frame handling in mac80211 AP code, from Felix
   Fietkau.

3) mac80211 station hashtable insert errors not handled properly, fix
   from Johannes Berg.

4) Fix TX descriptor count limit handling in e1000, from Alexander Duyck.

5) Revert a buggy netdev refcount fix in netpoll, from Bjorn Helgaas.

6) Must assign rtnl_link_ops of the device before registering it,
   fix in ip6_tunnel from Thadeu Lima de Souza Cascardo.

7) Memory leak fix in tc action net exit, from WANG Cong.

8) Add missing AF_KCM entries to name tables, from Dexuan Cui.

9) Fix regression in GRE handling of csums wrt. FOU, from Alexander
   Duyck.

10) Fix memory allocation alignment and congestion map corruption in
RDS, from Shamir Rabinovitch.

11) Fix default qdisc regression in tuntap driver, from Jason Wang.

Please pull, thanks a lot!

The following changes since commit 05cf8077e54b20dddb756eaa26f3aeb5c38dd3cf:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2016-04-01 
20:03:33 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to 30d237a6c2e9be1bb816fe8e787b88fd7aad833b:

  Merge tag 'mac80211-for-davem-2016-04-06' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211 (2016-04-08 
16:41:28 -0400)


Alexander Duyck (3):
  e1000: Do not overestimate descriptor counts in Tx pre-check
  e1000: Double Tx descriptors needed check for 82544
  GRE: Disable segmentation offloads w/ CSUM and we are encapsulated via FOU

Arik Nemtsov (3):
  mac80211: TDLS: always downgrade invalid chandefs
  mac80211: TDLS: change BW calculation for WIDER_BW peers
  mac80211: recalc min_def chanctx even when chandef is identical

Bastien Philbert (1):
  bridge: Fix incorrect variable assignment on error path in br_sysfs_addbr

Ben Greear (1):
  mac80211: ensure no limits on station rhashtable

Bjorn Helgaas (1):
  Revert "netpoll: Fix extra refcount release in netpoll_cleanup()"

Dave Jones (1):
  af_packet: tone down the Tx-ring unsupported spew.

David S. Miller (3):
  Merge branch 'master' of git://git.kernel.org/.../jkirsher/net-queue
  Revert "bridge: Fix incorrect variable assignment on error path in 
br_sysfs_addbr"
  Merge tag 'mac80211-for-davem-2016-04-06' of 
git://git.kernel.org/.../jberg/mac80211

Dexuan Cui (1):
  net: add the AF_KCM entries to family name tables

Emmanuel Grumbach (2):
  mac80211: don't send deferred frames outside the SP
  mac80211: close the SP when we enqueue frames during the SP

Felix Fietkau (1):
  mac80211: fix AP buffered multicast frames with queue control and txq

Giuseppe CAVALLARO (1):
  stmmac: fix adjust link call in case of a switch is attached

Haishuang Yan (2):
  ipv4: l2tp: fix a potential issue in l2tp_ip_recv
  ipv6: l2tp: fix a potential issue in l2tp_ip6_recv

Hariprasad Shenai (1):
  cxgb4: Add pci device id for chelsio t520-cr adapter

Ilan Peer (1):
  mac80211: Fix BW upgrade for TDLS peers

Jakub Sitnicki (1):
  ipv6: Count in extension headers in skb->network_header

Jason Wang (1):
  tuntap: restore default qdisc

Jeff Mahoney (1):
  mac80211: fix "warning: ‘target_metric’ may be used uninitialized"

Jesse Brandeburg (1):
  i40e: fix errant PCIe bandwidth message

Jiri Benc (1):
  MAINTAINERS: intel-wired-lan list is moderated

Johannes Berg (1):
  mac80211: properly deal with station hashtable insert errors

Jorgen Hansen (1):
  VSOCK: Detach QP check should filter out non matching QPs.

Luis de Bethencourt (2):
  mac80211: add doc for RX_FLAG_DUP_VALIDATED flag
  mac80211: remove description of dropped member

Marcelo Ricardo Leitner (2):
  sctp: flush if we can't fit another DATA chunk
  sctp: use list_* in sctp_list_dequeue

Naveen N. Rao (7):
  samples/bpf: Fix build breakage with map_perf_test_user.c
  samples/bpf: Use llc in PATH, rather than a hardcoded value
  samples/bpf: Enable powerpc support
  lib/test_bpf: Fix JMP_JSET tests
  lib/test_bpf: Add tests for unsigned BPF_JGT
  lib/test_bpf: Add test to check for result of 32-bit add that overflows
  lib/test_bpf: Add additional BPF_ADD tests

Roopa Prabhu (1):
  mpls: find_outdev: check for err ptr in addition to NULL check

Thadeu Lima de Souza Cascardo (1):
  ip6_tunnel: set rtnl_link_ops before calling register_netdevice

WANG Cong (1):
  net_sched: fix a memory leak in tc action

shamir rabinovitch (2):
  RDS: memory allocated must be align to 8
  RDS: fix congestion map corruption for PAGE_SIZE > 4k

stephen hemminger (1):
  bridge, netem: mark mailing lists as moderated

 MAINTAINERS|   6 +-
 

[PATCH net-next] net: bcmgenet: use napi_complete_done()

2016-04-08 Thread Eric Dumazet
From: Eric Dumazet 

By using napi_complete_done(), we allow fine tuning
of /sys/class/net/ethX/gro_flush_timeout for higher GRO aggregation
efficiency for a Gbit NIC.

Check commit 24d2e4a50737 ("tg3: use napi_complete_done()") for details.

Signed-off-by: Eric Dumazet 
Cc: Petri Gynther 
Cc: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c 
b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index f7b42b9fc979..e823013d3125 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1735,7 +1735,7 @@ static int bcmgenet_rx_poll(struct napi_struct *napi, int 
budget)
work_done = bcmgenet_desc_rx(ring, budget);
 
if (work_done < budget) {
-   napi_complete(napi);
+   napi_complete_done(napi, work_done);
ring->int_enable(ring);
}
 




Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread Eric Dumazet
On Fri, 2016-04-08 at 21:13 -0700, Petri Gynther wrote:

> What values does the networking core program into BQL dynamic limits
> that my code in netdev->ndo_open() would wipe out?
> 

0 and 0

Clearing again these values by 0 and 0 is defensive programming.

As I said, no BQL enabled driver does that, and we do not want various
drivers implementing BQL in various ways.

Having the same logic is easier for code review and maintenance.

This was proven to work for many years.

> You mentioned the queue init path:
> netdev_init_one_queue() -> dql_init() -> dql_reset()
> 
> that is called when the netdev is created and Tx queues allocated.
> 
> But, does the networking core somewhere set *different* values for BQL
> dynamic limits than what dql_reset() did, before opening the device?
> 
> > For example, tg3 calls netdev_tx_reset_queue() only when freeing tx
> > rings, as it might have freed skb(s) not from normal TX complete path
> > and thus missed appropriate dql_completed().
> >
> 
> Looking at the tg3 driver, it calls:
> tg3_stop()
>   tg3_free_rings()
> netdev_tx_reset_queue()
> 
> netdev_tx_reset_queue() is called unconditionally, as long as the Tx
> ring exists. So "ip link set dev eth down" would cause it to be
> called.
> 
> Why is it OK to call netdev_tx_reset_queue() from the
> netdev->ndo_stop() path, but not from netdev->ndo_open() path?

Because we properly init BQL state when a device is created in core
networking stack. So that we do not have to copy the same code over and
over in 100 drivers. This is called code factorization.


Put these calls in bcmgenet_fini_dma(), to follow the BQL model used in
all other drivers.

Thanks.




[PATCHv2 net-next 6/6] sctp: fix some rhashtable functions using in sctp proc/diag

2016-04-08 Thread Xin Long
When rhashtable_walk_init return err, no release function should be
called, and when rhashtable_walk_start return err, we should only invoke
rhashtable_walk_exit to release the source.

But now when sctp_transport_walk_start return err, we just call
rhashtable_walk_stop/exit, and never care about if rhashtable_walk_init
or start return err, which is so bad.

We will fix it by calling rhashtable_walk_exit if rhashtable_walk_start
return err in sctp_transport_walk_start, and if sctp_transport_walk_start
return err, we do not need to call sctp_transport_walk_stop any more.

For sctp proc, we will use 'iter->start_fail' to decide if we will call
rhashtable_walk_stop/exit.

Signed-off-by: Xin Long 
---
 net/sctp/proc.c   |  7 ++-
 net/sctp/socket.c | 15 ++-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 9fe1393..4cb5aed 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -280,6 +280,7 @@ void sctp_eps_proc_exit(struct net *net)
 struct sctp_ht_iter {
struct seq_net_private p;
struct rhashtable_iter hti;
+   int start_fail;
 };
 
 static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
@@ -287,8 +288,10 @@ static void *sctp_transport_seq_start(struct seq_file 
*seq, loff_t *pos)
struct sctp_ht_iter *iter = seq->private;
int err = sctp_transport_walk_start(&iter->hti);
 
-   if (err)
+   if (err) {
+   iter->start_fail = 1;
return ERR_PTR(err);
+   }
 
return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
@@ -297,6 +300,8 @@ static void sctp_transport_seq_stop(struct seq_file *seq, 
void *v)
 {
struct sctp_ht_iter *iter = seq->private;
 
+   if (iter->start_fail)
+   return;
sctp_transport_walk_stop(&iter->hti);
 }
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b0bf6c7..473a40c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4298,8 +4298,12 @@ int sctp_transport_walk_start(struct rhashtable_iter 
*iter)
return err;
 
err = rhashtable_walk_start(iter);
+   if (err && err != -EAGAIN) {
+   rhashtable_walk_exit(iter);
+   return err;
+   }
 
-   return err == -EAGAIN ? 0 : err;
+   return 0;
 }
 
 void sctp_transport_walk_stop(struct rhashtable_iter *iter)
@@ -4388,11 +4392,12 @@ EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);
 int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
struct net *net, int pos, void *p) {
struct rhashtable_iter hti;
-   int err = 0;
void *obj;
+   int err;
 
-   if (sctp_transport_walk_start(&hti))
-   goto out;
+   err = sctp_transport_walk_start(&hti);
+   if (err)
+   return err;
 
sctp_transport_get_idx(net, &hti, pos);
obj = sctp_transport_get_next(net, &hti);
@@ -4406,8 +4411,8 @@ int sctp_for_each_transport(int (*cb)(struct 
sctp_transport *, void *),
if (err)
break;
}
-out:
sctp_transport_walk_stop(&hti);
+
return err;
 }
 EXPORT_SYMBOL_GPL(sctp_for_each_transport);
-- 
2.1.0



[PATCHv2 net-next 3/6] sctp: export some functions for sctp_diag in inet_diag

2016-04-08 Thread Xin Long
inet_diag_msg_common_fill is used to fill the diag msg common info,
we need to use it in sctp_diag as well, so export it.

We also add inet_diag_get_handler() to access inet_diag_table in sctp
diag.

Signed-off-by: Xin Long 
---
 net/ipv4/inet_diag.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index bd591eb..5a0bfe0 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -66,7 +66,13 @@ static void inet_diag_unlock_handler(const struct 
inet_diag_handler *handler)
mutex_unlock(&inet_diag_table_mutex);
 }
 
-static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+const struct inet_diag_handler *inet_diag_get_handler(int proto)
+{
+   return inet_diag_table[proto];
+}
+EXPORT_SYMBOL_GPL(inet_diag_get_handler);
+
+void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
 {
r->idiag_family = sk->sk_family;
 
@@ -89,6 +95,7 @@ static void inet_diag_msg_common_fill(struct inet_diag_msg 
*r, struct sock *sk)
r->id.idiag_dst[0] = sk->sk_daddr;
}
 }
+EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
 
 static size_t inet_sk_attr_size(void)
 {
-- 
2.1.0



[PATCHv2 net-next 5/6] sctp: merge the seq_start/next/exits in remaddrs and assocs

2016-04-08 Thread Xin Long
In sctp proc, these three functions in remaddrs and assocs are the
same. we should merge them into one.

Signed-off-by: Xin Long 
---
 net/sctp/proc.c | 45 +
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index dd8492f..9fe1393 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -282,7 +282,7 @@ struct sctp_ht_iter {
struct rhashtable_iter hti;
 };
 
-static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
 {
struct sctp_ht_iter *iter = seq->private;
int err = sctp_transport_walk_start(&iter->hti);
@@ -293,14 +293,14 @@ static void *sctp_assocs_seq_start(struct seq_file *seq, 
loff_t *pos)
return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
 
-static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
 {
struct sctp_ht_iter *iter = seq->private;
 
sctp_transport_walk_stop(&iter->hti);
 }
 
-static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t 
*pos)
 {
struct sctp_ht_iter *iter = seq->private;
 
@@ -367,9 +367,9 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void 
*v)
 }
 
 static const struct seq_operations sctp_assoc_ops = {
-   .start = sctp_assocs_seq_start,
-   .next  = sctp_assocs_seq_next,
-   .stop  = sctp_assocs_seq_stop,
+   .start = sctp_transport_seq_start,
+   .next  = sctp_transport_seq_next,
+   .stop  = sctp_transport_seq_stop,
.show  = sctp_assocs_seq_show,
 };
 
@@ -406,33 +406,6 @@ void sctp_assocs_proc_exit(struct net *net)
remove_proc_entry("assocs", net->sctp.proc_net_sctp);
 }
 
-static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
-{
-   struct sctp_ht_iter *iter = seq->private;
-   int err = sctp_transport_walk_start(&iter->hti);
-
-   if (err)
-   return ERR_PTR(err);
-
-   return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
-}
-
-static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-   struct sctp_ht_iter *iter = seq->private;
-
-   ++*pos;
-
-   return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
-}
-
-static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
-{
-   struct sctp_ht_iter *iter = seq->private;
-
-   sctp_transport_walk_stop(&iter->hti);
-}
-
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 {
struct sctp_association *assoc;
@@ -506,9 +479,9 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void 
*v)
 }
 
 static const struct seq_operations sctp_remaddr_ops = {
-   .start = sctp_remaddr_seq_start,
-   .next  = sctp_remaddr_seq_next,
-   .stop  = sctp_remaddr_seq_stop,
+   .start = sctp_transport_seq_start,
+   .next  = sctp_transport_seq_next,
+   .stop  = sctp_transport_seq_stop,
.show  = sctp_remaddr_seq_show,
 };
 
-- 
2.1.0



[PATCHv2 net-next 4/6] sctp: add the sctp_diag.c file

2016-04-08 Thread Xin Long
This one will implement all the interface of inet_diag, inet_diag_handler.
which includes sctp_diag_dump, sctp_diag_dump_one and sctp_diag_get_info.

It will work as a modules, and register inet_diag_handler when loading.

Signed-off-by: Xin Long 
---
 include/uapi/linux/inet_diag.h |   2 +
 net/sctp/Kconfig   |   4 +
 net/sctp/Makefile  |   1 +
 net/sctp/sctp_diag.c   | 581 +
 4 files changed, 588 insertions(+)
 create mode 100644 net/sctp/sctp_diag.c

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 68a1f71..f5f3629 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -113,6 +113,8 @@ enum {
INET_DIAG_DCTCPINFO,
INET_DIAG_PROTOCOL,  /* response attribute only */
INET_DIAG_SKV6ONLY,
+   INET_DIAG_LOCALS,
+   INET_DIAG_PEERS,
 };
 
 #define INET_DIAG_MAX INET_DIAG_SKV6ONLY
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 71c1a59..d9c04dc 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -99,5 +99,9 @@ config SCTP_COOKIE_HMAC_SHA1
select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1
select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1
 
+config INET_SCTP_DIAG
+   depends on INET_DIAG
+   def_tristate INET_DIAG
+
 
 endif # IP_SCTP
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 3b4ffb0..0fca582 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_IP_SCTP) += sctp.o
 obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
+obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
 
 sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
  protocol.o endpointola.o associola.o \
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
new file mode 100644
index 000..86fccc2
--- /dev/null
+++ b/net/sctp/sctp_diag.c
@@ -0,0 +1,581 @@
+#include 
+#include 
+#include 
+#include 
+
+extern const struct inet_diag_handler *inet_diag_get_handler(int proto);
+extern void inet_diag_msg_common_fill(struct inet_diag_msg *r,
+ struct sock *sk);
+
+static int inet_sctp_fill_laddrs(struct sk_buff *skb,
+struct list_head *address_list)
+{
+   struct sctp_sockaddr_entry *laddr;
+   int addrlen = sizeof(struct sockaddr_storage);
+   int addrcnt = 0;
+   struct nlattr *attr;
+   void *info = NULL;
+
+   list_for_each_entry_rcu(laddr, address_list, list)
+   addrcnt++;
+
+   attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt);
+   if (!attr)
+   return -EMSGSIZE;
+
+   info = nla_data(attr);
+   list_for_each_entry_rcu(laddr, address_list, list) {
memcpy(info, &laddr->a, addrlen);
+   info += addrlen;
+   }
+
+   return 0;
+}
+
+static int inet_sctp_fill_paddrs(struct sk_buff *skb,
+struct sctp_association *asoc)
+{
+   int addrlen = sizeof(struct sockaddr_storage);
+   struct sctp_transport *from;
+   struct nlattr *attr;
+   void *info = NULL;
+
+   attr = nla_reserve(skb, INET_DIAG_PEERS,
+  addrlen * asoc->peer.transport_count);
+   if (!attr)
+   return -EMSGSIZE;
+
+   info = nla_data(attr);
list_for_each_entry(from, &asoc->peer.transport_addr_list,
+   transports) {
memcpy(info, &from->ipaddr, addrlen);
+   info += addrlen;
+   }
+
+   return 0;
+}
+
+static int inet_assoc_diag_fill(struct sock *sk,
+   struct sctp_association *asoc,
+   struct sk_buff *skb,
+   const struct inet_diag_req_v2 *req,
+   struct user_namespace *user_ns,
+   int portid, u32 seq, u16 nlmsg_flags,
+   const struct nlmsghdr *unlh)
+{
+   const struct inet_sock *inet = inet_sk(sk);
+   const struct inet_diag_handler *handler;
+   int ext = req->idiag_ext;
+   struct inet_diag_msg *r;
+   struct nlmsghdr  *nlh;
+   struct nlattr *attr;
+   void *info = NULL;
+   union sctp_addr laddr, paddr;
+   struct dst_entry *dst;
+   struct sctp_infox infox;
+
+   handler = inet_diag_get_handler(req->sdiag_protocol);
+   BUG_ON(!handler);
+
+   nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+   nlmsg_flags);
+   if (!nlh)
+   return -EMSGSIZE;
+
+   r = nlmsg_data(nlh);
+   BUG_ON(!sk_fullsock(sk));
+
+   laddr = list_entry(asoc->base.bind_addr.address_list.next,
+  struct sctp_sockaddr_entry, list)->a;
+   paddr = asoc->peer.primary_path->ipaddr;
+   dst = asoc->peer.primary_path->dst;
+
+   r->idiag_family = sk->sk_family;
+   r->id.idiag_sport = htons(asoc->base.bind_addr.port);
+  

[PATCHv2 net-next 1/6] sctp: add sctp_info dump api for sctp_diag

2016-04-08 Thread Xin Long
sctp_diag will dump some important details of sctp's assoc or ep, we use
sctp_info to describe them,  sctp_get_sctp_info to get them, and export
it to sctp_diag.ko.

Signed-off-by: Xin Long 
---
 include/linux/sctp.h| 65 +
 include/net/sctp/sctp.h |  3 ++
 net/sctp/socket.c   | 86 +
 3 files changed, 154 insertions(+)

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index a9414fd..a448ebc 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -705,4 +705,69 @@ typedef struct sctp_auth_chunk {
sctp_authhdr_t auth_hdr;
 } __packed sctp_auth_chunk_t;
 
+struct sctp_info {
+   __u32   sctpi_tag;
+   __u32   sctpi_state;
+   __u32   sctpi_rwnd;
+   __u16   sctpi_unackdata;
+   __u16   sctpi_penddata;
+   __u16   sctpi_instrms;
+   __u16   sctpi_outstrms;
+   __u32   sctpi_fragmentation_point;
+   __u32   sctpi_inqueue;
+   __u32   sctpi_outqueue;
+   __u32   sctpi_overall_error;
+   __u32   sctpi_max_burst;
+   __u32   sctpi_maxseg;
+   __u32   sctpi_peer_rwnd;
+   __u32   sctpi_peer_tag;
+   __u8sctpi_peer_capable;
+   __u8sctpi_peer_sack;
+
+   /* assoc status info */
+   __u64   sctpi_isacks;
+   __u64   sctpi_osacks;
+   __u64   sctpi_opackets;
+   __u64   sctpi_ipackets;
+   __u64   sctpi_rtxchunks;
+   __u64   sctpi_outofseqtsns;
+   __u64   sctpi_idupchunks;
+   __u64   sctpi_gapcnt;
+   __u64   sctpi_ouodchunks;
+   __u64   sctpi_iuodchunks;
+   __u64   sctpi_oodchunks;
+   __u64   sctpi_iodchunks;
+   __u64   sctpi_octrlchunks;
+   __u64   sctpi_ictrlchunks;
+
+   /* primary transport info */
+   struct sockaddr_storage sctpi_p_address;
+   __s32   sctpi_p_state;
+   __u32   sctpi_p_cwnd;
+   __u32   sctpi_p_srtt;
+   __u32   sctpi_p_rto;
+   __u32   sctpi_p_hbinterval;
+   __u32   sctpi_p_pathmaxrxt;
+   __u32   sctpi_p_sackdelay;
+   __u32   sctpi_p_sackfreq;
+   __u32   sctpi_p_ssthresh;
+   __u32   sctpi_p_partial_bytes_acked;
+   __u32   sctpi_p_flight_size;
+   __u16   sctpi_p_error;
+
+   /* sctp sock info */
+   __u32   sctpi_s_autoclose;
+   __u32   sctpi_s_adaptation_ind;
+   __u32   sctpi_s_pd_point;
+   __u8sctpi_s_nodelay;
+   __u8sctpi_s_disable_fragments;
+   __u8sctpi_s_v4mapped;
+   __u8sctpi_s_frag_interleave;
+};
+
+struct sctp_infox {
+   struct sctp_info *sctpinfo;
+   struct sctp_association *asoc;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 65521cf..36e1eae 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,6 +116,9 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+  struct sctp_info *info);
+
 /*
  * sctp/primitive.c
  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 878d28e..8f79f23 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4202,6 +4202,92 @@ static void sctp_shutdown(struct sock *sk, int how)
}
 }
 
+int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
+  struct sctp_info *info)
+{
+   struct sctp_transport *prim;
+   struct list_head *pos, *temp;
+   int mask;
+
+   memset(info, 0, sizeof(*info));
+   if (!asoc) {
+   struct sctp_sock *sp = sctp_sk(sk);
+
+   info->sctpi_s_autoclose = sp->autoclose;
+   info->sctpi_s_adaptation_ind = sp->adaptation_ind;
+   info->sctpi_s_pd_point = sp->pd_point;
+   info->sctpi_s_nodelay = sp->nodelay;
+   info->sctpi_s_disable_fragments = sp->disable_fragments;
+   info->sctpi_s_v4mapped = sp->v4mapped;
+   info->sctpi_s_frag_interleave = sp->frag_interleave;
+
+   return 0;
+   }
+
+   info->sctpi_tag = asoc->c.my_vtag;
+   info->sctpi_state = asoc->state;
+   info->sctpi_rwnd = asoc->a_rwnd;
+   info->sctpi_unackdata = asoc->unack_data;
+   info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
+   info->sctpi_instrms = asoc->c.sinit_max_instreams;
+   info->sctpi_outstrms = asoc->c.sinit_num_ostreams;
+   list_for_each_safe(pos, temp, &asoc->base.inqueue.in_chunk_list)
+   info->sctpi_inqueue++;
+   list_for_each_safe(pos, temp, &asoc->outqueue.out_chunk_list)
+   info->sctpi_outqueue++;
+   info->sctpi_overall_error = asoc->overall_error_count;
+   info->sctpi_max_burst = asoc->max_burst;
+   info->sctpi_maxseg = asoc->frag_point;
+   info->sctpi_peer_rwnd = asoc->peer.rwnd;

[PATCHv2 net-next 0/6] sctp: support sctp_diag in kernel

2016-04-08 Thread Xin Long
This patchset will add sctp_diag module to implement diag interface on
sctp in kernel.

For a listening sctp endpoint, we will just dump it's ep info.
For a sctp connection, we will dump the assoc info and its ep info.

The ss dump will looks like:

[iproute2]# ./misc/ss --sctp  -n -l
State  Recv-Q Send-Q   Local Address:Port   Peer Address:Port
LISTEN 0  128  172.16.254.254:  *:*
LISTEN 0  5127.0.0.1:1234   *:*
LISTEN 0  5127.0.0.1:1234   *:*
  - ESTAB  0  0127.0.0.1%lo:1234127.0.0.1:4321
LISTEN 0  128  172.16.254.254:  *:*
  - ESTAB  0  0172.16.254.254%eth1: 172.16.253.253:
  - ESTAB  0  0172.16.254.254%eth1: 172.16.1.1:
  - ESTAB  0  0172.16.254.254%eth1: 172.16.1.2:
  - ESTAB  0  0172.16.254.254%eth1: 172.16.2.1:
  - ESTAB  0  0172.16.254.254%eth1: 172.16.2.2:
  - ESTAB  0  0172.16.254.254%eth1: 172.16.3.1:
  - ESTAB  0  0172.16.254.254%eth1: 172.16.3.2:
LISTEN 0  0127.0.0.1:4321   *:*
  - ESTAB  0  0127.0.0.1%lo:4321127.0.0.1:1234

The entries with '- ESTAB' are the assocs, some of them may belong to
the same endpoint. So we will dump the parent endpoint first, like the
entry with 'LISTEN'. then dump the assocs. ep and assocs entries will
be dumped in right order so that ss can show them in tree format easily.

Besides, this patchset also simplifies sctp proc codes, cause it has
some similar codes with sctp diag in sctp transport traversal.

v1->v2:
  1. inet_diag_get_handler needs to return it as const.
  2. merge 5/7 into 2/7 of v1.

Xin Long (6):
  sctp: add sctp_info dump api for sctp_diag
  sctp: export some apis or variables for sctp_diag and reuse some for
proc
  sctp: export some functions for sctp_diag in inet_diag
  sctp: add the sctp_diag.c file
  sctp: merge the seq_start/next/exits in remaddrs and assocs
  sctp: fix some rhashtable functions using in sctp proc/diag

 include/linux/sctp.h   |  65 +
 include/net/sctp/sctp.h|  16 ++
 include/uapi/linux/inet_diag.h |   2 +
 net/ipv4/inet_diag.c   |   9 +-
 net/sctp/Kconfig   |   4 +
 net/sctp/Makefile  |   1 +
 net/sctp/proc.c| 104 ++--
 net/sctp/sctp_diag.c   | 581 +
 net/sctp/socket.c  | 215 +++
 9 files changed, 911 insertions(+), 86 deletions(-)
 create mode 100644 net/sctp/sctp_diag.c

-- 
2.1.0



[PATCHv2 net-next 2/6] sctp: export some apis or variables for sctp_diag and reuse some for proc

2016-04-08 Thread Xin Long
For some main variables in sctp.ko, we couldn't export it to other modules,
so we have to define some api to access them.

It will include sctp transport and endpoint's traversal.

There are some transport traversal functions for sctp_diag, we can also
use it for sctp_proc, because they have a similar situation when traversing
transports.

Signed-off-by: Xin Long 
---
 include/net/sctp/sctp.h |  13 +
 net/sctp/proc.c |  80 +++
 net/sctp/socket.c   | 124 
 3 files changed, 155 insertions(+), 62 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 36e1eae..c0c4deb 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -116,6 +116,19 @@ extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
 
+int sctp_transport_walk_start(struct rhashtable_iter *iter);
+void sctp_transport_walk_stop(struct rhashtable_iter *iter);
+struct sctp_transport *sctp_transport_get_next(struct net *net,
+   struct rhashtable_iter *iter);
+struct sctp_transport *sctp_transport_get_idx(struct net *net,
+   struct rhashtable_iter *iter, int pos);
+int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
+ struct net *net,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr, void *p);
+int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *),
+   struct net *net, int pos, void *p);
+int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p);
 int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
   struct sctp_info *info);
 
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 5cfac8d..dd8492f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -282,80 +282,31 @@ struct sctp_ht_iter {
struct rhashtable_iter hti;
 };
 
-static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq)
-{
-   struct sctp_ht_iter *iter = seq->private;
-   struct sctp_transport *t;
-
-   t = rhashtable_walk_next(&iter->hti);
-   for (; t; t = rhashtable_walk_next(&iter->hti)) {
-   if (IS_ERR(t)) {
-   if (PTR_ERR(t) == -EAGAIN)
-   continue;
-   break;
-   }
-
-   if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) &&
-   t->asoc->peer.primary_path == t)
-   break;
-   }
-
-   return t;
-}
-
-static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq,
-loff_t pos)
-{
-   void *obj = SEQ_START_TOKEN;
-
-   while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj))
-   pos--;
-
-   return obj;
-}
-
-static int sctp_transport_walk_start(struct seq_file *seq)
-{
-   struct sctp_ht_iter *iter = seq->private;
-   int err;
-
-   err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti);
-   if (err)
-   return err;
-
-   err = rhashtable_walk_start(&iter->hti);
-
-   return err == -EAGAIN ? 0 : err;
-}
-
-static void sctp_transport_walk_stop(struct seq_file *seq)
-{
-   struct sctp_ht_iter *iter = seq->private;
-
-   rhashtable_walk_stop(&iter->hti);
-   rhashtable_walk_exit(&iter->hti);
-}
-
 static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
 {
-   int err = sctp_transport_walk_start(seq);
+   struct sctp_ht_iter *iter = seq->private;
+   int err = sctp_transport_walk_start(&iter->hti);
 
if (err)
return ERR_PTR(err);
 
-   return sctp_transport_get_idx(seq, *pos);
+   return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
 }
 
 static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
 {
-   sctp_transport_walk_stop(seq);
+   struct sctp_ht_iter *iter = seq->private;
+
+   sctp_transport_walk_stop(&iter->hti);
 }
 
 static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+   struct sctp_ht_iter *iter = seq->private;
+
++*pos;
 
-   return sctp_transport_get_next(seq);
-   return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
 }
 
 /* Display sctp associations (/proc/net/sctp/assocs). */
@@ -457,24 +408,29 @@ void sctp_assocs_proc_exit(struct net *net)
 
 static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
 {
-   int err = sctp_transport_walk_start(seq);
+   struct sctp_ht_iter *iter = seq->private;
+   int err = sctp_transport_walk_start(&iter->hti);
 
if (err)
return ERR_PTR(err);
 
-   return sctp_transport_get_idx(seq, *pos);
+   

Re: [PATCH net-next] ibmvnic: Enable use of multiple tx/rx scrqs

2016-04-08 Thread David Miller
From: John Allen 
Date: Wed, 6 Apr 2016 11:49:55 -0500

> Enables the use of multiple transmit and receive scrqs allowing the ibmvnic
> driver to take advantage of multiqueue functionality. To achieve this, the
> driver must implement the process of negotiating the maximum number of
> queues allowed by the server. Initially, the driver will attempt to login
> with the maximum number of tx and rx queues supported by the server. If
> the server fails to allocate the requested number of scrqs, it will return
> partial success in the login response. In this case, we must reinitiate
> the login process from the request capabilities stage and attempt to login
> requesting fewer scrqs.
> 
> Signed-off-by: John Allen 

Applied, thanks.


Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread Petri Gynther
On Fri, Apr 8, 2016 at 6:56 PM, Eric Dumazet  wrote:
> On Fri, 2016-04-08 at 18:39 -0700, Petri Gynther wrote:
>> On Fri, Apr 8, 2016 at 1:36 PM, David Miller  wrote:
>> > From: Petri Gynther 
>> > Date: Tue,  5 Apr 2016 17:50:01 -0700
>> >
>> >> Add Byte Queue Limits (BQL) support to bcmgenet driver.
>> >>
>> >> Signed-off-by: Petri Gynther 
>> >
>> > As Eric Dumazet indicated, your ->ndo_init() code to reset the queues is
>> > probably not necessary at all.
>>
>> I added the netdev_tx_reset_queue(txq) calls to ndo_open() path:
>> netdev->ndo_open()
>>   bcmgenet_open()
>> bcmgenet_netif_start()
>>   for all Tx queues:
>> netdev_tx_reset_queue(txq)
>>   clear __QUEUE_STATE_STACK_XOFF
>>   dql_reset()
>>   netif_tx_start_all_queues(dev)
>> for all Tx queues:
>>   clear __QUEUE_STATE_DRV_XOFF
>>
>> So, I think the call to netdev_tx_reset_queue(txq) is in the right
>> place. It ensures that the Tx queue state is clean when the device is
>> opened.
>
>
> The netdev_tx_reset_queue(txq) calls are only needed in exceptional
> conditions.
>
> Not at device start, as the core networking layer init all txq
> (including their BQL state) properly before giving them to drivers for
> use.
>

What values does the networking core program into BQL dynamic limits
that my code in netdev->ndo_open() would wipe out?

You mentioned the queue init path:
netdev_init_one_queue() -> dql_init() -> dql_reset()

that is called when the netdev is created and Tx queues allocated.

But, does the networking core somewhere set *different* values for BQL
dynamic limits than what dql_reset() did, before opening the device?

> For example, tg3 calls netdev_tx_reset_queue() only when freeing tx
> rings, as it might have freed skb(s) not from normal TX complete path
> and thus missed appropriate dql_completed().
>

Looking at the tg3 driver, it calls:
tg3_stop()
  tg3_free_rings()
netdev_tx_reset_queue()

netdev_tx_reset_queue() is called unconditionally, as long as the Tx
ring exists. So "ip link set dev eth down" would cause it to be
called.

Why is it OK to call netdev_tx_reset_queue() from the
netdev->ndo_stop() path, but not from netdev->ndo_open() path?

> If you believe BQL drivers need a fix, please elaborate ?
>
> Thanks.
>
>


Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread Alexander Duyck
On Fri, Apr 8, 2016 at 6:56 PM, Eric Dumazet  wrote:
> On Fri, 2016-04-08 at 18:39 -0700, Petri Gynther wrote:
>> On Fri, Apr 8, 2016 at 1:36 PM, David Miller  wrote:
>> > From: Petri Gynther 
>> > Date: Tue,  5 Apr 2016 17:50:01 -0700
>> >
>> >> Add Byte Queue Limits (BQL) support to bcmgenet driver.
>> >>
>> >> Signed-off-by: Petri Gynther 
>> >
>> > As Eric Dumazet indicated, your ->ndo_init() code to reset the queues is
>> > probably not necessary at all.
>>
>> I added the netdev_tx_reset_queue(txq) calls to ndo_open() path:
>> netdev->ndo_open()
>>   bcmgenet_open()
>> bcmgenet_netif_start()
>>   for all Tx queues:
>> netdev_tx_reset_queue(txq)
>>   clear __QUEUE_STATE_STACK_XOFF
>>   dql_reset()
>>   netif_tx_start_all_queues(dev)
>> for all Tx queues:
>>   clear __QUEUE_STATE_DRV_XOFF
>>
>> So, I think the call to netdev_tx_reset_queue(txq) is in the right
>> place. It ensures that the Tx queue state is clean when the device is
>> opened.
>
>
> The netdev_tx_reset_queue(txq) calls are only needed in exceptional
> conditions.
>
> Not at device start, as the core networking layer init all txq
> (including their BQL state) properly before giving them to drivers for
> use.
>
> For example, tg3 calls netdev_tx_reset_queue() only when freeing tx
> rings, as it might have freed skb(s) not from normal TX complete path
> and thus missed appropriate dql_completed().
>
> If you believe BQL drivers need a fix, please elaborate ?
>
> Thanks.

For a bit of history on why you might want to do the reset on clean-up
instead of init you might take a look at commit dad8a3b3eaa0 ("igb,
ixgbe: netdev_tx_reset_queue incorrectly called from tx init path").

Basically you want to make certain you flush the queues after bringing
the interface down so that you don't possibly trigger any false hangs
for having stalled queues.  Basically the rule with the BQL stuff is
you need to leave the Tx queue in the state you found it in instead of
just wiping it and making use of it and putting it away dirty.

- Alex


Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread Eric Dumazet
On Fri, 2016-04-08 at 18:39 -0700, Petri Gynther wrote:
> On Fri, Apr 8, 2016 at 1:36 PM, David Miller  wrote:
> > From: Petri Gynther 
> > Date: Tue,  5 Apr 2016 17:50:01 -0700
> >
> >> Add Byte Queue Limits (BQL) support to bcmgenet driver.
> >>
> >> Signed-off-by: Petri Gynther 
> >
> > As Eric Dumazet indicated, your ->ndo_init() code to reset the queues is
> > probably not necessary at all.
> 
> I added the netdev_tx_reset_queue(txq) calls to ndo_open() path:
> netdev->ndo_open()
>   bcmgenet_open()
> bcmgenet_netif_start()
>   for all Tx queues:
> netdev_tx_reset_queue(txq)
>   clear __QUEUE_STATE_STACK_XOFF
>   dql_reset()
>   netif_tx_start_all_queues(dev)
> for all Tx queues:
>   clear __QUEUE_STATE_DRV_XOFF
> 
> So, I think the call to netdev_tx_reset_queue(txq) is in the right
> place. It ensures that the Tx queue state is clean when the device is
> opened.


The netdev_tx_reset_queue(txq) calls are only needed in exceptional
conditions.

Not at device start, as the core networking layer init all txq
(including their BQL state) properly before giving them to drivers for
use.

For example, tg3 calls netdev_tx_reset_queue() only when freeing tx
rings, as it might have freed skb(s) not from normal TX complete path
and thus missed appropriate dql_completed().

If you believe BQL drivers need a fix, please elaborate ?

Thanks.




Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread Petri Gynther
On Fri, Apr 8, 2016 at 1:36 PM, David Miller  wrote:
> From: Petri Gynther 
> Date: Tue,  5 Apr 2016 17:50:01 -0700
>
>> Add Byte Queue Limits (BQL) support to bcmgenet driver.
>>
>> Signed-off-by: Petri Gynther 
>
> As Eric Dumazet indicated, your ->ndo_init() code to reset the queues is
> probably not necessary at all.

I added the netdev_tx_reset_queue(txq) calls to ndo_open() path:
netdev->ndo_open()
  bcmgenet_open()
bcmgenet_netif_start()
  for all Tx queues:
netdev_tx_reset_queue(txq)
  clear __QUEUE_STATE_STACK_XOFF
  dql_reset()
  netif_tx_start_all_queues(dev)
for all Tx queues:
  clear __QUEUE_STATE_DRV_XOFF

So, I think the call to netdev_tx_reset_queue(txq) is in the right
place. It ensures that the Tx queue state is clean when the device is
opened.


RE: [PATCH -v2] drivers: net: ethernet: intel: e1000e: fix ethtool autoneg off for non-copper

2016-04-08 Thread Brown, Aaron F
> From: netdev-ow...@vger.kernel.org [mailto:netdev-
> ow...@vger.kernel.org] On Behalf Of Daniel Walker
> Sent: Tuesday, April 5, 2016 11:30 AM
> To: Ruinskiy, Dima ; Kirsher, Jeffrey T
> ; Brandeburg, Jesse
> ; Nelson, Shannon
> ; Wyborny, Carolyn
> ; Skidmore, Donald C
> ; Allan, Bruce W ;
> Ronciak, John ; Williams, Mitch A
> 
> Cc: Steve Shih ; xe-ker...@external.cisco.com; Daniel
> Walker ; intel-wired-...@lists.osuosl.org;
> netdev@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: [PATCH -v2] drivers: net: ethernet: intel: e1000e: fix ethtool 
> autoneg
> off for non-copper
> 
> From: Steve Shih 
> 
> This patch fixes the issues for disabling auto-negotiation and forcing
> speed and duplex settings for the non-copper media.
> 
> For non-copper media, e1000_get_settings should return
> ETH_TP_MDI_INVALID for
> eth_tp_mdix_ctrl instead of ETH_TP_MDI_AUTO so subsequent
> e1000_set_settings
> call would not fail with -EOPNOTSUPP.
> 
> e1000_set_spd_dplx should not automatically turn autoneg back on for
> forced
> 1000 Mbps full duplex settings for non-copper media.
> 
> Cc: xe-ker...@external.cisco.com
> Cc: Daniel Walker 
> Signed-off-by: Steve Shih 
> ---
>  drivers/net/ethernet/intel/e1000e/ethtool.c | 11 +--
>  1 file changed, 9 insertions(+), 2 deletions(-)

Tested-by: Aaron Brown 



Re: How do I avoid recvmsg races with IP_RECVERR?

2016-04-08 Thread Andy Lutomirski
On Tue, Jun 2, 2015 at 5:33 PM, Hannes Frederic Sowa
 wrote:
> On Wed, Jun 3, 2015, at 02:03, Andy Lutomirski wrote:
>> On Tue, Jun 2, 2015 at 2:50 PM, Hannes Frederic Sowa
>>  wrote:
>> >> My proposal would be to make the error conversion lazy:
>> >>
>> >> Keeping duplicate data is not a good idea in general: So we shouldn't
>> >> use sk->sk_err if IP_RECVERR is set at all but let sock_error just use
>> >> the sk_error_queue and extract the error code from there.
>> >>
>> >> Only if IP_RECVERR was not set, we use sk->sk_err logic.
>> >>
>> >> What do you think?
>> >
>> > I just noticed that this will probably break existing user space
>> > applications which require that icmp errors are transient even with
>> > IP_RECVERR. We can mark that with a bit in the sk_error_queue pointer
>> > and xchg the pointer, hmmm
>>
>> Do you mean to fix the race like this but to otherwise leave the
>> semantics
>> alone?  That would be an improvement, but it might be nice to also add
>> a non-crappy API for this, too.
>
> Yes, keep current semantics but fix the race you reported.
>
> I currently don't have good proposals for a decent API to handle this
> besides adding some ancillary cmsg data to msg_control. This still would
> not solve the problem fundamentally, as a -EFAULT/-EINVAL return value
> could also mean that msg_control should not be touched, thus we end up
> again relying on errno checking. :/ Thus checking error queue after
> receiving an error indications is my best hunch so far.
>
> Your proposal with MSG_IGNORE_ERROR seems reasonable so far for ping or
> udp, but I haven't fully grasped the TCP semantics of sk->sk_err, yet.

I was looking at this a bit, and I was thinking about adding a new
socket option, but I'm a bit vague on how all this fits together.

One option would be a socket option that simply causes sock_error to
return 0 (and change SO_ERROR to peek at sk_err directly).  But there
seem to be sock_error callers all over the place, and maybe this
change would cause problems.

Another option would be to add a socket option that explicitly turns
off everything that queues soft errors to sk_err.

I think that, for IP datagrams at least, the ideal semantics would be
for soft errors not to affect sk_err and for POLLERR to be set if the
error queue is nonempty.

--Andy


Re: [PATCH net-next v2] vxlan: synchronously and race-free destruction of vxlan sockets

2016-04-08 Thread Hannes Frederic Sowa


On Sat, Apr 9, 2016, at 01:24, Cong Wang wrote:
> On Fri, Apr 8, 2016 at 1:55 PM, Hannes Frederic Sowa
>  wrote:
> > Due to the fact that the udp socket is destructed asynchronously in a
> > work queue, we have some nondeterministic behavior during shutdown of
> > vxlan tunnels and creating new ones. Fix this by keeping the destruction
> > process synchronous in regards to the user space process so IFF_UP can
> > be reliably set.
> >
> > udp_tunnel_sock_release destroys vs->sock->sk if reference counter
> > indicates so. We expect to have the same lifetime of vxlan_sock and
> > vxlan_sock->sock->sk even in fast paths with only rcu locks held. So
> > only destruct the whole socket after we can be sure it cannot be found
> > by searching vxlan_net->sock_list.
> >
> 
> I am wondering what is the reason why we used work queue from
> the beginning?

I actually don't know. It was like that from the beginning. I cc'ed
Stephen, maybe he remembers?

Bye,
Hannes


Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-08 Thread Bjorn Andersson
On Fri 08 Apr 16:01 PDT 2016, Timur Tabi wrote:

> Bjorn Andersson wrote:
> 
> >It sounds like you're trying to say that the pins used can be are
> >muxed as GPIO or MDIO, in the TLMM.
> 
> I'm not 100% sure, but I think that's correct.  If you don't want to have
> normal networking, you could connect those external pins to some GPIO device
> (like an LED or whatever), and then configure the pin muxing for GPIO
> purposes.  But if that's true, it's only true on the FSM9900. On the
> QDF2432, those lines are not connected to the TLMM.  They are instead
> hard-wired to the Emac.
> 

Then through proper use of the pinctrl framework you should configure
the FSM9900 to mux these pins appropriately and the two solutions are
equivalent.

> >In the downstream kernel this is often seen with the drivers calling
> >gpio_request() to "reserve" said pins, but all you should do is
> >described the desired configuration and muxing in the pinctrl node,
> >reference that from your driver and simply ignore the fact that those
> >pins could have been used as GPIO pins.
> 
> That makes sense, but I think the driver already does that.
> 
> https://patchwork.ozlabs.org/patch/561667/
> 
> Function emac_probe_resources() has a call to of_get_named_gpio().  And then
> emac_mac_up() calls gpio_request().  As far as I can tell, that's it.
> 
> I'm guessing that the of_get_named_gpio() call needs to be changed somehow,
> but I'm not sure how.
> 

Thanks for the link.

In short those call to the gpio framework should just be removed. They
should only be there if you're using the gpiolib to control the state of
those pins, and you're not as far as I can see.


The general outline of what you should have in your dts instead is:

soc {
tlmm {
compatible = "qcom,pinctrl-xyz";

mdio_pins_a: mdio {
state {
pins = "gpio0", "gpio1";
function = "mdio";
};
};
};

emac {
compatible = "qcom,somthing-emac";

pinctrl-names = "default";
pinctrl-0 = <_pins_a>;
};
};

Regards,
Bjorn


Re: [PATCH net-next v2] vxlan: synchronously and race-free destruction of vxlan sockets

2016-04-08 Thread Cong Wang
On Fri, Apr 8, 2016 at 1:55 PM, Hannes Frederic Sowa
 wrote:
> Due to the fact that the udp socket is destructed asynchronously in a
> work queue, we have some nondeterministic behavior during shutdown of
> vxlan tunnels and creating new ones. Fix this by keeping the destruction
> process synchronous in regards to the user space process so IFF_UP can
> be reliably set.
>
> udp_tunnel_sock_release destroys vs->sock->sk if reference counter
> indicates so. We expect to have the same lifetime of vxlan_sock and
> vxlan_sock->sock->sk even in fast paths with only rcu locks held. So
> only destruct the whole socket after we can be sure it cannot be found
> by searching vxlan_net->sock_list.
>

I am wondering what is the reason why we used work queue from
the beginning?


Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-08 Thread Timur Tabi

Bjorn Andersson wrote:


It sounds like you're trying to say that the pins used can be are
muxed as GPIO or MDIO, in the TLMM.


I'm not 100% sure, but I think that's correct.  If you don't want to 
have normal networking, you could connect those external pins to some 
GPIO device (like an LED or whatever), and then configure the pin muxing 
for GPIO purposes.  But if that's true, it's only true on the FSM9900. 
On the QDF2432, those lines are not connected to the TLMM.  They are 
instead hard-wired to the Emac.



In the downstream kernel this is often seen with the drivers calling
gpio_request() to "reserve" said pins, but all you should do is
described the desired configuration and muxing in the pinctrl node,
reference that from your driver and simply ignore the fact that those
pins could have been used as GPIO pins.


That makes sense, but I think the driver already does that.

https://patchwork.ozlabs.org/patch/561667/

Function emac_probe_resources() has a call to of_get_named_gpio().  And 
then emac_mac_up() calls gpio_request().  As far as I can tell, that's it.


I'm guessing that the of_get_named_gpio() call needs to be changed 
somehow, but I'm not sure how.


--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum, a Linux Foundation collaborative project.


Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-08 Thread Bjorn Andersson
On Fri, Apr 8, 2016 at 12:06 PM, Timur Tabi  wrote:
> Andrew Lunn wrote:
>
>> There are two different things here. One is configuring the pin to be
>> a GPIO. The second is using the GPIO as a GPIO. In this case,
>> bit-banging the MDIO bus.
>>
>> The firmware could be doing the configuration, setting the pin as a
>> GPIO. However, the firmware cannot be doing the MDIO bit-banging to
>> make an MDIO bus available. Linux has to do that.
>>
>> Or it could be we have all completely misunderstood the hardware, and
>> we are not doing bit-banging GPIO MDIO. There is a real MDIO
>> controller there, we don't use these pins as GPIOs, etc
>
>
> Actually, I think there is a misunderstanding.
>
> On the FSM9900 SOC (which uses device-tree), the two pins that connect to
> the external PHY are gpio pins.  However, the driver needs to reprogram the
> pinmux so that those pins are wired to the Emac controller.  That's what the
> the gpio code in this driver is doing: it's just configuring the pins so
> that they connect directly between the Emac and the external PHY.  After
> that, they are no longer GPIO pins, and you cannot use the "GPIO controlled
> MDIO bus".  There is no MDIO controller on the SOC.  The external PHY is
> controlled directly from the Emac and also from the internal PHY.  It is
> screwy, I know, but that's what Gilad was trying to explain.
>

It sounds like you're trying to say that the pins used can be are
muxed as GPIO or MDIO, in the TLMM.

In the downstream kernel this is often seen with the drivers calling
gpio_request() to "reserve" said pins, but all you should do is
described the desired configuration and muxing in the pinctrl node,
reference that from your driver and simply ignore the fact that those
pins could have been used as GPIO pins.

Regards,
Bjorn


[PATCH iproute2 -master 1/3] tc, bpf: add new csum and tunnel signatures

2016-04-08 Thread Daniel Borkmann
Add new signatures for BPF_FUNC_csum_diff, BPF_FUNC_skb_get_tunnel_opt
and BPF_FUNC_skb_set_tunnel_opt.

Signed-off-by: Daniel Borkmann 
---
 include/bpf_api.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/bpf_api.h b/include/bpf_api.h
index 4b16d25..0f278f0 100644
--- a/include/bpf_api.h
+++ b/include/bpf_api.h
@@ -212,6 +212,8 @@ static int BPF_FUNC(l3_csum_replace, struct __sk_buff *skb, 
uint32_t off,
uint32_t from, uint32_t to, uint32_t flags);
 static int BPF_FUNC(l4_csum_replace, struct __sk_buff *skb, uint32_t off,
uint32_t from, uint32_t to, uint32_t flags);
+static int BPF_FUNC(csum_diff, const void *from, uint32_t from_size,
+   const void *to, uint32_t to_size, uint32_t seed);
 
 /* Packet vlan encap/decap */
 static int BPF_FUNC(skb_vlan_push, struct __sk_buff *skb, uint16_t proto,
@@ -225,6 +227,11 @@ static int BPF_FUNC(skb_set_tunnel_key, struct __sk_buff 
*skb,
const struct bpf_tunnel_key *from, uint32_t size,
uint32_t flags);
 
+static int BPF_FUNC(skb_get_tunnel_opt, struct __sk_buff *skb,
+   void *to, uint32_t size);
+static int BPF_FUNC(skb_set_tunnel_opt, struct __sk_buff *skb,
+   const void *from, uint32_t size);
+
 /** LLVM built-ins, mem*() routines work for constant size */
 
 #ifndef lock_xadd
-- 
1.9.3



[PATCH iproute2 -master 2/3] tc, bpf: further improve error reporting

2016-04-08 Thread Daniel Borkmann
Make it easier to spot issues when loading the object file fails. This
includes reporting in what pinned object specs differ, better indication
when we've reached instruction limits. Don't retry to load a non relo
program once we failed with bpf(2), and report out of bounds tail call key.

Also, add truncation of huge log outputs by default. Sometimes errors are
quite easy to spot by only looking at the tail of the verifier log, but
logs can get huge in size e.g. up to few MB (due to verifier checking all
possible program paths). Thus, by default limit output to the last 4096
bytes and indicate that it's truncated. For the full log, the verbose option
can be used.

Signed-off-by: Daniel Borkmann 
---
 tc/tc_bpf.c | 82 -
 tc/tc_bpf.h |  4 +++
 2 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c
index d94af82..0c59427 100644
--- a/tc/tc_bpf.c
+++ b/tc/tc_bpf.c
@@ -184,7 +184,7 @@ static int bpf_ops_parse(int argc, char **argv, struct 
sock_filter *bpf_ops,
}
 
if (i != bpf_len) {
-   fprintf(stderr, "Parsed program length is less than 
encodedlength parameter!\n");
+   fprintf(stderr, "Parsed program length is less than encoded 
length parameter!\n");
ret = -EINVAL;
goto out;
}
@@ -214,6 +214,27 @@ void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 
len)
ops[i].jf, ops[i].k);
 }
 
+static void bpf_map_pin_report(const struct bpf_elf_map *pin,
+  const struct bpf_elf_map *obj)
+{
+   fprintf(stderr, "Map specification differs from pinned file!\n");
+
+   if (obj->type != pin->type)
+   fprintf(stderr, " - Type: %u (obj) != %u (pin)\n",
+   obj->type, pin->type);
+   if (obj->size_key != pin->size_key)
+   fprintf(stderr, " - Size key: %u (obj) != %u (pin)\n",
+   obj->size_key, pin->size_key);
+   if (obj->size_value != pin->size_value)
+   fprintf(stderr, " - Size value:   %u (obj) != %u (pin)\n",
+   obj->size_value, pin->size_value);
+   if (obj->max_elem != pin->max_elem)
+   fprintf(stderr, " - Max elems:%u (obj) != %u (pin)\n",
+   obj->max_elem, pin->max_elem);
+
+   fprintf(stderr, "\n");
+}
+
 static int bpf_map_selfcheck_pinned(int fd, const struct bpf_elf_map *map,
int length)
 {
@@ -256,7 +277,7 @@ static int bpf_map_selfcheck_pinned(int fd, const struct 
bpf_elf_map *map,
if (!memcmp(, , length))
return 0;
 
-   fprintf(stderr, "Map specs from pinned file differ!\n");
+   bpf_map_pin_report(, map);
return -EINVAL;
}
 }
@@ -735,7 +756,19 @@ bpf_dump_error(struct bpf_elf_ctx *ctx, const char 
*format, ...)
va_end(vl);
 
if (ctx->log && ctx->log[0]) {
-   fprintf(stderr, "%s\n", ctx->log);
+   if (ctx->verbose) {
+   fprintf(stderr, "%s\n", ctx->log);
+   } else {
+   unsigned int off = 0, len = strlen(ctx->log);
+
+   if (len > BPF_MAX_LOG) {
+   off = len - BPF_MAX_LOG;
+   fprintf(stderr, "Skipped %u bytes, use \'verb\' 
option for the full verbose log.\n[...]\n",
+   off);
+   }
+   fprintf(stderr, "%s\n", ctx->log + off);
+   }
+
memset(ctx->log, 0, ctx->log_size);
}
 }
@@ -1055,14 +1088,16 @@ static void bpf_prog_report(int fd, const char *section,
const struct bpf_elf_prog *prog,
struct bpf_elf_ctx *ctx)
 {
-   fprintf(stderr, "Prog section \'%s\' %s%s (%d)!\n", section,
+   unsigned int insns = prog->size / sizeof(struct bpf_insn);
+
+   fprintf(stderr, "\nProg section \'%s\' %s%s (%d)!\n", section,
fd < 0 ? "rejected: " : "loaded",
fd < 0 ? strerror(errno) : "",
fd < 0 ? errno : fd);
 
fprintf(stderr, " - Type: %u\n", prog->type);
-   fprintf(stderr, " - Instructions: %zu\n",
-   prog->size / sizeof(struct bpf_insn));
+   fprintf(stderr, " - Instructions: %u (%u over limit)\n",
+   insns, insns > BPF_MAXINSNS ? insns - BPF_MAXINSNS : 0);
fprintf(stderr, " - License:  %s\n\n", prog->license);
 
bpf_dump_error(ctx, "Verifier analysis:\n\n");
@@ -1283,6 +1318,11 @@ static int bpf_fetch_strtab(struct bpf_elf_ctx *ctx, int 
section,
return 0;
 }
 
+static bool bpf_has_map_data(const struct bpf_elf_ctx *ctx)
+{
+   return ctx->sym_tab && ctx->str_tab && ctx->sec_maps;
+}
+
 static int 

[PATCH iproute2 -master 0/3] Minor tc/bpf updates

2016-04-08 Thread Daniel Borkmann
Some minor updates to improve error reporting, add signatures
and recently introduced map flags attribute. Set is against
master branch.

Thanks!

Daniel Borkmann (3):
  tc, bpf: add new csum and tunnel signatures
  tc, bpf: further improve error reporting
  tc, bpf: add support for map pre/allocation

 examples/bpf/bpf_cyclic.c   |  9 -
 examples/bpf/bpf_graft.c|  8 +++-
 examples/bpf/bpf_prog.c |  2 +
 examples/bpf/bpf_shared.c   |  8 +++-
 examples/bpf/bpf_tailcall.c | 29 --
 include/bpf_api.h   | 52 
 include/bpf_elf.h   |  1 +
 tc/tc_bpf.c | 98 +++--
 tc/tc_bpf.h |  4 ++
 9 files changed, 138 insertions(+), 73 deletions(-)

-- 
1.9.3



[PATCH iproute2 -master 3/3] tc, bpf: add support for map pre/allocation

2016-04-08 Thread Daniel Borkmann
Follow-up to kernel commit 6c9059817432 ("bpf: pre-allocate hash map
elements"). Add flags support, so that we can pass in BPF_F_NO_PREALLOC
flag for disallowing preallocation. Update examples accordingly and also
remove the BPF_* map helper macros from them as they were not very useful.

Signed-off-by: Daniel Borkmann 
---
 examples/bpf/bpf_cyclic.c   |  9 -
 examples/bpf/bpf_graft.c|  8 +++-
 examples/bpf/bpf_prog.c |  2 ++
 examples/bpf/bpf_shared.c   |  8 +++-
 examples/bpf/bpf_tailcall.c | 29 +
 include/bpf_api.h   | 45 -
 include/bpf_elf.h   |  1 +
 tc/tc_bpf.c | 16 
 8 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/examples/bpf/bpf_cyclic.c b/examples/bpf/bpf_cyclic.c
index 36745a3..11d1c06 100644
--- a/examples/bpf/bpf_cyclic.c
+++ b/examples/bpf/bpf_cyclic.c
@@ -6,7 +6,14 @@
  */
 #define JMP_MAP_ID 0xabccba
 
-BPF_PROG_ARRAY(jmp_tc, JMP_MAP_ID, PIN_OBJECT_NS, 1);
+struct bpf_elf_map __section_maps jmp_tc = {
+   .type   = BPF_MAP_TYPE_PROG_ARRAY,
+   .id = JMP_MAP_ID,
+   .size_key   = sizeof(uint32_t),
+   .size_value = sizeof(uint32_t),
+   .pinning= PIN_OBJECT_NS,
+   .max_elem   = 1,
+};
 
 __section_tail(JMP_MAP_ID, 0)
 int cls_loop(struct __sk_buff *skb)
diff --git a/examples/bpf/bpf_graft.c b/examples/bpf/bpf_graft.c
index 20784ff..07113d4 100644
--- a/examples/bpf/bpf_graft.c
+++ b/examples/bpf/bpf_graft.c
@@ -33,7 +33,13 @@
  *   [...]
  */
 
-BPF_PROG_ARRAY(jmp_tc, 0, PIN_GLOBAL_NS, 1);
+struct bpf_elf_map __section_maps jmp_tc = {
+   .type   = BPF_MAP_TYPE_PROG_ARRAY,
+   .size_key   = sizeof(uint32_t),
+   .size_value = sizeof(uint32_t),
+   .pinning= PIN_GLOBAL_NS,
+   .max_elem   = 1,
+};
 
 __section("aaa")
 int cls_aaa(struct __sk_buff *skb)
diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c
index f15e876..d6caf37 100644
--- a/examples/bpf/bpf_prog.c
+++ b/examples/bpf/bpf_prog.c
@@ -192,6 +192,7 @@ struct bpf_elf_map __section("maps") map_proto = {
.size_key   =   sizeof(uint8_t),
.size_value =   sizeof(struct count_tuple),
.max_elem   =   256,
+   .flags  =   BPF_F_NO_PREALLOC,
 };
 
 struct bpf_elf_map __section("maps") map_queue = {
@@ -200,6 +201,7 @@ struct bpf_elf_map __section("maps") map_queue = {
.size_key   =   sizeof(uint32_t),
.size_value =   sizeof(struct count_queue),
.max_elem   =   1024,
+   .flags  =   BPF_F_NO_PREALLOC,
 };
 
 struct bpf_elf_map __section("maps") map_drops = {
diff --git a/examples/bpf/bpf_shared.c b/examples/bpf/bpf_shared.c
index 7fe9ef3..21fe6f1 100644
--- a/examples/bpf/bpf_shared.c
+++ b/examples/bpf/bpf_shared.c
@@ -18,7 +18,13 @@
  * instance is being created.
  */
 
-BPF_ARRAY4(map_sh, 0, PIN_OBJECT_NS, 1); /* or PIN_GLOBAL_NS, or PIN_NONE */
+struct bpf_elf_map __section_maps map_sh = {
+   .type   = BPF_MAP_TYPE_ARRAY,
+   .size_key   = sizeof(uint32_t),
+   .size_value = sizeof(uint32_t),
+   .pinning= PIN_OBJECT_NS, /* or PIN_GLOBAL_NS, or PIN_NONE */
+   .max_elem   = 1,
+};
 
 __section("egress")
 int emain(struct __sk_buff *skb)
diff --git a/examples/bpf/bpf_tailcall.c b/examples/bpf/bpf_tailcall.c
index f545430..1a30426 100644
--- a/examples/bpf/bpf_tailcall.c
+++ b/examples/bpf/bpf_tailcall.c
@@ -26,10 +26,31 @@
  * classifier behaviour.
  */
 
-BPF_PROG_ARRAY(jmp_tc, FOO, PIN_OBJECT_NS, MAX_JMP_SIZE);
-BPF_PROG_ARRAY(jmp_ex, BAR, PIN_OBJECT_NS, 1);
-
-BPF_ARRAY4(map_sh, 0, PIN_OBJECT_NS, 1);
+struct bpf_elf_map __section_maps jmp_tc = {
+   .type   = BPF_MAP_TYPE_PROG_ARRAY,
+   .id = FOO,
+   .size_key   = sizeof(uint32_t),
+   .size_value = sizeof(uint32_t),
+   .pinning= PIN_OBJECT_NS,
+   .max_elem   = MAX_JMP_SIZE,
+};
+
+struct bpf_elf_map __section_maps jmp_ex = {
+   .type   = BPF_MAP_TYPE_PROG_ARRAY,
+   .id = BAR,
+   .size_key   = sizeof(uint32_t),
+   .size_value = sizeof(uint32_t),
+   .pinning= PIN_OBJECT_NS,
+   .max_elem   = 1,
+};
+
+struct bpf_elf_map __section_maps map_sh = {
+   .type   = BPF_MAP_TYPE_ARRAY,
+   .size_key   = sizeof(uint32_t),
+   .size_value = sizeof(uint32_t),
+   .pinning= PIN_OBJECT_NS,
+   .max_elem   = 1,
+};
 
 __section_tail(FOO, ENTRY_0)
 int cls_case1(struct __sk_buff *skb)
diff --git a/include/bpf_api.h b/include/bpf_api.h
index 0f278f0..1b250d2 100644
--- a/include/bpf_api.h
+++ b/include/bpf_api.h
@@ -99,51 +99,6 @@
char license[] __section_license = NAME
 #endif
 
-#ifndef __BPF_MAP
-# define __BPF_MAP(NAME, TYPE, 

[net-next][PATCH 1/2] RDS: fix endianness for dp_ack_seq

2016-04-08 Thread Santosh Shilimkar
From: Qing Huang 

dp->dp_ack_seq is used in big endian format. We need to do the
big endianness conversion when we assign a value in host format
to it.

Signed-off-by: Qing Huang 
Signed-off-by: Santosh Shilimkar 
---
 net/rds/ib_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 8764970..310cabc 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -194,7 +194,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection 
*conn,
dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
dp->dp_protocol_minor_mask = 
cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
-   dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+   dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
 
/* Advertise flow control */
if (ic->i_flowctl) {
-- 
1.9.1



[net-next][PATCH 2/2] RDS: Fix the atomicity for congestion map update

2016-04-08 Thread Santosh Shilimkar
Two different threads with different rds sockets may be in
rds_recv_rcvbuf_delta() via receive path. If their ports
both map to the same word in the congestion map, then
using non-atomic ops to update it could cause the map to
be incorrect. Lets use atomics to avoid such an issue.

Full credit to Wengang  for
finding the issue, analysing it and also pointing out
to offending code with spin lock based fix.

Reviewed-by: Leon Romanovsky 
Signed-off-by: Wengang Wang 
Signed-off-by: Santosh Shilimkar 
---
 net/rds/cong.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/cong.c b/net/rds/cong.c
index e6144b8..6641bcf 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -299,7 +299,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-   __set_bit_le(off, (void *)map->m_page_addrs[i]);
+   set_bit_le(off, (void *)map->m_page_addrs[i]);
 }
 
 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
@@ -313,7 +313,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 
port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-   __clear_bit_le(off, (void *)map->m_page_addrs[i]);
+   clear_bit_le(off, (void *)map->m_page_addrs[i]);
 }
 
 static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
-- 
1.9.1



[net-next][PATCH 0/2] RDS: couple of fixes for 4.6

2016-04-08 Thread Santosh Shilimkar
Patches are also available at below git tree. 

git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux.git 
for_4.6/net-next/rds-fixes

Qing Huang (1):
  RDS: fix endianness for dp_ack_seq

Santosh Shilimkar (1):
  RDS: Fix the atomicity for congestion map update

 net/rds/cong.c  | 4 ++--
 net/rds/ib_cm.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

-- 
1.9.1



[PATCH RFT 2/2] macb: kill PHY reset code

2016-04-08 Thread Sergei Shtylyov
With  the 'phylib' now  being aware of  the "reset-gpios" PHY node property,
there should be no need to frob the PHY reset in this  driver anymore...

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/ethernet/cadence/macb.c |   17 -
 drivers/net/ethernet/cadence/macb.h |1 -
 2 files changed, 18 deletions(-)

Index: net-next/drivers/net/ethernet/cadence/macb.c
===
--- net-next.orig/drivers/net/ethernet/cadence/macb.c
+++ net-next/drivers/net/ethernet/cadence/macb.c
@@ -2884,7 +2884,6 @@ static int macb_probe(struct platform_de
  = macb_clk_init;
int (*init)(struct platform_device *) = macb_init;
struct device_node *np = pdev->dev.of_node;
-   struct device_node *phy_node;
const struct macb_config *macb_config = NULL;
struct clk *pclk, *hclk = NULL, *tx_clk = NULL;
unsigned int queue_mask, num_queues;
@@ -2977,18 +2976,6 @@ static int macb_probe(struct platform_de
else
macb_get_hwaddr(bp);
 
-   /* Power up the PHY if there is a GPIO reset */
-   phy_node =  of_get_next_available_child(np, NULL);
-   if (phy_node) {
-   int gpio = of_get_named_gpio(phy_node, "reset-gpios", 0);
-
-   if (gpio_is_valid(gpio)) {
-   bp->reset_gpio = gpio_to_desc(gpio);
-   gpiod_direction_output(bp->reset_gpio, 1);
-   }
-   }
-   of_node_put(phy_node);
-
err = of_get_phy_mode(np);
if (err < 0) {
pdata = dev_get_platdata(>dev);
@@ -3054,10 +3041,6 @@ static int macb_remove(struct platform_d
mdiobus_unregister(bp->mii_bus);
mdiobus_free(bp->mii_bus);
 
-   /* Shutdown the PHY if there is a GPIO reset */
-   if (bp->reset_gpio)
-   gpiod_set_value(bp->reset_gpio, 0);
-
unregister_netdev(dev);
clk_disable_unprepare(bp->tx_clk);
clk_disable_unprepare(bp->hclk);
Index: net-next/drivers/net/ethernet/cadence/macb.h
===
--- net-next.orig/drivers/net/ethernet/cadence/macb.h
+++ net-next/drivers/net/ethernet/cadence/macb.h
@@ -832,7 +832,6 @@ struct macb {
unsigned intdma_burst_length;
 
phy_interface_t phy_interface;
-   struct gpio_desc*reset_gpio;
 
/* AT91RM9200 transmit */
struct sk_buff *skb;/* holds skb until xmit 
interrupt completes */



[PATCH RFT 1/2] phylib: add device reset GPIO support

2016-04-08 Thread Sergei Shtylyov
The PHY  devices sometimes do have their reset signal (maybe even power
supply?) tied to some GPIO and sometimes it also does happen that a boot
loader does not leave it deasserted. So far this issue has been attacked
from (as I believe) a wrong angle: by teaching the MAC driver to manipulate
the GPIO in question;  that solution, when  applied to the device trees,
led to adding the PHY reset GPIO properties to the MAC device node, with
one exception: Cadence MACB driver which could handle the "reset-gpios"
prop  in a PHY device  subnode.  I believe that the correct approach is to
teach the 'phylib' to get the MDIO device reset GPIO from the device tree
node corresponding to this device -- which this patch is doing...

Note that I had to modify the  AT803x PHY driver as it would stop working
otherwise as it made use of the reset GPIO for its own purposes...

Signed-off-by: Sergei Shtylyov 

---
 Documentation/devicetree/bindings/net/phy.txt |2 +
 drivers/net/phy/at803x.c  |   19 ++
 drivers/net/phy/mdio_bus.c|4 +++
 drivers/net/phy/mdio_device.c |   27 +++--
 drivers/net/phy/phy_device.c  |   33 --
 drivers/of/of_mdio.c  |   16 
 include/linux/mdio.h  |3 ++
 include/linux/phy.h   |5 +++
 8 files changed, 89 insertions(+), 20 deletions(-)

Index: net-next/Documentation/devicetree/bindings/net/phy.txt
===
--- net-next.orig/Documentation/devicetree/bindings/net/phy.txt
+++ net-next/Documentation/devicetree/bindings/net/phy.txt
@@ -35,6 +35,8 @@ Optional Properties:
 - broken-turn-around: If set, indicates the PHY device does not correctly
   release the turn around line low at the end of a MDIO transaction.
 
+- reset-gpios: The GPIO phandle and specifier for the PHY reset signal.
+
 Example:
 
 ethernet-phy@0 {
Index: net-next/drivers/net/phy/at803x.c
===
--- net-next.orig/drivers/net/phy/at803x.c
+++ net-next/drivers/net/phy/at803x.c
@@ -65,7 +65,6 @@ MODULE_LICENSE("GPL");
 
 struct at803x_priv {
bool phy_reset:1;
-   struct gpio_desc *gpiod_reset;
 };
 
 struct at803x_context {
@@ -271,22 +270,10 @@ static int at803x_probe(struct phy_devic
 {
struct device *dev = >mdio.dev;
struct at803x_priv *priv;
-   struct gpio_desc *gpiod_reset;
 
priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
-
-   if (phydev->drv->phy_id != ATH8030_PHY_ID)
-   goto does_not_require_reset_workaround;
-
-   gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
-   if (IS_ERR(gpiod_reset))
-   return PTR_ERR(gpiod_reset);
-
-   priv->gpiod_reset = gpiod_reset;
-
-does_not_require_reset_workaround:
phydev->priv = priv;
 
return 0;
@@ -361,14 +348,14 @@ static void at803x_link_change_notify(st
 */
if (phydev->drv->phy_id == ATH8030_PHY_ID) {
if (phydev->state == PHY_NOLINK) {
-   if (priv->gpiod_reset && !priv->phy_reset) {
+   if (phydev->mdio.reset && !priv->phy_reset) {
struct at803x_context context;
 
at803x_context_save(phydev, );
 
-   gpiod_set_value(priv->gpiod_reset, 1);
+   phy_device_reset(phydev, 1);
msleep(1);
-   gpiod_set_value(priv->gpiod_reset, 0);
+   phy_device_reset(phydev, 0);
msleep(1);
 
at803x_context_restore(phydev, );
Index: net-next/drivers/net/phy/mdio_bus.c
===
--- net-next.orig/drivers/net/phy/mdio_bus.c
+++ net-next/drivers/net/phy/mdio_bus.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -371,6 +372,9 @@ void mdiobus_unregister(struct mii_bus *
if (!mdiodev)
continue;
 
+   if (mdiodev->reset)
+   gpiod_put(mdiodev->reset);
+
mdiodev->device_remove(mdiodev);
mdiodev->device_free(mdiodev);
}
Index: net-next/drivers/net/phy/mdio_device.c
===
--- net-next.orig/drivers/net/phy/mdio_device.c
+++ net-next/drivers/net/phy/mdio_device.c
@@ -12,6 +12,8 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -103,6 +105,13 @@ void mdio_device_remove(struct mdio_devi
 }
 EXPORT_SYMBOL(mdio_device_remove);

[PATCH RFT 0/2] Teach phylib hard-resetting devices

2016-04-08 Thread Sergei Shtylyov
Hello.

   Here's the set of 2 patches against DaveM's 'net-next.git' repo. They add to
'phylib' support for resetting devices via GPIO and do some clean up after
doing that...

[1/2] phylib: add device reset GPIO support
[2/2] macb: kill PHY reset code

MBR, Sergei



Re: [PATCH net-next 1/8] perf: optimize perf_fetch_caller_regs

2016-04-08 Thread Steven Rostedt
On Tue, 5 Apr 2016 14:06:26 +0200
Peter Zijlstra  wrote:

> On Mon, Apr 04, 2016 at 09:52:47PM -0700, Alexei Starovoitov wrote:
> > avoid memset in perf_fetch_caller_regs, since it's the critical path of all 
> > tracepoints.
> > It's called from perf_sw_event_sched, perf_event_task_sched_in and all of 
> > perf_trace_##call
> > with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by 
> > perpcu_alloc  
> 
> Its not actually allocated; but because its a static uninitialized
> variable we get .bss like behaviour and the initial value is copied to
> all CPUs when the per-cpu allocator thingy bootstraps SMP IIRC.
> 
> > and
> > subsequent call to perf_arch_fetch_caller_regs initializes the same fields 
> > on all archs,
> > so we can safely drop memset from all of the above cases and   
> 
> Indeed.
> 
> > move it into
> > perf_ftrace_function_call that calls it with stack allocated pt_regs.  
> 
> Hmm, is there a reason that's still on-stack instead of using the
> per-cpu thing, Steve?

Well, what do you do when you are tracing with regs in an interrupt
that already set the per cpu regs field? We could create our own
per-cpu one as well, but then that would require checking which level
we are in, as we can have one for normal context, one for softirq
context, one for irq context and one for nmi context.

-- Steve



> 
> > Signed-off-by: Alexei Starovoitov   
> 
> In any case,
> 
> Acked-by: Peter Zijlstra (Intel) 



Re: [net-next PATCH 2/5] GSO: Add GSO type for fixed IPv4 ID

2016-04-08 Thread Alexander Duyck
On Fri, Apr 8, 2016 at 2:41 PM, Jesse Gross  wrote:
> On Fri, Apr 8, 2016 at 5:33 PM, Alexander Duyck  wrote:
>> This patch adds support for TSO using IPv4 headers with a fixed IP ID
>> field.  This is meant to allow us to do a lossless GRO in the case of TCP
>> flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
>> headers.
>>
>> In addition I am adding a feature that for now I am referring to TSO with
>> IP ID mangling.  Basically when this flag is enabled the device has the
>> option to either output the flow with incrementing IP IDs or with a fixed
>> IP ID regardless of what the original IP ID ordering was.  This is useful
>> in cases where the DF bit is set and we do not care if the original IP ID
>> value is maintained.
>>
>> Signed-off-by: Alexander Duyck 
>
> I think SKB_GSO_TCP_FIXEDID would also need to be added to the list of
> enumerated OK GSO types for MPLS GSO.

I'll have that fixed for v2.

- Alex


Re: [PATCH v4] route: do not cache fib route info on local routes with oif

2016-04-08 Thread Julian Anastasov

Hello,

On Fri, 8 Apr 2016, Chris Friesen wrote:

> For local routes that require a particular output interface we do not want
> to cache the result.  Caching the result causes incorrect behaviour when
> there are multiple source addresses on the interface.  The end result
> being that if the intended recipient is waiting on that interface for the
> packet he won't receive it because it will be delivered on the loopback
> interface and the IP_PKTINFO ipi_ifindex will be set to the loopback
> interface as well.
> 
> This can be tested by running a program such as "dhcp_release" which
> attempts to inject a packet on a particular interface so that it is
> received by another program on the same board.  The receiving process
> should see an IP_PKTINFO ipi_ifindex value of the source interface
> (e.g., eth1) instead of the loopback interface (e.g., lo).  The packet
> will still appear on the loopback interface in tcpdump but the important
> aspect is that the CMSG info is correct.
> 
> Sample dhcp_release command line:
> 
>dhcp_release eth1 192.168.204.222 02:11:33:22:44:66
> 
> Signed-off-by: Allain Legacy 
> Signed-off-by: Chris Friesen 

Looks good to me.

Reviewed-by: Julian Anastasov 

> ---
>  net/ipv4/route.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 02c6229..b050cf9 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct 
> fib_result *res,
>*/
>   if (fi && res->prefixlen < 4)
>   fi = NULL;
> + } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> +(orig_oif != dev_out->ifindex)) {
> + /* For local routes that require a particular output interface
> +  * we do not want to cache the result.  Caching the result
> +  * causes incorrect behaviour when there are multiple source
> +  * addresses on the interface, the end result being that if the
> +  * intended recipient is waiting on that interface for the
> +  * packet he won't receive it because it will be delivered on
> +  * the loopback interface and the IP_PKTINFO ipi_ifindex will
> +  * be set to the loopback interface as well.
> +  */
> + fi = NULL;
>   }
>  
>   fnhe = NULL;

Regards

--
Julian Anastasov 


Re: [RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO

2016-04-08 Thread Alexander Duyck
On Fri, Apr 8, 2016 at 2:40 PM, Jesse Gross  wrote:
> On Thu, Apr 7, 2016 at 8:52 PM, Alexander Duyck
>  wrote:
>> Just a thought.  What if I replaced NETIF_F_TSO_FIXEDID with something
> that meant we could mangle the IP ID like a NETIF_F_TSO_IPID_MANGLE
>> (advice for better name welcome).  Instead of the feature flag meaning
>> we are going to transmit packets with a fixed ID it would mean we
>> don't care about the ID and are free to mangle it as we see fit.  The
>> GSO type can retain the same meaning as far as that requiring the same
>> ID for all, but the feature would mean we will take fixed and convert
>> it to incrementing, or incrementing and convert it to fixed.
>
> I saw the new version of the code that you posted with this idea and
> now that I understand it better, it seems like a reasonable choice to
> me - it's nice that it is consistent with GRO and not tunnel specific.
> It also makes behavior consistent across drivers in regards to
> incrementing IDs in the default case, which was one of my concerns
> from before.
>
> Maybe I missed it but I didn't see any checks for the DF bit being set
> when we transmit a packet with NETIF_F_TSO_MANGLEID. Even if I am
> comfortable mangling my IDs in the DF case, I don't think this would
> ever extend to non-DF packets. In the documentation you noted that it
> is the driver's responsibility to do this check but I couldn't find it
> in either ixgbe or igb. It would also be nice if the core stack could
> enforce it somehow as well rather than each driver.

Yeah I had glossed over that in the igb and ixgbe patches.  A check is
only really needed for the incrementing to non-incrementing case and I
wasn't sure how common it was to have TCP with an IP header that
didn't set the DF bit.  In the case of the outer headers igb and ixgbe
will increment the IP ID always so we don't have to worry about if DF
is set of not there.  For the inner headers I had fudged it a bit and
didn't add the validation.  If needed I can see about adding that
shortly.

- Alex


Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-08 Thread Timur Tabi

Vikram Sethi wrote:


On the FSM9900 SOC (which uses device-tree), the two pins that connect to the external 
PHY are gpio pins.  However, the driver needs to reprogram the pinmux so that those pins 
are wired to the Emac controller.  That's what the gpio code in this driver is doing: 
it's just configuring the pins so that they connect directly between the Emac and the 
external PHY.  After that, they are no longer GPIO pins, and you cannot use the 
"GPIO controlled MDIO bus".  There is no MDIO controller on the SOC.  The 
external PHY is controlled directly from the Emac and also from the internal PHY.  It is 
screwy, I know, but that's what Gilad was trying to explain.

It is incorrect to say there's no MDIO controller on the SoC. The EMAC Core on 
the SoC itself has a MDIO controller which talks to the external PHY. The 
internal SGMII is not on MDIO however.
Please see the EMAC specification.


Sorry, I should have said that there is no *independent* MDIO controller 
(one that has its own driver).  As you said, you can only talk to the 
external PHY through the Emac.


--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum, a Linux Foundation collaborative project.


Re: [net-next PATCH 2/5] GSO: Add GSO type for fixed IPv4 ID

2016-04-08 Thread Jesse Gross
On Fri, Apr 8, 2016 at 5:33 PM, Alexander Duyck  wrote:
> This patch adds support for TSO using IPv4 headers with a fixed IP ID
> field.  This is meant to allow us to do a lossless GRO in the case of TCP
> flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
> headers.
>
> In addition I am adding a feature that for now I am referring to TSO with
> IP ID mangling.  Basically when this flag is enabled the device has the
> option to either output the flow with incrementing IP IDs or with a fixed
> IP ID regardless of what the original IP ID ordering was.  This is useful
> in cases where the DF bit is set and we do not care if the original IP ID
> value is maintained.
>
> Signed-off-by: Alexander Duyck 

I think SKB_GSO_TCP_FIXEDID would also need to be added to the list of
enumerated OK GSO types for MPLS GSO.


Re: [RFC PATCH 07/11] GENEVE: Add option to mangle IP IDs on inner headers when using TSO

2016-04-08 Thread Jesse Gross
On Thu, Apr 7, 2016 at 8:52 PM, Alexander Duyck
 wrote:
> Just a thought.  What if I replaced NETIF_F_TSO_FIXEDID with something
> that meant we could mangle the IP ID like a NETIF_F_TSO_IPID_MANGLE
> (advice for better name welcome).  Instead of the feature flag meaning
> we are going to transmit packets with a fixed ID it would mean we
> don't care about the ID and are free to mangle it as we see fit.  The
> GSO type can retain the same meaning as far as that requiring the same
> ID for all, but the feature would mean we will take fixed and convert
> it to incrementing, or incrementing and convert it to fixed.

I saw the new version of the code that you posted with this idea and
now that I understand it better, it seems like a reasonable choice to
me - it's nice that it is consistent with GRO and not tunnel specific.
It also makes behavior consistent across drivers in regards to
incrementing IDs in the default case, which was one of my concerns
from before.

Maybe I missed it but I didn't see any checks for the DF bit being set
when we transmit a packet with NETIF_F_TSO_MANGLEID. Even if I am
comfortable mangling my IDs in the DF case, I don't think this would
ever extend to non-DF packets. In the documentation you noted that it
is the driver's responsibility to do this check but I couldn't find it
in either ixgbe or igb. It would also be nice if the core stack could
enforce it somehow as well rather than each driver.


Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-08 Thread Alexei Starovoitov
On Fri, Apr 08, 2016 at 10:08:08PM +0200, Jesper Dangaard Brouer wrote:
> On Fri, 8 Apr 2016 10:26:53 -0700
> Alexei Starovoitov  wrote:
> 
> > On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > > 
> > > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer 
> > >  wrote:
> > >   
> > > > > +/* user return codes for PHYS_DEV prog type */
> > > > > +enum bpf_phys_dev_action {
> > > > > + BPF_PHYS_DEV_DROP,
> > > > > + BPF_PHYS_DEV_OK,
> > > > > +};
> > > > 
> > > > I can imagine these extra return codes:
> > > > 
> > > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > > >  BPF_PHYS_DEV_STOLEN, /* E.g. forward use-case */
> > > >  BPF_PHYS_DEV_SHARED, /* Queue for async processing, e.g. tcpdump 
> > > > use-case */
> > > > 
> > > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > > which we can look at when we get that far...  
> > > 
> > > I want to point out something which is quite FUNDAMENTAL, for
> > > understanding these return codes (and network stack).
> > > 
> > > 
> > > At driver RX time, the network stack basically have two ways of
> > > building an SKB, which is send up the stack.
> > > 
> > > Option-A (fastest): The packet page is writable. The SKB can be
> > > allocated and skb->data/head can point directly to the page.  And
> > > we place/write skb_shared_info in the end/tail-room. (This is done by
> > > calling build_skb()).
> > > 
> > > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > > skb->data/head directly to the page, because skb_shared_info need to be
> > > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > > get around this, a separate piece of memory is allocated (speedup by
> > > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > > be written. (This is done when calling netdev/napi_alloc_skb()).
> > >   Drivers then need to copy over packet headers, and assign + adjust
> > > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > > 
> > > 
> > > Unfortunately most drivers use option-B.  Due to cost of calling the
> > > page allocator.  It is only slightly most expensive to get a larger
> > > compound page from the page allocator, which then can be partitioned into
> > > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > > cost is added later, when constructing the SKB.
> > >  Another reason for option-B, is that archs with expensive IOMMU
> > > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > > but only on the compound page level.
> > > 
> > > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > > for option-B, when copying header data anyhow. For small packet, one
> > > might as well free (or recycle) the RX page, if header size fits into
> > > the newly allocated memory (for skb_shared_info).  
> > 
> > I think you guys are going into overdesign territory, so
> > . nack on read-only pages
> 
> Unfortunately you cannot just ignore or nack read-only pages. They are
> a fact in the current drivers.
> 
> Most drivers today (at-least the ones we care about) only deliver
> read-only pages.  If you don't accept read-only pages day-1, then you
> first have to rewrite a lot of drivers... and that will stall the
> project!  How will you deal with this fact?
> 
> The early drop filter use-case in this patchset, can ignore read-only
> pages.  But ABI wise we need to deal with the future case where we do
> need/require writeable pages.  A simple need-writable pages in the API
> could help us move forward.

the program should never need to worry about whether dma buffer is
writeable or not. Complicating drivers, api, abi, usability
for the single use case of fast packet drop is not acceptable.
XDP is not going to be a fit for all drivers and all architectures.
That is the crucial 'performance vs generality' aspect of the design.
All kernel-bypasses are taking advantage of specific architecture.
We have to take advantage of it as well. If it doesn't fit
powerpc with iommu, so be it. XDP will return -enotsupp.
That is fundamental point. We have to cut such corners and avoid
all cases where unnecessary generality hurts performance.
Read-only pages is clearly such thing.

> > The whole thing must be dead simple to use. Above is not simple by any 
> > means.
> 
> Maybe you missed that the above was a description of how the current
> network stack handles this, which is not simple... which is root of the
> hole performance issue.

Disagree. The stack has copy-break, gro, gso and everything else because
it's serving _host_ use case. XDP is packet forwarder use case.
The requirements are completely different. Ex. the host needs gso
in the core and drivers. It needs to deliver data all the way
to user space and back. That is hard and that's where complexity
comes from. For packet forwarder none of it is needed. So 

[PATCH v4] route: do not cache fib route info on local routes with oif

2016-04-08 Thread Chris Friesen
For local routes that require a particular output interface we do not want
to cache the result.  Caching the result causes incorrect behaviour when
there are multiple source addresses on the interface.  The end result
being that if the intended recipient is waiting on that interface for the
packet he won't receive it because it will be delivered on the loopback
interface and the IP_PKTINFO ipi_ifindex will be set to the loopback
interface as well.

This can be tested by running a program such as "dhcp_release" which
attempts to inject a packet on a particular interface so that it is
received by another program on the same board.  The receiving process
should see an IP_PKTINFO ipi_ifindex value of the source interface
(e.g., eth1) instead of the loopback interface (e.g., lo).  The packet
will still appear on the loopback interface in tcpdump but the important
aspect is that the CMSG info is correct.

Sample dhcp_release command line:

   dhcp_release eth1 192.168.204.222 02:11:33:22:44:66

Signed-off-by: Allain Legacy 
Signed-off-by: Chris Friesen 
---
 net/ipv4/route.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c6229..b050cf9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct 
fib_result *res,
 */
if (fi && res->prefixlen < 4)
fi = NULL;
+   } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+  (orig_oif != dev_out->ifindex)) {
+   /* For local routes that require a particular output interface
+* we do not want to cache the result.  Caching the result
+* causes incorrect behaviour when there are multiple source
+* addresses on the interface, the end result being that if the
+* intended recipient is waiting on that interface for the
+* packet he won't receive it because it will be delivered on
+* the loopback interface and the IP_PKTINFO ipi_ifindex will
+* be set to the loopback interface as well.
+*/
+   fi = NULL;
}
 
fnhe = NULL;



Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-08 Thread Vikram Sethi
On 04/08/2016 02:06 PM, Timur Tabi wrote:
> Andrew Lunn wrote:
>
>> There are two different things here. One is configuring the pin to be
>> a GPIO. The second is using the GPIO as a GPIO. In this case,
>> bit-banging the MDIO bus.
>>
>> The firmware could be doing the configuration, setting the pin as a
>> GPIO. However, the firmware cannot be doing the MDIO bit-banging to
>> make an MDIO bus available. Linux has to do that.
>>
>> Or it could be we have all completely misunderstood the hardware, and
>> we are not doing bit-banging GPIO MDIO. There is a real MDIO
>> controller there, we don't use these pins as GPIOs, etc
>
> Actually, I think there is a misunderstanding.
>
> On the FSM9900 SOC (which uses device-tree), the two pins that connect to the 
> external PHY are gpio pins.  However, the driver needs to reprogram the 
> pinmux so that those pins are wired to the Emac controller.  That's what the 
> the gpio code in this driver is doing: it's just configuring the pins so that 
> they connect directly between the Emac and the external PHY.  After that, 
> they are no longer GPIO pins, and you cannot use the "GPIO controlled MDIO 
> bus".  There is no MDIO controller on the SOC.  The external PHY is 
> controlled directly from the Emac and also from the internal PHY.  It is 
> screwy, I know, but that's what Gilad was trying to explain.
It is incorrect to say there's no MDIO controller on the SoC. The EMAC Core on 
the SoC itself has a MDIO controller which talks to the external PHY. The 
internal SGMII is not on MDIO however.
Please see the EMAC specification.
>
> On the QDF2432 (which uses ACPI), those two wires are now dedicated. There 
> are not muxed GPIOs any more -- they are hard wired between Emac and the 
> external PHY.
>
> In both cases, you need to use Emac registers to communicate with the 
> external PHY.  Stuff like link detect and link speed are configured by 
> programming the Emac and/or the internal phy.
You need to use EMAC *MDIO* registers to communicate with external PHY.
>
> And the internal phy isn't really an internal phy.  It's an SGMII-like device 
> that's connected to the Emac and handles various phy-related tasks.  It has 
> its own register block, but you still have to program it in concert with the 
> Emac.  You can't really treat it separately.
>
> So I'm beginning to believe that Gilad's driver is actually correct as-is.  
> There are a few minor bug fixes, but in general it's correct.  I would like 
> to post a V4 soon that has those minor fixes.
>


-- 
Vikram Sethi
Qualcomm Technologies Inc, on behalf of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux 
Foundation Collaborative Project



[next-queue PATCH 1/3] i40e/i40evf: Add support for GSO partial with UDP_TUNNEL_CSUM and GRE_CSUM

2016-04-08 Thread Alexander Duyck
This patch makes it so that i40e and i40evf can use GSO_PARTIAL to support
segmentation for frames with checksums enabled in outer headers.  As a
result we can now send data over these types of tunnels at over 20Gb/s
versus the 12Gb/s that was previously possible on my system.

The advantage with the i40e parts is that this offload is mostly
transparent as the hardware still deals with the inner and/or outer IPv4
headers so the IP ID is still incrementing for both when this offload is
performed.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |   10 --
 drivers/net/ethernet/intel/i40e/i40e_txrx.c |7 ++-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c   |7 ++-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c |   10 --
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 07a70c4ac49f..6342fab4d177 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9119,20 +9119,25 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
   NETIF_F_TSO_ECN  |
   NETIF_F_TSO6 |
   NETIF_F_GSO_GRE  |
+  NETIF_F_GSO_GRE_CSUM |
   NETIF_F_GSO_IPIP |
   NETIF_F_GSO_SIT  |
   NETIF_F_GSO_UDP_TUNNEL   |
   NETIF_F_GSO_UDP_TUNNEL_CSUM  |
+  NETIF_F_GSO_PARTIAL  |
   NETIF_F_SCTP_CRC |
   NETIF_F_RXHASH   |
   NETIF_F_RXCSUM   |
   0;
 
if (!(pf->flags & I40E_FLAG_OUTER_UDP_CSUM_CAPABLE))
-   netdev->hw_enc_features ^= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+   netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+
+   netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM;
 
/* record features VLANs can make use of */
-   netdev->vlan_features |= netdev->hw_enc_features;
+   netdev->vlan_features |= netdev->hw_enc_features |
+NETIF_F_TSO_MANGLEID;
 
if (!(pf->flags & I40E_FLAG_MFP_ENABLED))
netdev->hw_features |= NETIF_F_NTUPLE;
@@ -9142,6 +9147,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
   NETIF_F_HW_VLAN_CTAG_RX;
 
netdev->features |= netdev->hw_features | NETIF_F_HW_VLAN_CTAG_FILTER;
+   netdev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
 
if (vsi->type == I40E_VSI_MAIN) {
SET_NETDEV_DEV(netdev, &pf->pdev->dev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 6e44cf118843..ede4183468b9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2300,11 +2300,15 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, 
u64 *cd_type_cmd_tso_mss)
}
 
if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+SKB_GSO_GRE_CSUM |
 SKB_GSO_IPIP |
 SKB_GSO_SIT |
 SKB_GSO_UDP_TUNNEL |
 SKB_GSO_UDP_TUNNEL_CSUM)) {
-   if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) {
+   if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+   (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
+   l4.udp->len = 0;
+
/* determine offset of outer transport header */
l4_offset = l4.hdr - skb->data;
 
@@ -2481,6 +2485,7 @@ static int i40e_tx_enable_csum(struct sk_buff *skb, u32 
*tx_flags,
 
/* indicate if we need to offload outer UDP header */
if ((*tx_flags & I40E_TX_FLAGS_TSO) &&
+   !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
tunnel |= I40E_TXD_CTX_QW0_L4T_CS_MASK;
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index f101895ecf4a..6ce00547c13e 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1565,11 +1565,15 @@ static int i40e_tso(struct sk_buff *skb, u8 *hdr_len, 
u64 *cd_type_cmd_tso_mss)
}
 
if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
+

[next-queue PATCH 2/3] ixgbe/ixgbevf: Add support for GSO partial

2016-04-08 Thread Alexander Duyck
This patch adds support for partial GSO segmentation in the case of
tunnels.  Specifically with this change the driver can perform segmentation
as long as the frame either has IPv6 inner headers, or we are allowed to
mangle the IP IDs on the inner header.  This is needed because we will not
be modifying any fields from the start of the outer transport
header to the start of the inner transport header as we are treating them
like they are just a block of IP options.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  105 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  123 -
 2 files changed, 172 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 974aa7ca7a12..5134cb97f33c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7216,9 +7216,18 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 struct ixgbe_tx_buffer *first,
 u8 *hdr_len)
 {
+   u32 vlan_macip_lens, type_tucmd, mss_l4len_idx;
struct sk_buff *skb = first->skb;
-   u32 vlan_macip_lens, type_tucmd;
-   u32 mss_l4len_idx, l4len;
+   union {
+   struct iphdr *v4;
+   struct ipv6hdr *v6;
+   unsigned char *hdr;
+   } ip;
+   union {
+   struct tcphdr *tcp;
+   unsigned char *hdr;
+   } l4;
+   u32 paylen, l4_offset;
int err;
 
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -7231,46 +7240,52 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
if (err < 0)
return err;
 
+   ip.hdr = skb_network_header(skb);
+   l4.hdr = skb_checksum_start(skb);
+
/* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */
type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_TCP;
 
-   if (first->protocol == htons(ETH_P_IP)) {
-   struct iphdr *iph = ip_hdr(skb);
-   iph->tot_len = 0;
-   iph->check = 0;
-   tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr,
-iph->daddr, 0,
-IPPROTO_TCP,
-0);
+   /* initialize outer IP header fields */
+   if (ip.v4->version == 4) {
+   /* IP header will have to cancel out any data that
+* is not a part of the outer IP header
+*/
+   ip.v4->check = csum_fold(csum_add(lco_csum(skb),
+ csum_unfold(l4.tcp->check)));
type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4;
+
+   ip.v4->tot_len = 0;
first->tx_flags |= IXGBE_TX_FLAGS_TSO |
   IXGBE_TX_FLAGS_CSUM |
   IXGBE_TX_FLAGS_IPV4;
-   } else if (skb_is_gso_v6(skb)) {
-   ipv6_hdr(skb)->payload_len = 0;
-   tcp_hdr(skb)->check =
-   ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
-&ipv6_hdr(skb)->daddr,
-0, IPPROTO_TCP, 0);
+   } else {
+   ip.v6->payload_len = 0;
first->tx_flags |= IXGBE_TX_FLAGS_TSO |
   IXGBE_TX_FLAGS_CSUM;
}
 
-   /* compute header lengths */
-   l4len = tcp_hdrlen(skb);
-   *hdr_len = skb_transport_offset(skb) + l4len;
+   /* determine offset of inner transport header */
+   l4_offset = l4.hdr - skb->data;
+
+   /* compute length of segmentation header */
+   *hdr_len = (l4.tcp->doff * 4) + l4_offset;
+
+   /* remove payload length from inner checksum */
+   paylen = skb->len - l4_offset;
+   csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
 
/* update gso size and bytecount with header size */
first->gso_segs = skb_shinfo(skb)->gso_segs;
first->bytecount += (first->gso_segs - 1) * *hdr_len;
 
/* mss_l4len_id: use 0 as index for TSO */
-   mss_l4len_idx = l4len << IXGBE_ADVTXD_L4LEN_SHIFT;
+   mss_l4len_idx = (*hdr_len - l4_offset) << IXGBE_ADVTXD_L4LEN_SHIFT;
mss_l4len_idx |= skb_shinfo(skb)->gso_size << IXGBE_ADVTXD_MSS_SHIFT;
 
/* vlan_macip_lens: HEADLEN, MACLEN, VLAN tag */
-   vlan_macip_lens = skb_network_header_len(skb);
-   vlan_macip_lens |= skb_network_offset(skb) << IXGBE_ADVTXD_MACLEN_SHIFT;
+   vlan_macip_lens = l4.hdr - ip.hdr;
+   vlan_macip_lens |= (ip.hdr - skb->data) << IXGBE_ADVTXD_MACLEN_SHIFT;
vlan_macip_lens |= first->tx_flags & IXGBE_TX_FLAGS_VLAN_MASK;
 
ixgbe_tx_ctxtdesc(tx_ring, vlan_macip_lens, 0, type_tucmd,
@@ -8614,6 +8629,14 @@ static int ixgbe_set_features(struct net_device *netdev,
if 

[next-queue PATCH 3/3] igb/igbvf: Add support for GSO partial

2016-04-08 Thread Alexander Duyck
This patch adds support for partial GSO segmentation in the case of
tunnels.  Specifically with this change the driver can perform segmentation
as long as the frame either has IPv6 inner headers, or we are allowed to
mangle the IP IDs on the inner header.  This is needed because we will not
be modifying any fields from the start of the outer transport
header to the start of the inner transport header as we are treating them
like they are just a block of IP options.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/igb/igb_main.c |  112 ++-
 drivers/net/ethernet/intel/igbvf/netdev.c |  173 ++---
 2 files changed, 191 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 8e96c35307fb..998a24611246 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2055,6 +2055,14 @@ static int igb_set_features(struct net_device *netdev,
if (changed & NETIF_F_HW_VLAN_CTAG_RX)
igb_vlan_mode(netdev, features);
 
+   /* We can only support IPV4 TSO in tunnels if we can mangle the
+* inner IP ID field, so strip TSO if MANGLEID is not supported.
+*/
+   if (features & NETIF_F_TSO_MANGLEID)
+   netdev->hw_enc_features |= NETIF_F_TSO;
+   else
+   netdev->hw_enc_features &= ~NETIF_F_TSO;
+
if (!(changed & (NETIF_F_RXALL | NETIF_F_NTUPLE)))
return 0;
 
@@ -2087,6 +2095,29 @@ static int igb_ndo_fdb_add(struct ndmsg *ndm, struct 
nlattr *tb[],
return ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, flags);
 }
 
+#define IGB_MAX_MAC_HDR_LEN127
+#define IGB_MAX_NETWORK_HDR_LEN511
+
+static netdev_features_t
+igb_features_check(struct sk_buff *skb, struct net_device *dev,
+  netdev_features_t features)
+{
+   unsigned int network_hdr_len, mac_hdr_len;
+
+   /* Make certain the headers can be described by a context descriptor */
+   mac_hdr_len = skb_network_header(skb) - skb->data;
+   network_hdr_len = skb_checksum_start(skb) - skb_network_header(skb);
+   if (unlikely((mac_hdr_len > IGB_MAX_MAC_HDR_LEN) ||
+(network_hdr_len >  IGB_MAX_NETWORK_HDR_LEN)))
+   return features & ~(NETIF_F_HW_CSUM |
+   NETIF_F_SCTP_CRC |
+   NETIF_F_HW_VLAN_CTAG_TX |
+   NETIF_F_TSO |
+   NETIF_F_TSO6);
+
+   return features;
+}
+
 static const struct net_device_ops igb_netdev_ops = {
.ndo_open   = igb_open,
.ndo_stop   = igb_close,
@@ -2111,7 +2142,7 @@ static const struct net_device_ops igb_netdev_ops = {
.ndo_fix_features   = igb_fix_features,
.ndo_set_features   = igb_set_features,
.ndo_fdb_add= igb_ndo_fdb_add,
-   .ndo_features_check = passthru_features_check,
+   .ndo_features_check = igb_features_check,
 };
 
 /**
@@ -2384,6 +2415,16 @@ static int igb_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
if (hw->mac.type >= e1000_82576)
netdev->features |= NETIF_F_SCTP_CRC;
 
+#define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
+ NETIF_F_GSO_GRE_CSUM | \
+ NETIF_F_GSO_IPIP | \
+ NETIF_F_GSO_SIT | \
+ NETIF_F_GSO_UDP_TUNNEL | \
+ NETIF_F_GSO_UDP_TUNNEL_CSUM)
+
+   netdev->gso_partial_features = IGB_GSO_PARTIAL_FEATURES;
+   netdev->features |= NETIF_F_GSO_PARTIAL | IGB_GSO_PARTIAL_FEATURES;
+
/* copy netdev features into list of user selectable features */
netdev->hw_features |= netdev->features;
netdev->hw_features |= NETIF_F_RXALL;
@@ -2396,19 +2437,22 @@ static int igb_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
 
netdev->vlan_features |= NETIF_F_SG |
 NETIF_F_TSO |
+NETIF_F_TSO_MANGLEID |
 NETIF_F_TSO6 |
 NETIF_F_HW_CSUM |
 NETIF_F_SCTP_CRC;
 
netdev->mpls_features |= NETIF_F_HW_CSUM;
-   netdev->hw_enc_features |= NETIF_F_HW_CSUM;
+   netdev->hw_enc_features |= NETIF_F_HW_CSUM |
+  NETIF_F_TSO_MANGLEID |
+  NETIF_F_TSO6 |
+  NETIF_F_GSO_PARTIAL |
+  IGB_GSO_PARTIAL_FEATURES;
 
netdev->priv_flags |= IFF_SUPP_NOFCS;
 
-   if (pci_using_dac) {
+   if (pci_using_dac)
netdev->features |= NETIF_F_HIGHDMA;
-   netdev->vlan_features |= 

[next-queue PATCH 0/3] Add support for GSO partial to Intel NIC drivers

2016-04-08 Thread Alexander Duyck
So these are the patches needed to enable tunnel segmentation offloads on
the igb, igbvf, ixgbe, and ixgbevf drivers.  In addition this patch extends
the i40e and i40evf drivers to include segmentation support for tunnels
with outer checksums.

The net performance gain for these patches are pretty significant.  In the
case of i40e a tunnel with outer checksums showed the following
improvement:
Throughput Throughput  Local Local   Result 
   Units   CPU   Service Tag
   Util  Demand 
   %
14066.29   10^6bits/s  3.49  0.651   "before" 
20618.16   10^6bits/s  3.09  0.393   "after"

For ixgbe similar results were seen:
Throughput Throughput  Local  Local   Result 
   Units   CPUService Tag
   Util   Demand 
   %   
12879.89   10^6bits/s  10.00  0.763   "before"
14286.77   10^6bits/s  5.74   0.395   "after" 

These patches all rely on the TSO_MANGLEID and GSO_PARTIAL patches so I
would not recommend applying them until those patches have first been
applied.

---

Alexander Duyck (3):
  i40e/i40evf: Add support for GSO partial with UDP_TUNNEL_CSUM and GRE_CSUM
  ixgbe/ixgbevf: Add support for GSO partial
  igb/igbvf: Add support for GSO partial


 drivers/net/ethernet/intel/i40e/i40e_main.c   |   10 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |7 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c |7 +
 drivers/net/ethernet/intel/i40evf/i40evf_main.c   |   10 +
 drivers/net/ethernet/intel/igb/igb_main.c |  112 ++
 drivers/net/ethernet/intel/igbvf/netdev.c |  173 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  105 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  123 ---
 8 files changed, 391 insertions(+), 156 deletions(-)

--


[PATCH net-next v2] vxlan: synchronously and race-free destruction of vxlan sockets

2016-04-08 Thread Hannes Frederic Sowa
Due to the fact that the udp socket is destructed asynchronously in a
work queue, we have some nondeterministic behavior during shutdown of
vxlan tunnels and creating new ones. Fix this by keeping the destruction
process synchronous in regards to the user space process so IFF_UP can
be reliably set.

udp_tunnel_sock_release destroys vs->sock->sk if reference counter
indicates so. We expect to have the same lifetime of vxlan_sock and
vxlan_sock->sock->sk even in fast paths with only rcu locks held. So
only destruct the whole socket after we can be sure it cannot be found
by searching vxlan_net->sock_list.

Cc: Eric Dumazet 
Cc: Jiri Benc 
Cc: Marcelo Ricardo Leitner 
Signed-off-by: Hannes Frederic Sowa 
---
v2) synchronize_rcu -> synchronize_net (proposed by Eric, thanks!)
also rebased on net-next to apply without conflicts
 drivers/net/vxlan.c | 20 +++-
 include/net/vxlan.h |  2 --
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 9f3634064c921f..77ba31a0e44f97 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -98,7 +98,6 @@ struct vxlan_fdb {
 
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
-static struct workqueue_struct *vxlan_wq;
 
 static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
 {
@@ -1053,7 +1052,9 @@ static void __vxlan_sock_release(struct vxlan_sock *vs)
vxlan_notify_del_rx_port(vs);
	spin_unlock(&vs->sock_lock);
 
-   queue_work(vxlan_wq, &vs->del_work);
+   synchronize_net();
+   udp_tunnel_sock_release(vs->sock);
+   kfree(vs);
 }
 
 static void vxlan_sock_release(struct vxlan_dev *vxlan)
@@ -2674,13 +2675,6 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
.get_link   = ethtool_op_get_link,
 };
 
-static void vxlan_del_work(struct work_struct *work)
-{
-   struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
-   udp_tunnel_sock_release(vs->sock);
-   kfree_rcu(vs, rcu);
-}
-
 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
__be16 port, u32 flags)
 {
@@ -2726,8 +2720,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net 
*net, bool ipv6,
for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);
 
-   INIT_WORK(>del_work, vxlan_del_work);
-
sock = vxlan_create_sock(net, ipv6, port, flags);
if (IS_ERR(sock)) {
pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
@@ -3346,10 +3338,6 @@ static int __init vxlan_init_module(void)
 {
int rc;
 
-   vxlan_wq = alloc_workqueue("vxlan", 0, 0);
-   if (!vxlan_wq)
-   return -ENOMEM;
-
	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
 
	rc = register_pernet_subsys(&vxlan_net_ops);
@@ -3370,7 +3358,6 @@ out3:
 out2:
	unregister_pernet_subsys(&vxlan_net_ops);
 out1:
-   destroy_workqueue(vxlan_wq);
return rc;
 }
 late_initcall(vxlan_init_module);
@@ -3379,7 +3366,6 @@ static void __exit vxlan_cleanup_module(void)
 {
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_netdevice_notifier(&vxlan_notifier_block);
-   destroy_workqueue(vxlan_wq);
	unregister_pernet_subsys(&vxlan_net_ops);
/* rcu_barrier() is called by netns */
 }
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 2f168f0ea32c39..d442eb3129cde4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -184,9 +184,7 @@ struct vxlan_metadata {
 /* per UDP socket information */
 struct vxlan_sock {
struct hlist_node hlist;
-   struct work_struct del_work;
struct socket*sock;
-   struct rcu_head   rcu;
struct hlist_head vni_list[VNI_HASH_SIZE];
atomic_t  refcnt;
u32   flags;
-- 
2.5.5



Re: [PATCH v4 1/2] RDS: memory allocated must be align to 8

2016-04-08 Thread santosh shilimkar

On 4/8/2016 1:10 PM, David Miller wrote:

From: santosh shilimkar 
Date: Fri, 8 Apr 2016 12:44:39 -0700


On 4/7/2016 4:57 AM, Shamir Rabinovitch wrote:

Fix issue in 'rds_ib_cong_recv' when accessing unaligned memory
allocated by 'rds_page_remainder_alloc' using uint64_t pointer.


Sorry I still didn't follow this change still. What exactly is the
problem.


You can't stop the offset at non-8byte intervals, because the chunks
being used in these arenas can have 64-bit values in it, which must be
8-byte aligned.


I see. Thanks for explaining it Dave.
Its fine to apply then.
Acked-by: Santosh Shilimkar 

Regards,
Santosh



Re: [PATCH net-next v2] net: dsa: document missing functions

2016-04-08 Thread David Miller
From: Vivien Didelot 
Date: Wed,  6 Apr 2016 11:06:20 -0400

> Add description for the missing port_vlan_prepare, port_fdb_prepare,
> port_fdb_dump functions in the DSA documentation.
> 
> Signed-off-by: Vivien Didelot 

Applied.


Re: [PATCH net-next v2 0/3] net: dsa: voidify STP setter and FDB/VLAN add ops

2016-04-08 Thread David Miller
From: Vivien Didelot 
Date: Wed,  6 Apr 2016 11:55:02 -0400

> Neither the DSA layer nor the bridge code (see br_set_state) really care
> about eventual errors from STP state setters, so make it void.
> 
> The DSA layer separates the prepare and commit phases of switchdev in
> two different functions. Logical errors must not happen in commit
> routines, so make them void.
> 
> Changes v1 -> v2:
>   - rename port_stp_update to port_stp_state_set
>   - don't change code flow of bcm_sf2_sw_br_set_stp_state
>   - prefer netdev_err over netdev_warn

Series applied.


Re: [PATCH] net: thunderx: Fix broken of_node_put() code.

2016-04-08 Thread David Daney

On 04/08/2016 01:15 PM, David Miller wrote:

From: David Daney 
Date: Fri, 8 Apr 2016 09:41:35 -0700


Due to mail server malfunction, this patch was sent twice.  Please
ignore this duplicate.


This submission had another problem too.

Do not use the date of your commit as the date that gets put into
your email headers.


I don't.  This is standard git-send-email 1.7.11.7.




This makes all of your patch submissions look like they occurred in
the past, and this mixes up the ordering of patches in patchwork.



They did occur in the past.  Just like all e-mail you read, they were 
sent before you read them.


I ran git-send-email for this on  Thu, 31 Mar 2016 18:01:57 -0700.  I 
observed that the patch didn't seem to make it to the public lists, so I 
figured I screwed something up and I sent it again, with the same results.


Then I went on vacation, and came back today to sort everything out.  My 
MTA had died, so I restarted it, and ... the backlog of messages was 
sent and you read it.





So please resubmit this properly with a normal, current, date in your
email headers.


OK, I will resend the identical patch for the third time...





Re: pull-request: mac80211-next 2016-04-06

2016-04-08 Thread David Miller
From: Johannes Berg 
Date: Wed, 06 Apr 2016 15:26:50 +0200

> On Wed, 2016-04-06 at 15:25 +0200, Johannes Berg wrote:
>> Hi Dave,
>> 
>> For the 4.6 cycle, there's of course much more. The few things that
>> 
> 
> Err, -next, so that's 4.7.

Pulled, and I fixed the version number in the merge commit message.

Thanks.


Re: pull-request: mac80211 2016-04-06

2016-04-08 Thread David Miller
From: Johannes Berg 
Date: Wed,  6 Apr 2016 15:19:58 +0200

> First set of fixes for 4.6. Nothing really stands out.
> 
> Let me know if there's any problem.

Pulled, thanks Johannes.



Re: [PATCH net] vxlan: synchronously and race-free destruction of vxlan sockets

2016-04-08 Thread Eric Dumazet
On Fri, 2016-04-08 at 22:30 +0200, Hannes Frederic Sowa wrote:
> Hi Marcelo,
> ng rtnl?
> 
> I thought about that and try not to use synchronize_rcu, but I don't see 
> any other way. Anyway, ndo_stop isn't really fast path and is used to 
> shut the interface down. Also since we have lwtunnels we don't really 
> need a lot of interfaces created and torn down.
> 
> But I could switch to synchronize_rcu_expedited here.
> 
> Also we have another synchronize_rcu during device dismantling, maybe we 
> can split ndo_stop into two callbacks, one preparing for stopping and 
> the other one after the synchronize_rcu when we safely can free resources.
> 
> I will investigate this but for the mean time I think this patch is 
> already improving things as user space can bind the socket again when 
> the dellink command returned.

Of course, we have synchronize_net() which specifically put in a single
point the knowledge (rtnl being held or not)





[PATCH] net: thunderx: Fix broken of_node_put() code.

2016-04-08 Thread David Daney
From: David Daney 

commit b7d3e3d3d21a ("net: thunderx: Don't leak phy device references
on -EPROBE_DEFER condition.") incorrectly moved the call to
of_node_put() outside of the loop.  Under normal loop exit, the node
has already had of_node_put() called, so the extra call results in:

[8.228020] ERROR: Bad of_node_put() on 
/soc@0/pci@8480/mrml-bridge0@1,0/bgx0/xlaui00
[8.239433] CPU: 16 PID: 608 Comm: systemd-udevd Not tainted 4.6.0-rc1-numa+ 
#157
[8.247380] Hardware name: www.cavium.com EBB8800/EBB8800, BIOS 0.3 Mar  2 
2016
[8.273541] Call trace:
[8.273550] [] dump_backtrace+0x0/0x210
[8.273557] [] show_stack+0x24/0x2c
[8.273560] [] dump_stack+0x8c/0xb4
[8.273566] [] of_node_release+0xa8/0xac
[8.273570] [] kobject_cleanup+0x8c/0x194
[8.273573] [] kobject_put+0x44/0x6c
[8.273576] [] of_node_put+0x24/0x30
[8.273587] [] bgx_probe+0x17c/0xcd8 [thunder_bgx]
[8.273591] [] pci_device_probe+0xa0/0x114
[8.273596] [] driver_probe_device+0x178/0x418
[8.273599] [] __driver_attach+0x100/0x118
[8.273602] [] bus_for_each_dev+0x6c/0xac
[8.273605] [] driver_attach+0x30/0x38
[8.273608] [] bus_add_driver+0x1f8/0x29c
[8.273611] [] driver_register+0x70/0x110
[8.273617] [] __pci_register_driver+0x60/0x6c
[8.273623] [] bgx_init_module+0x40/0x48 [thunder_bgx]
[8.273626] [] do_one_initcall+0xcc/0x1c0
[8.273631] [] do_init_module+0x68/0x1c8
[8.273635] [] load_module+0xf44/0x11f4
[8.273638] [] SyS_finit_module+0xb8/0xe0
[8.273641] [] el0_svc_naked+0x24/0x28

Go back to the previous (correct) code that only did the extra
of_node_put() call on early exit from the loop.

Signed-off-by: David Daney 
---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c 
b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 9679515..d20539a 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -1011,10 +1011,11 @@ static int bgx_init_of_phy(struct bgx *bgx)
}
 
lmac++;
-   if (lmac == MAX_LMAC_PER_BGX)
+   if (lmac == MAX_LMAC_PER_BGX) {
+   of_node_put(node);
break;
+   }
}
-   of_node_put(node);
return 0;
 
 defer:
-- 
1.8.3.1



Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread David Miller
From: Petri Gynther 
Date: Tue,  5 Apr 2016 17:50:01 -0700

> Add Byte Queue Limits (BQL) support to bcmgenet driver.
> 
> Signed-off-by: Petri Gynther 

As Eric Dumazet indicated, your ->ndo_init() code to reset the queues is
probably not necessary at all.


Re: [PATCH v3] route: do not cache fib route info on local routes with oif

2016-04-08 Thread Julian Anastasov

Hello,

On Fri, 8 Apr 2016, Chris Friesen wrote:

> For local routes that require a particular output interface we do not want to
> cache the result.  Caching the result causes incorrect behaviour when there 
> are
> multiple source addresses on the interface.  The end result being that if the
> intended recipient is waiting on that interface for the packet he won't 
> receive
> it because it will be delivered on the loopback interface and the IP_PKTINFO
> ipi_ifindex will be set to the loopback interface as well.
> 
> This can be tested by running a program such as "dhcp_release" which attempts
> to inject a packet on a particular interface so that it is received by another
> program on the same board.  The receiving process should see an IP_PKTINFO
> ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback
> interface (e.g., lo).  The packet will still appear on the loopback interface
> in tcpdump but the important aspect is that the CMSG info is correct.
> 
> Sample dhcp_release command line:
> 
>dhcp_release eth1 192.168.204.222 02:11:33:22:44:66
> 
> Signed-off-by: Allain Legacy 
> Signed-off-by: Chris Friesen 

Sorry, forgot to mention that patch has other errors:

scripts/checkpatch.pl --strict /tmp/file3.patch

Its recommendations:

- prefer a maximum 75 chars per line
- code indent should use tabs where possible

> ---
>  net/ipv4/route.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 02c6229..437a377 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct 
> fib_result *res,
>*/
>   if (fi && res->prefixlen < 4)
>   fi = NULL;
> + } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> +(orig_oif != dev_out->ifindex)) {
> + /* For local routes that require a particular output interface
> + * we do not want to cache the result.  Caching the result
> + * causes incorrect behaviour when there are multiple source
> + * addresses on the interface, the end result being that if 
> the
> + * intended recipient is waiting on that interface for the
> + * packet he won't receive it because it will be delivered on
> + * the loopback interface and the IP_PKTINFO ipi_ifindex will
> + * be set to the loopback interface as well.
> +  */
> + fi = NULL;
>   }
>  
>   fnhe = NULL;

Regards

--
Julian Anastasov 


[net-next PATCH 2/5] GSO: Add GSO type for fixed IPv4 ID

2016-04-08 Thread Alexander Duyck
This patch adds support for TSO using IPv4 headers with a fixed IP ID
field.  This is meant to allow us to do a lossless GRO in the case of TCP
flows that use a fixed IP ID such as those that convert IPv6 header to IPv4
headers.

In addition I am adding a feature that for now I am referring to TSO with
IP ID mangling.  Basically when this flag is enabled the device has the
option to either output the flow with incrementing IP IDs or with a fixed
IP ID regardless of what the original IP ID ordering was.  This is useful
in cases where the DF bit is set and we do not care if the original IP ID
value is maintained.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdev_features.h |3 +++
 include/linux/netdevice.h   |1 +
 include/linux/skbuff.h  |   20 +++-
 net/core/dev.c  |6 --
 net/core/ethtool.c  |1 +
 net/ipv4/af_inet.c  |   19 +++
 net/ipv4/gre_offload.c  |1 +
 net/ipv4/tcp_offload.c  |4 +++-
 net/ipv6/ip6_offload.c  |3 ++-
 9 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a734bf43d190..7cf272a4b5c8 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -39,6 +39,7 @@ enum {
NETIF_F_UFO_BIT,/* ... UDPv4 fragmentation */
NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */
NETIF_F_TSO_ECN_BIT,/* ... TCP ECN support */
+   NETIF_F_TSO_MANGLEID_BIT,   /* ... IPV4 ID mangling allowed */
NETIF_F_TSO6_BIT,   /* ... TCPv6 segmentation */
NETIF_F_FSO_BIT,/* ... FCoE segmentation */
NETIF_F_GSO_GRE_BIT,/* ... GRE with TSO */
@@ -120,6 +121,7 @@ enum {
 #define NETIF_F_GSO_SIT__NETIF_F(GSO_SIT)
 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
+#define NETIF_F_TSO_MANGLEID   __NETIF_F(TSO_MANGLEID)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX__NETIF_F(HW_VLAN_STAG_RX)
@@ -147,6 +149,7 @@ enum {
 
 /* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE   (NETIF_F_TSO | NETIF_F_TSO_ECN | \
+NETIF_F_TSO_MANGLEID | \
 NETIF_F_TSO6 | NETIF_F_UFO)
 
 /* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 166402ae3324..ffc12f565ed9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3994,6 +3994,7 @@ static inline bool net_gso_ok(netdev_features_t features, 
int gso_type)
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
+   BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FCOE!= (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 007381270ff8..5fba16658f9d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -465,23 +465,25 @@ enum {
/* This indicates the tcp segment has CWR set. */
SKB_GSO_TCP_ECN = 1 << 3,
 
-   SKB_GSO_TCPV6 = 1 << 4,
+   SKB_GSO_TCP_FIXEDID = 1 << 4,
 
-   SKB_GSO_FCOE = 1 << 5,
+   SKB_GSO_TCPV6 = 1 << 5,
 
-   SKB_GSO_GRE = 1 << 6,
+   SKB_GSO_FCOE = 1 << 6,
 
-   SKB_GSO_GRE_CSUM = 1 << 7,
+   SKB_GSO_GRE = 1 << 7,
 
-   SKB_GSO_IPIP = 1 << 8,
+   SKB_GSO_GRE_CSUM = 1 << 8,
 
-   SKB_GSO_SIT = 1 << 9,
+   SKB_GSO_IPIP = 1 << 9,
 
-   SKB_GSO_UDP_TUNNEL = 1 << 10,
+   SKB_GSO_SIT = 1 << 10,
 
-   SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
+   SKB_GSO_UDP_TUNNEL = 1 << 11,
 
-   SKB_GSO_TUNNEL_REMCSUM = 1 << 12,
+   SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
+
+   SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/dev.c b/net/core/dev.c
index d51343a821ed..16def40dfbe8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6976,9 +6976,11 @@ int register_netdevice(struct net_device *dev)
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
 
-   if (!(dev->flags & IFF_LOOPBACK)) {
+   if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
-   }
+
+   if (dev->hw_features & 

[net-next PATCH 4/5] GSO: Support partial segmentation offload

2016-04-08 Thread Alexander Duyck
This patch adds support for something I am referring to as GSO partial.
The basic idea is that we can support a broader range of devices for
segmentation if we use fixed outer headers and have the hardware only
really deal with segmenting the inner header.  The idea behind the naming
is due to the fact that everything before csum_start will be fixed headers,
and everything after will be the region that is handled by hardware.

With the current implementation it allows us to add support for the
following GSO types with an inner TSO_MANGLEID or TSO6 offload:
NETIF_F_GSO_GRE
NETIF_F_GSO_GRE_CSUM
NETIF_F_GSO_IPIP
NETIF_F_GSO_SIT
NETIF_F_UDP_TUNNEL
NETIF_F_UDP_TUNNEL_CSUM

In the case of hardware that already supports tunneling we may be able to
extend this further to support TSO_TCPV4 without TSO_MANGLEID if the
hardware can support updating inner IPv4 headers.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdev_features.h |5 +
 include/linux/netdevice.h   |2 ++
 include/linux/skbuff.h  |9 +++--
 net/core/dev.c  |   31 ++-
 net/core/ethtool.c  |1 +
 net/core/skbuff.c   |   29 -
 net/ipv4/af_inet.c  |   20 
 net/ipv4/gre_offload.c  |   26 +-
 net/ipv4/tcp_offload.c  |   10 --
 net/ipv4/udp_offload.c  |   27 +--
 net/ipv6/ip6_offload.c  |   10 +-
 11 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7cf272a4b5c8..9fc79df0e561 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -48,6 +48,10 @@ enum {
NETIF_F_GSO_SIT_BIT,/* ... SIT tunnel with TSO */
NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */
NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */
+   NETIF_F_GSO_PARTIAL_BIT,/* ... Only segment inner-most L4
+* in hardware and all other
+* headers in software.
+*/
NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
/**/NETIF_F_GSO_LAST =  /* last bit, see GSO_MASK */
NETIF_F_GSO_TUNNEL_REMCSUM_BIT,
@@ -122,6 +126,7 @@ enum {
 #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL)
 #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM)
 #define NETIF_F_TSO_MANGLEID   __NETIF_F(TSO_MANGLEID)
+#define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX__NETIF_F(HW_VLAN_STAG_RX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a3ac84ac8cb0..554efb93f0ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1656,6 +1656,7 @@ struct net_device {
netdev_features_t   vlan_features;
netdev_features_t   hw_enc_features;
netdev_features_t   mpls_features;
+   netdev_features_t   gso_partial_features;
 
int ifindex;
int group;
@@ -4006,6 +4007,7 @@ static inline bool net_gso_ok(netdev_features_t features, 
int gso_type)
BUILD_BUG_ON(SKB_GSO_SIT != (NETIF_F_GSO_SIT >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> 
NETIF_F_GSO_SHIFT));
+   BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> 
NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> 
NETIF_F_GSO_SHIFT));
 
return (features & feature) == feature;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5fba16658f9d..da0ace389fec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,7 +483,9 @@ enum {
 
SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12,
 
-   SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
+   SKB_GSO_PARTIAL = 1 << 13,
+
+   SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 };
 
 #if BITS_PER_LONG > 32
@@ -3591,7 +3593,10 @@ static inline struct sec_path *skb_sec_path(struct 
sk_buff *skb)
  * Keeps track of level of encapsulation of network headers.
  */
 struct skb_gso_cb {
-   int mac_offset;
+   union {
+   int mac_offset;
+   int data_offset;
+   };
int encap_level;
__wsum  csum;
__u16   csum_start;
diff --git a/net/core/dev.c b/net/core/dev.c
index 235e0f3e34f0..d80010b3828f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2711,6 +2711,19 @@ struct sk_buff 

[net-next PATCH 3/5] GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID values

2016-04-08 Thread Alexander Duyck
This patch does two things.

First it allows TCP to aggregate TCP frames with a fixed IPv4 ID field.  As
a result we should now be able to aggregate flows that were converted from
IPv6 to IPv4.  In addition this allows us more flexibility for future
implementations of segmentation as we may be able to use a fixed IP ID when
segmenting the flow.

The second thing this does is that it places limitations on the outer IPv4
ID header in the case of tunneled frames.  Specifically it forces the IP ID
to be incrementing by 1 unless the DF bit is set in the outer IPv4 header.
This way we can avoid creating overlapping series of IP IDs that could
possibly be fragmented if the frame goes through GRO and is then
resegmented via GSO.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |5 -
 net/core/dev.c|1 +
 net/ipv4/af_inet.c|   35 ---
 net/ipv4/tcp_offload.c|   16 +++-
 net/ipv6/ip6_offload.c|8 ++--
 5 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ffc12f565ed9..a3ac84ac8cb0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2123,7 +2123,10 @@ struct napi_gro_cb {
/* Used in GRE, set in fou/gue_gro_receive */
u8  is_fou:1;
 
-   /* 6 bit hole */
+   /* Used to determine if flush_id can be ignored */
+   u8  is_atomic:1;
+
+   /* 5 bit hole */
 
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum  csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index 16def40dfbe8..235e0f3e34f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4440,6 +4440,7 @@ static enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
+   NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
 
/* Setup for GRO checksum validation */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5bbea9a0ce96..8564cab96189 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1328,6 +1328,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff 
**head,
 
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
+   u16 flush_id;
 
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1351,16 +1352,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff 
**head,
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
 
-   /* Save the IP ID check to be included later when we get to
-* the transport layer so only the inner most IP ID is checked.
-* This is because some GSO/TSO implementations do not
-* correctly increment the IP ID for the outer hdrs.
-*/
-   NAPI_GRO_CB(p)->flush_id =
-   ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ 
id);
NAPI_GRO_CB(p)->flush |= flush;
+
+   /* We need to store of the IP ID check to be included later
+* when we can verify that this packet does in fact belong
+* to a given flow.
+*/
+   flush_id = (u16)(id - ntohs(iph2->id));
+
+   /* This bit of code makes it much easier for us to identify
+* the cases where we are doing atomic vs non-atomic IP ID
+* checks.  Specifically an atomic check can return IP ID
+* values 0 - 0x, while a non-atomic check can only
+* return 0 or 0x.
+*/
+   if (!NAPI_GRO_CB(p)->is_atomic ||
+   !(iph->frag_off & htons(IP_DF))) {
+   flush_id ^= NAPI_GRO_CB(p)->count;
+   flush_id = flush_id ? 0x : 0;
+   }
+
+   /* If the previous IP ID value was based on an atomic
+* datagram we can overwrite the value and ignore it.
+*/
+   if (NAPI_GRO_CB(skb)->is_atomic)
+   NAPI_GRO_CB(p)->flush_id = flush_id;
+   else
+   NAPI_GRO_CB(p)->flush_id |= flush_id;
}
 
+   NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 08dd25d835af..d1ffd55289bd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -239,7 +239,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, 
struct sk_buff *skb)
 

[net-next PATCH 5/5] Documentation: Add documentation for TSO and GSO features

2016-04-08 Thread Alexander Duyck
This document is a starting point for defining the TSO and GSO features.
The whole thing is starting to get a bit messy so I wanted to make sure we
have notes somewhere to start describing what does and doesn't work.

Signed-off-by: Alexander Duyck 
---
 Documentation/networking/segmentation-offloads.txt |  130 
 1 file changed, 130 insertions(+)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

diff --git a/Documentation/networking/segmentation-offloads.txt 
b/Documentation/networking/segmentation-offloads.txt
new file mode 100644
index ..f200467ade38
--- /dev/null
+++ b/Documentation/networking/segmentation-offloads.txt
@@ -0,0 +1,130 @@
+Segmentation Offloads in the Linux Networking Stack
+
+Introduction
+
+
+This document describes a set of techniques in the Linux networking stack
+to take advantage of segmentation offload capabilities of various NICs.
+
+The following technologies are described:
+ * TCP Segmentation Offload - TSO
+ * UDP Fragmentation Offload - UFO
+ * IPIP, SIT, GRE, and UDP Tunnel Offloads
+ * Generic Segmentation Offload - GSO
+ * Generic Receive Offload - GRO
+ * Partial Generic Segmentation Offload - GSO_PARTIAL
+
+TCP Segmentation Offload
+
+
+TCP segmentation allows a device to segment a single frame into multiple
+frames with a data payload size specified in skb_shinfo()->gso_size.
+When TCP segmentation is requested the bit for either SKB_GSO_TCP or
+SKB_GSO_TCP6 should be set in skb_shinfo()->gso_type and
+skb_shinfo()->gso_size should be set to a non-zero value.
+
+TCP segmentation is dependent on support for the use of partial checksum
+offload.  For this reason TSO is normally disabled if the Tx checksum
+offload for a given device is disabled.
+
+In order to support TCP segmentation offload it is necessary to populate
+the network and transport header offsets of the skbuff so that the device
+drivers will be able to determine the offsets of the IP or IPv6 header and the
+TCP header.  In addition as CHECKSUM_PARTIAL is required csum_start should
+also point to the TCP header of the packet.
+
+For IPv4 segmentation we support one of two types in terms of the IP ID.
+The default behavior is to increment the IP ID with every segment.  If the
+GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP
+ID and all segments will use the same IP ID.  If a device has
+NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO
+and we will either increment the IP ID for all frames, or leave it at a
+static value based on driver preference.
+
+UDP Fragmentation Offload
+=
+
+UDP fragmentation offload allows a device to fragment an oversized UDP
+datagram into multiple IPv4 fragments.  Many of the requirements for UDP
+fragmentation offload are the same as TSO.  However the IPv4 ID for
+fragments should not increment as a single IPv4 datagram is fragmented.
+
+IPIP, SIT, GRE, UDP Tunnel, and Remote Checksum Offloads
+
+
+In addition to the offloads described above it is possible for a frame to
+contain additional headers such as an outer tunnel.  In order to account
+for such instances an additional set of segmentation offload types were
+introduced including SKB_GSO_IPIP, SKB_GSO_SIT, SKB_GSO_GRE, and
+SKB_GSO_UDP_TUNNEL.  These extra segmentation types are used to identify
+cases where there are more than just 1 set of headers.  For example in the
+case of IPIP and SIT we should have the network and transport headers moved
+from the standard list of headers to "inner" header offsets.
+
+Currently only two levels of headers are supported.  The convention is to
+refer to the tunnel headers as the outer headers, while the encapsulated
+data is normally referred to as the inner headers.  Below is the list of
+calls to access the given headers:
+
+IPIP/SIT Tunnel:
+   Outer   Inner
+MACskb_mac_header
+Networkskb_network_header  skb_inner_network_header
+Transport  skb_transport_header
+
+UDP/GRE Tunnel:
+   Outer   Inner
+MACskb_mac_header  skb_inner_mac_header
+Networkskb_network_header  skb_inner_network_header
+Transport  skb_transport_headerskb_inner_transport_header
+
+In addition to the above tunnel types there are also SKB_GSO_GRE_CSUM and
+SKB_GSO_UDP_TUNNEL_CSUM.  These two additional tunnel types reflect the
+fact that the outer header also requests to have a non-zero checksum
+included in the outer header.
+
+Finally there is SKB_GSO_REMCSUM which indicates that a given tunnel header
+has requested a remote checksum offload.  In this case the inner headers
+will be left with a partial checksum and only the outer header checksum
+will be computed.
+
+Generic Segmentation Offload
+
+
+Generic 

[net-next PATCH 1/5] ethtool: Add support for toggling any of the GSO offloads

2016-04-08 Thread Alexander Duyck
The strings were missing for several of the GSO offloads that are
available.  This patch provides the missing strings so that we can toggle
or query any of them via the ethtool command.

Signed-off-by: Alexander Duyck 
---
 net/core/ethtool.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f426c5ad6149..6a7f99661c2f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -82,9 +82,11 @@ static const char 
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
[NETIF_F_FSO_BIT] =  "tx-fcoe-segmentation",
[NETIF_F_GSO_GRE_BIT] =  "tx-gre-segmentation",
+   [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
[NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation",
[NETIF_F_GSO_SIT_BIT] =  "tx-sit-segmentation",
[NETIF_F_GSO_UDP_TUNNEL_BIT] =   "tx-udp_tnl-segmentation",
+   [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
[NETIF_F_SCTP_CRC_BIT] ="tx-checksum-sctp",



Re: [PATCH net] bridge, netem: mark mailing lists as moderated

2016-04-08 Thread David Miller
From: Stephen Hemminger 
Date: Tue,  5 Apr 2016 13:43:53 -0700

> I moderate these (lightly loaded) lists to block spam.
> 
> Signed-off-by: Stephen Hemminger 

Applied, thanks.


[net-next PATCH 0/5] GRO Fixed IPv4 ID support and GSO partial support

2016-04-08 Thread Alexander Duyck
This patch series sets up a few different things.

First it adds support for GRO of frames with a fixed IP ID value.  This
will allow us to perform GRO for frames that go through things like an IPv6
to IPv4 header translation.

The second item we add is support for segmenting frames that are generated
this way.  Most devices only support an incrementing IP ID value, and in
the case of TCP the IP ID can be ignored in many cases since the DF bit
should be set.  So we can technically segment these frames using existing
TSO if we are willing to allow the IP ID to be mangled.  As such I have
added a matching feature for the new form of GRO/GSO called TCP IPv4 ID
mangling.  With this enabled we can assemble and disassemble a frame with
the sequence number fixed and the only ill effect will be that the IPv4 ID
will be altered which may or may not have any noticeable effect.  As such I
have defaulted the feature to disabled.

The third item this patch series adds is support for partial GSO
segmentation.  Partial GSO segmentation allows us to split a large frame
into two pieces.  The first piece will have an even multiple of MSS worth
of data and the headers before the one pointed to by csum_start will have
been updated so that they are correct for if the data payload had already
been segmented.  By doing this we can do things such as precompute the
outer header checksums for a frame to be segmented allowing us to perform
TSO on devices that don't support tunneling, or tunneling with outer header
checksums.

This patch series currently relies on a patch that is in the net tree.  As
such it may be best to defer applying it until the net tree is merged.  In
addition I have some patches for the Intel NIC drivers that I will submit
as an RFC for now and will submit to Jeff Kirsher once this patch series
has been applied.

---

Alexander Duyck (5):
  ethtool: Add support for toggling any of the GSO offloads
  GSO: Add GSO type for fixed IPv4 ID
  GRO: Add support for TCP with fixed IPv4 ID field, limit tunnel IP ID 
values
  GSO: Support partial segmentation offload
  Documentation: Add documentation for TSO and GSO features


 Documentation/networking/segmentation-offloads.txt |  130 
 include/linux/netdev_features.h|8 +
 include/linux/netdevice.h  |8 +
 include/linux/skbuff.h |   27 +++-
 net/core/dev.c |   38 +-
 net/core/ethtool.c |4 +
 net/core/skbuff.c  |   29 
 net/ipv4/af_inet.c |   70 ---
 net/ipv4/gre_offload.c |   27 +++-
 net/ipv4/tcp_offload.c |   30 -
 net/ipv4/udp_offload.c |   27 +++-
 net/ipv6/ip6_offload.c |   21 +++
 12 files changed, 368 insertions(+), 51 deletions(-)
 create mode 100644 Documentation/networking/segmentation-offloads.txt

--


Re: [PATCH net] vxlan: synchronously and race-free destruction of vxlan sockets

2016-04-08 Thread Hannes Frederic Sowa

Hi Marcelo,


On 08.04.2016 20:51, Marcelo Ricardo Leitner wrote:

On Thu, Apr 07, 2016 at 04:57:40PM +0200, Hannes Frederic Sowa wrote:

Due to the fact that the udp socket is destructed asynchronously in a
work queue, we have some nondeterministic behavior during shutdown of
vxlan tunnels and creating new ones. Fix this by keeping the destruction
process synchronous in regards to the user space process so IFF_UP can
be reliably set.

udp_tunnel_sock_release destroys vs->sock->sk if reference counter
indicates so. We expect to have the same lifetime of vxlan_sock and
vxlan_sock->sock->sk even in fast paths with only rcu locks held. So
only destruct the whole socket after we can be sure it cannot be found
by searching vxlan_net->sock_list.

Cc: Jiri Benc 
Signed-off-by: Hannes Frederic Sowa 
---
  drivers/net/vxlan.c | 20 +++-
  include/net/vxlan.h |  2 --
  2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1c0fa364323e28..487e48b7a53090 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -98,7 +98,6 @@ struct vxlan_fdb {

  /* salt for hash table */
  static u32 vxlan_salt __read_mostly;
-static struct workqueue_struct *vxlan_wq;

  static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
  {
@@ -1065,7 +1064,9 @@ static void __vxlan_sock_release(struct vxlan_sock *vs)
vxlan_notify_del_rx_port(vs);
spin_unlock(>sock_lock);

-   queue_work(vxlan_wq, >del_work);
+   synchronize_rcu();


__vxlan_sock_release is called by vxlan_sock_release which is called by
vxlan_open/stop. Do we really want to have synchronize_rcu() while
holding rtnl?


I thought about that and try not to use synchronize_rcu, but I don't see 
any other way. Anyway, ndo_stop isn't really fast path and is used to 
shut the interface down. Also since we have lwtunnels we don't really 
need a lot of interfaces created and torn down.


But I could switch to synchronize_rcu_expedited here.

Also we have another synchronize_rcu during device dismantling, maybe we 
can split ndo_stop into two callbacks, one preparing for stopping and 
the other one after the synchronize_rcu when we safely can free resources.


I will investigate this but for the mean time I think this patch is 
already improving things as user space can bind the socket again when 
the dellink command returned.


Thanks,
Hannes



Re: [PATCH] net: thunderx: Fix broken of_node_put() code.

2016-04-08 Thread David Miller
From: David Daney 
Date: Fri, 8 Apr 2016 09:41:35 -0700

> Due to mail server malfunction, this patch was sent twice.  Please
> ignore this duplicate.

This submission had another problem too.

Do not use the date of your commit as the date that gets put into
your email headers.

This makes all of your patch submissions look like they occurred in
the past, and this mixes up the ordering of patches in patchwork.

So please resubmit this properly with a normal, current, date in your
email headers.

Thanks.


Re: [PATCH v4 1/2] RDS: memory allocated must be align to 8

2016-04-08 Thread David Miller
From: santosh shilimkar 
Date: Fri, 8 Apr 2016 12:44:39 -0700

> On 4/7/2016 4:57 AM, Shamir Rabinovitch wrote:
>> Fix issue in 'rds_ib_cong_recv' when accessing unaligned memory
>> allocated by 'rds_page_remainder_alloc' using uint64_t pointer.
>>
> Sorry, I still didn't follow this change. What exactly is the
> problem?

You can't stop the offset at non-8byte intervals, because the chunks
being used in these arenas can have 64-bit values in it, which must be
8-byte aligned.

It looks extremely obvious to me.


Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-08 Thread Jesper Dangaard Brouer
On Fri, 8 Apr 2016 10:26:53 -0700
Alexei Starovoitov  wrote:

> On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer 
> >  wrote:
> >   
> > > > +/* user return codes for PHYS_DEV prog type */
> > > > +enum bpf_phys_dev_action {
> > > > +   BPF_PHYS_DEV_DROP,
> > > > +   BPF_PHYS_DEV_OK,
> > > > +};
> > > 
> > > I can imagine these extra return codes:
> > > 
> > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > >  BPF_PHYS_DEV_STOLEN, /* E.g. forward use-case */
> > >  BPF_PHYS_DEV_SHARED, /* Queue for async processing, e.g. tcpdump 
> > > use-case */
> > > 
> > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > which we can look at when we get that far...  
> > 
> > I want to point out something which is quite FUNDAMENTAL, for
> > understanding these return codes (and network stack).
> > 
> > 
> > At driver RX time, the network stack basically have two ways of
> > building an SKB, which is send up the stack.
> > 
> > Option-A (fastest): The packet page is writable. The SKB can be
> > allocated and skb->data/head can point directly to the page.  And
> > we place/write skb_shared_info in the end/tail-room. (This is done by
> > calling build_skb()).
> > 
> > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > skb->data/head directly to the page, because skb_shared_info need to be
> > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > get around this, a separate piece of memory is allocated (speedup by
> > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > be written. (This is done when calling netdev/napi_alloc_skb()).
> >   Drivers then need to copy over packet headers, and assign + adjust
> > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > 
> > 
> > Unfortunately most drivers use option-B.  Due to cost of calling the
> > page allocator.  It is only slightly most expensive to get a larger
> > compound page from the page allocator, which then can be partitioned into
> > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > cost is added later, when constructing the SKB.
> >  Another reason for option-B, is that archs with expensive IOMMU
> > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > but only on the compound page level.
> > 
> > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > for option-B, when copying header data anyhow. For small packet, one
> > might as well free (or recycle) the RX page, if header size fits into
> > the newly allocated memory (for skb_shared_info).  
> 
> I think you guys are going into overdesign territory, so
> . nack on read-only pages

Unfortunately you cannot just ignore or nack read-only pages. They are
a fact in the current drivers.

Most drivers today (at-least the ones we care about) only deliver
read-only pages.  If you don't accept read-only pages day-1, then you
first have to rewrite a lot of drivers... and that will stall the
project!  How will you deal with this fact?

The early drop filter use-case in this patchset, can ignore read-only
pages.  But ABI wise we need to deal with the future case where we do
need/require writeable pages.  A simple need-writable pages in the API
could help us move forward.


> . nack on copy-break approach

Copy-break can be ignored.  It sort of happens at a higher-level in the
driver. (Eric likely want/care this happens for local socket delivery).


> . nack on per-ring programs

Hmmm... I don't see it as a lot more complicated to attach the program
to the ring.  But maybe we can extend the API later, and thus postpone that
discussion.

> . nack on modified/stolen/shared return codes
> 
> The whole thing must be dead simple to use. Above is not simple by any means.

Maybe you missed that the above was a description of how the current
network stack handles this, which is not simple... which is root of the
whole performance issue.


> The programs must see writeable pages only and return codes:
> drop, pass to stack, redirect to xmit.
> If program wishes to modify packets before passing it to stack, it
> shouldn't need to deal with different return values.

> No special things to deal with small or large packets. No header splits.
> Program must not be aware of any such things.

I agree on this.  This layer only deals with packets at the page level,
single packets stored in continuous memory.


> Drivers can use DMA_BIDIRECTIONAL to allow received page to be
> modified by the program and immediately sent to xmit.

We just have to verify that DMA_BIDIRECTIONAL does not add extra
overhead (which is explicitly stated that it likely does on the
DMA-API-HOWTO.txt, but I like to verify this with a micro benchmark)

> No dma map/unmap/sync per packet. If some odd architectures/dma setups
> cannot 

Re: [PATCH v2] route: do not cache fib route info on local routes with oif

2016-04-08 Thread Chris Friesen

On 04/08/2016 01:14 PM, Julian Anastasov wrote:


Your patch is corrupted. I was in the same trap
some time ago but with different client:

 From Documentation/email-clients.txt:

Don't send patches with "format=flowed".  This can cause unexpected
and unwanted line breaks.

Anyways, the change looks good to me and I'll add my
Reviewed-by tag the next time.



Doh...forgot to turn off word wrapping.  New patch coming.

Chris


[PATCH v3] route: do not cache fib route info on local routes with oif

2016-04-08 Thread Chris Friesen
For local routes that require a particular output interface we do not want to
cache the result.  Caching the result causes incorrect behaviour when there are
multiple source addresses on the interface.  The end result being that if the
intended recipient is waiting on that interface for the packet he won't receive
it because it will be delivered on the loopback interface and the IP_PKTINFO
ipi_ifindex will be set to the loopback interface as well.

This can be tested by running a program such as "dhcp_release" which attempts
to inject a packet on a particular interface so that it is received by another
program on the same board.  The receiving process should see an IP_PKTINFO
ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback
interface (e.g., lo).  The packet will still appear on the loopback interface
in tcpdump but the important aspect is that the CMSG info is correct.

Sample dhcp_release command line:

   dhcp_release eth1 192.168.204.222 02:11:33:22:44:66

Signed-off-by: Allain Legacy 
Signed-off-by: Chris Friesen 
---
 net/ipv4/route.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c6229..437a377 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct 
fib_result *res,
 */
if (fi && res->prefixlen < 4)
fi = NULL;
+   } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+  (orig_oif != dev_out->ifindex)) {
+   /* For local routes that require a particular output interface
+ * we do not want to cache the result.  Caching the result
+ * causes incorrect behaviour when there are multiple source
+ * addresses on the interface, the end result being that if the
+ * intended recipient is waiting on that interface for the
+ * packet he won't receive it because it will be delivered on
+ * the loopback interface and the IP_PKTINFO ipi_ifindex will
+ * be set to the loopback interface as well.
+*/
+   fi = NULL;
}
 
fnhe = NULL;



Re: [PATCH net] tuntap: restore default qdisc

2016-04-08 Thread David Miller
From: Jason Wang 
Date: Fri,  8 Apr 2016 13:26:48 +0800

> After commit f84bb1eac027 ("net: fix IFF_NO_QUEUE for drivers using
> alloc_netdev"), default qdisc was changed to noqueue because
> tuntap does not set tx_queue_len during .setup(). This patch restores
> default qdisc by setting tx_queue_len in tun_setup().
> 
> Fixes: f84bb1eac027 ("net: fix IFF_NO_QUEUE for drivers using alloc_netdev")
> Cc: Phil Sutter 
> Signed-off-by: Jason Wang 

Applied and queued up for -stable, thanks Jason.


Re: [PATCH v4 1/2] RDS: memory allocated must be align to 8

2016-04-08 Thread santosh shilimkar

On 4/7/2016 4:57 AM, Shamir Rabinovitch wrote:

Fix issue in 'rds_ib_cong_recv' when accessing unaligned memory
allocated by 'rds_page_remainder_alloc' using uint64_t pointer.


Sorry I still didn't follow this change still. What exactly is the
problem.


Signed-off-by: Shamir Rabinovitch 
---
  net/rds/page.c |4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/page.c b/net/rds/page.c
index 616f21f..e2b5a58 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -135,8 +135,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, 
unsigned long bytes,
if (rem->r_offset != 0)
rds_stats_inc(s_page_remainder_hit);

-   rem->r_offset += bytes;
-   if (rem->r_offset == PAGE_SIZE) {
+   rem->r_offset += ALIGN(bytes, 8);
+   if (rem->r_offset >= PAGE_SIZE) {
__free_page(rem->r_page);
rem->r_page = NULL;
}



[PATCH v3 0/2] sctp: delay calls to sk_data_ready() as much as possible

2016-04-08 Thread Marcelo Ricardo Leitner
1st patch is a preparation for the 2nd. The idea is to not call
->sk_data_ready() for every data chunk processed while processing
packets but only once before releasing the socket.

v2: patchset re-checked, small changelog fixes
v3: on patch 2, make use of local vars to make it more readable

Marcelo Ricardo Leitner (2):
  sctp: compress bit-wide flags to a bitfield on sctp_sock
  sctp: delay calls to sk_data_ready() as much as possible

 include/net/sctp/structs.h | 13 +++--
 net/sctp/sm_sideeffect.c   |  7 +++
 net/sctp/ulpqueue.c|  4 ++--
 3 files changed, 16 insertions(+), 8 deletions(-)

-- 
2.5.0



[PATCH v3 1/2] sctp: compress bit-wide flags to a bitfield on sctp_sock

2016-04-08 Thread Marcelo Ricardo Leitner
It wastes space and gets worse as we add new flags, so convert bit-wide
flags to a bitfield.

Currently it already saves 4 bytes in sctp_sock, which are left as holes
in it for now. The whole struct needs packing, which should be done in
another patch.

Note that do_auto_asconf cannot be merged, as explained in the comment
before it.

Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/structs.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 
6df1ce7a411c548bda4163840a90578b6e1b4cfe..1a6a626904bba4223b7921bbb4be41c2550271a7
 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -210,14 +210,14 @@ struct sctp_sock {
int user_frag;
 
__u32 autoclose;
-   __u8 nodelay;
-   __u8 disable_fragments;
-   __u8 v4mapped;
-   __u8 frag_interleave;
__u32 adaptation_ind;
__u32 pd_point;
-   __u8 recvrcvinfo;
-   __u8 recvnxtinfo;
+   __u16   nodelay:1,
+   disable_fragments:1,
+   v4mapped:1,
+   frag_interleave:1,
+   recvrcvinfo:1,
+   recvnxtinfo:1;
 
atomic_t pd_mode;
/* Receive to here while partial delivery is in effect. */
-- 
2.5.0



[PATCH v3 2/2] sctp: delay calls to sk_data_ready() as much as possible

2016-04-08 Thread Marcelo Ricardo Leitner
Currently processing of multiple chunks in a single SCTP packet leads to
multiple calls to sk_data_ready, causing multiple wake up signals which
are costy and doesn't make it wake up any faster.

With this patch it will note that the wake up is pending and will do it
before leaving the state machine interpreter, latest place possible to
do it realiably and cleanly.

Note that sk_data_ready events are not dependent on asocs, unlike waking
up writers.

v2: series re-checked
v3: use local vars to cleanup the code, suggested by Jakub Sitnicki
Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/structs.h | 3 ++-
 net/sctp/sm_sideeffect.c   | 7 +++
 net/sctp/ulpqueue.c| 4 ++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 
1a6a626904bba4223b7921bbb4be41c2550271a7..21cb11107e378b4da1e7efde22fab4349496e35a
 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -217,7 +217,8 @@ struct sctp_sock {
v4mapped:1,
frag_interleave:1,
recvrcvinfo:1,
-   recvnxtinfo:1;
+   recvnxtinfo:1,
+   pending_data_ready:1;
 
atomic_t pd_mode;
/* Receive to here while partial delivery is in effect. */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 
7fe56d0acabf66cfd8fe29dfdb45f7620b470ac7..d06317de873090be359ce768fe291224ee50658f
 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1222,6 +1222,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_cmd_seq_t *commands,
gfp_t gfp)
 {
+   struct sock *sk = ep->base.sk;
+   struct sctp_sock *sp = sctp_sk(sk);
int error = 0;
int force;
sctp_cmd_t *cmd;
@@ -1742,6 +1744,11 @@ out:
error = sctp_outq_uncork(&asoc->outqueue, gfp);
} else if (local_cork)
error = sctp_outq_uncork(&asoc->outqueue, gfp);
+
+   if (sp->pending_data_ready) {
+   sk->sk_data_ready(sk);
+   sp->pending_data_ready = 0;
+   }
return error;
 nomem:
error = -ENOMEM;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 
ce469d648ffbe166f9ae1c5650f481256f31a7f8..72e5b3e41cddf9d79371de8ab01484e4601b97b6
 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -264,7 +264,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct 
sctp_ulpevent *event)
sctp_ulpq_clear_pd(ulpq);
 
if (queue == &sk->sk_receive_queue)
-   sk->sk_data_ready(sk);
+   sctp_sk(sk)->pending_data_ready = 1;
return 1;
 
 out_free:
@@ -1140,5 +1140,5 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 
/* If there is data waiting, send it up the socket now. */
if (sctp_ulpq_clear_pd(ulpq) || ev)
-   sk->sk_data_ready(sk);
+   sctp_sk(sk)->pending_data_ready = 1;
 }
-- 
2.5.0



Re: [patch net-next] devlink: share user_ptr pointer for both devlink and devlink_port

2016-04-08 Thread David Miller
From: Jiri Pirko 
Date: Fri,  8 Apr 2016 19:12:48 +0200

> From: Jiri Pirko 
> 
> Ptr to devlink structure can be easily obtained from
> devlink_port->devlink. So share user_ptr[0] pointer for both and leave
> user_ptr[1] free for other users.
> 
> Signed-off-by: Jiri Pirko 
> Reviewed-by: Ido Schimmel 

Applied, thanks again Jiri.


Re: [PATCH v4 2/2] RDS: fix congestion map corruption for PAGE_SIZE > 4k

2016-04-08 Thread santosh shilimkar

On 4/7/2016 4:57 AM, Shamir Rabinovitch wrote:

When PAGE_SIZE > 4k single page can contain 2 RDS fragments. If
'rds_ib_cong_recv' ignore the RDS fragment offset in to the page it
then read the data fragment as far congestion map update and lead to
corruption of the RDS connection far congestion map.

Signed-off-by: Shamir Rabinovitch 
---

Acked-by: Santosh Shilimkar 


Re: [patch net-next 0/6] mlxsw: small driver update + one tiny devlink dependency

2016-04-08 Thread David Miller
From: Jiri Pirko 
Date: Fri,  8 Apr 2016 19:11:19 +0200

> Cosmetics, in preparation to sharedbuffer patchset.
> First patch is here to allow patch number two.

Series applied, thanks Jiri.



[PATCH] mISDN: Fixing missing validation in base_sock_bind()

2016-04-08 Thread Emrah Demir
From: Emrah Demir 

Add validation code into mISDN/socket.c

Signed-off-by: Emrah Demir 
---
 drivers/isdn/mISDN/socket.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 0d29b5a..99e5f97 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -715,6 +715,9 @@ base_sock_bind(struct socket *sock, struct sockaddr *addr, 
int addr_len)
if (!maddr || maddr->family != AF_ISDN)
return -EINVAL;
 
+   if (addr_len < sizeof(struct sockaddr_mISDN))
+   return -EINVAL;
+
lock_sock(sk);
 
if (_pms(sk)->dev) {
-- 
2.8.0.rc3



Re: [PATCH v5 net-next 00/15] MTU/buffer reconfig changes

2016-04-08 Thread David Miller
From: Jakub Kicinski 
Date: Thu,  7 Apr 2016 19:39:33 +0100

> I re-discussed MPLS/MTU internally, dropped it from the patch 1,
> re-tested everything, found out I forgot about debugfs pointers,
> fixed that as well.
> 
> v5:
>  - don't reserve space in RX buffers for MPLS label stack
>(patch 1);
>  - fix debugfs pointers to ring structures (patch 5).
> v4:
>  - cut down on unrelated patches;
>  - don't "close" the device on error path.
> 
> --- v4 cover letter
> 
> Previous series included some not entirely related patches,
> this one is cut down.  Main issue I'm trying to solve here
> is that .ndo_change_mtu() in nfpvf driver is doing full
> close/open to reallocate buffers - which if open fails
> can result in device being basically closed even though
> the interface is started.  As suggested by you I try to move
> towards a paradigm where the resources are allocated first
> and the MTU change is only done once I'm certain (almost)
> nothing can fail.  Almost because I need to communicate 
> with FW and that can always time out.
> 
> Patch 1 fixes small issue.  Next 10 patches reorganize things
> so that I can easily allocate new rings and sets of buffers
> while the device is running.  Patches 13 and 15 reshape the
> .ndo_change_mtu() and ethtool's ring-resize operation into
> desired form.

Looks good, series applied, thanks!


Re: [PATCH v2] route: do not cache fib route info on local routes with oif

2016-04-08 Thread Julian Anastasov

Hello,

On Fri, 8 Apr 2016, Chris Friesen wrote:

> For local routes that require a particular output interface we do not want to
> cache the result.  Caching the result causes incorrect behaviour when there
> are
> multiple source addresses on the interface.  The end result being that if the
> intended recipient is waiting on that interface for the packet he won't
> receive
> it because it will be delivered on the loopback interface and the IP_PKTINFO
> ipi_ifindex will be set to the loopback interface as well.
> 
> This can be tested by running a program such as "dhcp_release" which attempts
> to inject a packet on a particular interface so that it is received by another
> program on the same board.  The receiving process should see an IP_PKTINFO
> ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback
> interface (e.g., lo).  The packet will still appear on the loopback interface
> in tcpdump but the important aspect is that the CMSG info is correct.
> 
> Sample dhcp_release command line:
> 
>dhcp_release eth1 192.168.204.222 02:11:33:22:44:66
> 
> Signed-off-by: Allain Legacy 
> Signed off-by: Chris Friesen 
> ---
>  net/ipv4/route.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 02c6229..437a377 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2045,6 +2045,18 @@ static struct rtable *__mkroute_output(const struct
> fib_result *res,

Your patch is corrupted. I was in the same trap
some time ago but with different client:

From Documentation/email-clients.txt:

Don't send patches with "format=flowed".  This can cause unexpected
and unwanted line breaks.

Anyways, the change looks good to me and I'll add my
Reviewed-by tag the next time.

>   */
>   if (fi && res->prefixlen < 4)
>   fi = NULL;
> + } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
> +(orig_oif != dev_out->ifindex)) {
> + /* For local routes that require a particular output interface
> +  * we do not want to cache the result.  Caching the result
> +  * causes incorrect behaviour when there are multiple source
> +  * addresses on the interface, the end result being that if
> the
> +  * intended recipient is waiting on that interface for the
> +  * packet he won't receive it because it will be delivered on
> +  * the loopback interface and the IP_PKTINFO ipi_ifindex will
> +  * be set to the loopback interface as well.
> +  */
> + fi = NULL;
>   }
> 
>   fnhe = NULL;

Regards

--
Julian Anastasov 


FROM: MR. OLIVER SENO!!

2016-04-08 Thread AKINWUMI
Dear Sir.

I bring you greetings. My name is Mr.Oliver Seno Lim, I am a staff of Abbey 
National Plc. London and heading our regional office in West Africa. Our late 
customer named Engr.Ben W.westland, made a fixed deposit amount of 
US$7Million.He did not declare any next of kin in any of his paper work, I want 
you as a foreigner to stand as the beneficiary to transfer this funds out of my 
bank into your account, after the successful transfer, we shall share in the 
ratio of 30% for you, 70%for me. Should you be interested please send me your 
information:

1,Full names.
2,current residential address.
3,Tele/Fax numbers./your work.
 
   
All I need from you is your readiness, trustworthiness and edication. Please 
email me directly on my private email address: officeose...@yahoo.com) so we 
can begin arrangements and I would give you more information on how we would 
handle this venture and once i hear from you i will give you information of the 
bank for the transferring funds on your name.

Regards,
Mr.Oliver Seno Lim 


Re: [PATCH V3] net: emac: emac gigabit ethernet controller driver

2016-04-08 Thread Timur Tabi

Andrew Lunn wrote:


There are two different things here. One is configuring the pin to be
a GPIO. The second is using the GPIO as a GPIO. In this case,
bit-banging the MDIO bus.

The firmware could be doing the configuration, setting the pin as a
GPIO. However, the firmware cannot be doing the MDIO bit-banging to
make an MDIO bus available. Linux has to do that.

Or it could be we have all completely misunderstood the hardware, and
we are not doing bit-banging GPIO MDIO. There is a real MDIO
controller there, we don't use these pins as GPIOs, etc


Actually, I think there is a misunderstanding.

On the FSM9900 SOC (which uses device-tree), the two pins that connect 
to the external PHY are gpio pins.  However, the driver needs to 
reprogram the pinmux so that those pins are wired to the Emac 
controller.  That's what the the gpio code in this driver is doing: it's 
just configuring the pins so that they connect directly between the Emac 
and the external PHY.  After that, they are no longer GPIO pins, and you 
cannot use the "GPIO controlled MDIO bus".  There is no MDIO controller 
on the SOC.  The external PHY is controlled directly from the Emac and 
also from the internal PHY.  It is screwy, I know, but that's what Gilad 
was trying to explain.


On the QDF2432 (which uses ACPI), those two wires are now dedicated. 
There are not muxed GPIOs any more -- they are hard wired between Emac 
and the external PHY.


In both cases, you need to use Emac registers to communicate with the 
external PHY.  Stuff like link detect and link speed are configured by 
programming the Emac and/or the internal phy.


And the internal phy isn't really an internal phy.  It's an SGMII-like 
device that's connected to the Emac and handles various phy-related 
tasks.  It has its own register block, but you still have to program it 
in concert with the Emac.  You can't really treat it separately.


So I'm beginning to believe that Gilad's driver is actually correct 
as-is.  There are a few minor bug fixes, but in general it's correct.  I 
would like to post a V4 soon that has those minor fixes.


--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum, a Linux Foundation collaborative project.


Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-08 Thread Jesper Dangaard Brouer
On Fri, 8 Apr 2016 10:02:00 -0700
Brenden Blanco  wrote:

> On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer 
> >  wrote:
> >   
> > > > +/* user return codes for PHYS_DEV prog type */
> > > > +enum bpf_phys_dev_action {
> > > > +   BPF_PHYS_DEV_DROP,
> > > > +   BPF_PHYS_DEV_OK,
> > > > +};
> > > 
> > > I can imagine these extra return codes:
> > > 
> > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > >  BPF_PHYS_DEV_STOLEN, /* E.g. forward use-case */
> > >  BPF_PHYS_DEV_SHARED, /* Queue for async processing, e.g. tcpdump 
> > > use-case */
> > > 
> > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > which we can look at when we get that far...  
> > 
> > I want to point out something which is quite FUNDAMENTAL, for
> > understanding these return codes (and network stack).
> > 
> > 
> > At driver RX time, the network stack basically have two ways of
> > building an SKB, which is send up the stack.
> > 
> > Option-A (fastest): The packet page is writable. The SKB can be
> > allocated and skb->data/head can point directly to the page.  And
> > we place/write skb_shared_info in the end/tail-room. (This is done by
> > calling build_skb()).
> > 
> > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > skb->data/head directly to the page, because skb_shared_info need to be
> > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > get around this, a separate piece of memory is allocated (speedup by
> > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > be written. (This is done when calling netdev/napi_alloc_skb()).
> >   Drivers then need to copy over packet headers, and assign + adjust
> > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > 
> > 
> > Unfortunately most drivers use option-B.  Due to cost of calling the
> > page allocator.  It is only slightly most expensive to get a larger
> > compound page from the page allocator, which then can be partitioned into
> > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > cost is added later, when constructing the SKB.
> >  Another reason for option-B, is that archs with expensive IOMMU
> > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > but only on the compound page level.
> > 
> > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > for option-B, when copying header data anyhow. For small packet, one
> > might as well free (or recycle) the RX page, if header size fits into
> > the newly allocated memory (for skb_shared_info).
> > 
> > 
> > For the early filter drop (DDoS use-case), it does not matter that the
> > packet-page is read-only.
> > 
> > BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
> > we ever want to see speeds comparable to DPDK, then drivers to
> > need to implement option-A, as this allow forwarding at the packet-page
> > level.
> > 
> > I hope, my future page-pool facility can remove/hide the cost calling
> > the page allocator.
> >   
> Can't wait! This will open up a lot of doors.
>

If you talk about the page-pool, then it is just once piece of the
puzzle, not the silver bullet ;-)

> > 
> > Back to the return codes, thus:
> > ---
> > BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
> > the SKB, and treat packet data as read-only.
> > 
> > BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.  
>
> I understand the driver/hw requirement, but the codes themselves I think
> need some tweaking.

I'm very open to changing these return codes. I'm just trying to open
up the discussion.


> For instance, if the packet is both modified and forwarded, should
> the flags be ORed together? 

I didn't see these as bit-flags. I assumed that if you want to forward
the packet, then you need to steal it (BPF_PHYS_DEV_STOLEN) and cannot
return it to the stack.

I'm open to changing this to bit-flags, BUT we just have to take care
not to introduce too many things we need to check, due to performance
issues.


> Or is the need for this return code made obsolete if the driver knows
> ahead of time via struct bpf_prog flags that the prog intends to
> modify the packet, and can set up the page accordingly?

Yes, maybe we can drop the modified (BPF_PHYS_DEV_MODIFIED) return code.
I was just thinking this could be used to indicate if the checksum
would need to be recalculated.  If the usual checksum people don't
care, we should drop this indication.

Think about it performance wise... if we know the program _can_ modify
(but don't know if it did so), then we would have mark the SKB to the
stack as the checksum needed to be recalculated, always...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
 

[PATCH] net: ipv6: Do not keep linklocal and loopback addresses

2016-04-08 Thread David Ahern
f1705ec197e7 added the option to retain user configured addresses on an
admin down. A comment to one of the later revisions suggested using the
IFA_F_PERMANENT flag rather than adding a user_managed boolean to the
ifaddr struct. A side effect of this change is that link local and
loopback addresses are also retained which is not part of the objective
of f1705ec197e7. Add check to drop those addresses.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")

Signed-off-by: David Ahern 
---
 net/ipv6/addrconf.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1afcf81..2dd8c1ca3287 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3438,6 +3438,12 @@ static void addrconf_type_change(struct net_device *dev, 
unsigned long event)
ipv6_mc_unmap(idev);
 }
 
+static bool addr_is_local(const struct in6_addr *addr)
+{
+   return ipv6_addr_type(addr) &
+   (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
+}
+
 static int addrconf_ifdown(struct net_device *dev, int how)
 {
struct net *net = dev_net(dev);
@@ -3495,7 +3501,8 @@ static int addrconf_ifdown(struct net_device *dev, int 
how)
 * address is retained on a down event
 */
if (!keep_addr ||
-   !(ifa->flags & IFA_F_PERMANENT)) {
+   !(ifa->flags & IFA_F_PERMANENT) ||
+   addr_is_local(&ifa->addr)) {
hlist_del_init_rcu(&ifa->addr_lst);
goto restart;
}
@@ -3544,7 +3551,8 @@ static int addrconf_ifdown(struct net_device *dev, int 
how)
write_unlock_bh(&idev->lock);
spin_lock_bh(&ifa->lock);
 
-   if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+   if (keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
+   !addr_is_local(&ifa->addr)) {
/* set state to skip the notifier below */
state = INET6_IFADDR_STATE_DEAD;
ifa->state = 0;
-- 
2.1.4



Re: [PATCH v2 1/5] net: w5100: move mmiowb into register access callbacks

2016-04-08 Thread Akinobu Mita
2016-04-08 1:29 GMT+09:00 David Miller :
>
> Where is your "[PATCH v2 0/5] ..." header posting explaing what this series
> is doing, at a high level, how it is doing that, and why it is doing it
> that way?
>
> This is mandator for patch series submissions.

I see.  I'll surely include the explanations at the v3 submission.


Re: [PATCH net] vxlan: synchronously and race-free destruction of vxlan sockets

2016-04-08 Thread Marcelo Ricardo Leitner
Hi Hannes,

On Thu, Apr 07, 2016 at 04:57:40PM +0200, Hannes Frederic Sowa wrote:
> Due to the fact that the udp socket is destructed asynchronously in a
> work queue, we have some nondeterministic behavior during shutdown of
> vxlan tunnels and creating new ones. Fix this by keeping the destruction
> process synchronous in regards to the user space process so IFF_UP can
> be reliably set.
> 
> udp_tunnel_sock_release destroys vs->sock->sk if reference counter
> indicates so. We expect to have the same lifetime of vxlan_sock and
> vxlan_sock->sock->sk even in fast paths with only rcu locks held. So
> only destruct the whole socket after we can be sure it cannot be found
> by searching vxlan_net->sock_list.
> 
> Cc: Jiri Benc 
> Signed-off-by: Hannes Frederic Sowa 
> ---
>  drivers/net/vxlan.c | 20 +++-
>  include/net/vxlan.h |  2 --
>  2 files changed, 3 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 1c0fa364323e28..487e48b7a53090 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -98,7 +98,6 @@ struct vxlan_fdb {
>  
>  /* salt for hash table */
>  static u32 vxlan_salt __read_mostly;
> -static struct workqueue_struct *vxlan_wq;
>  
>  static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
>  {
> @@ -1065,7 +1064,9 @@ static void __vxlan_sock_release(struct vxlan_sock *vs)
>   vxlan_notify_del_rx_port(vs);
>   spin_unlock(&vn->sock_lock);
>  
> - queue_work(vxlan_wq, &vs->del_work);
> + synchronize_rcu();

__vxlan_sock_release is called by vxlan_sock_release which is called by
vxlan_open/stop. Do we really want to have synchronize_rcu() while
holding rtnl?

> + udp_tunnel_sock_release(vs->sock);
> + kfree(vs);
>  }
>  
>  static void vxlan_sock_release(struct vxlan_dev *vxlan)
> @@ -2574,13 +2575,6 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
>   .get_link   = ethtool_op_get_link,
>  };
>  
> -static void vxlan_del_work(struct work_struct *work)
> -{
> - struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
> - udp_tunnel_sock_release(vs->sock);
> - kfree_rcu(vs, rcu);
> -}
> -
>  static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
>   __be16 port, u32 flags)
>  {
> @@ -2626,8 +2620,6 @@ static struct vxlan_sock *vxlan_socket_create(struct 
> net *net, bool ipv6,
>   for (h = 0; h < VNI_HASH_SIZE; ++h)
>   INIT_HLIST_HEAD(&vs->vni_list[h]);
>  
> - INIT_WORK(&vs->del_work, vxlan_del_work);
> -
>   sock = vxlan_create_sock(net, ipv6, port, flags);
>   if (IS_ERR(sock)) {
>   pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
> @@ -3218,10 +3210,6 @@ static int __init vxlan_init_module(void)
>  {
>   int rc;
>  
> - vxlan_wq = alloc_workqueue("vxlan", 0, 0);
> - if (!vxlan_wq)
> - return -ENOMEM;
> -
>   get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
>  
>   rc = register_pernet_subsys(&vxlan_net_ops);
> @@ -3242,7 +3230,6 @@ out3:
>  out2:
>   unregister_pernet_subsys(&vxlan_net_ops);
>  out1:
> - destroy_workqueue(vxlan_wq);
>   return rc;
>  }
>  late_initcall(vxlan_init_module);
> @@ -3251,7 +3238,6 @@ static void __exit vxlan_cleanup_module(void)
>  {
>   rtnl_link_unregister(&vxlan_link_ops);
>   unregister_netdevice_notifier(&vxlan_notifier_block);
> - destroy_workqueue(vxlan_wq);
>   unregister_pernet_subsys(&vxlan_net_ops);
>   /* rcu_barrier() is called by netns */
>  }
> diff --git a/include/net/vxlan.h b/include/net/vxlan.h
> index 73ed2e951c020d..2113f808e905a4 100644
> --- a/include/net/vxlan.h
> +++ b/include/net/vxlan.h
> @@ -126,9 +126,7 @@ struct vxlan_metadata {
>  /* per UDP socket information */
>  struct vxlan_sock {
>   struct hlist_node hlist;
> - struct work_struct del_work;
>   struct socket*sock;
> - struct rcu_head   rcu;
>   struct hlist_head vni_list[VNI_HASH_SIZE];
>   atomic_t  refcnt;
>   struct udp_offload udp_offloads;
> -- 
> 2.5.5
> 


[PATCH] drivers/net/ethernet/jme.c: Deinline jme_reset_mac_processor, save 2816 bytes

2016-04-08 Thread Denys Vlasenko
This function compiles to 895 bytes of machine code.

Clearly, this isn't a time-critical function.
For one, it has a number of udelay(1) calls.

Signed-off-by: Denys Vlasenko 
CC: David S. Miller 
CC: linux-ker...@vger.kernel.org
CC: netdev@vger.kernel.org
---
 drivers/net/ethernet/jme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index 3ddf657..711cb19 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -222,7 +222,7 @@ jme_clear_ghc_reset(struct jme_adapter *jme)
jwrite32f(jme, JME_GHC, jme->reg_ghc);
 }
 
-static inline void
+static void
 jme_reset_mac_processor(struct jme_adapter *jme)
 {
static const u32 mask[WAKEUP_FRAME_MASK_DWNR] = {0, 0, 0, 0};
-- 
2.1.0



Re: [PATCH net-next] net: bcmgenet: add BQL support

2016-04-08 Thread Eric Dumazet
On Fri, 2016-04-08 at 09:54 -0700, Petri Gynther wrote:
> On Wed, Apr 6, 2016 at 1:25 PM, Florian Fainelli  wrote:
> >
> > 2016-04-05 17:50 GMT-07:00 Petri Gynther :
> > > Add Byte Queue Limits (BQL) support to bcmgenet driver.
> > >
> > > Signed-off-by: Petri Gynther 
> >
> > Signed-off-by: Florian Fainelli 
> >
> > Thanks!
> > --
> > Florian
> 
> Any further comments?
> 
> Notable difference from some other drivers --
> netdev_tx_reset_queue(txq) is called for all queues in
> bcmgenet_netif_start(), just before netif_tx_start_all_queues(dev).
> This is to ensure that BQL is reset before the interface becomes
> operational.
> 
> I think that is the right place for these calls.
> 
> Some other drivers call it from the "interface down" path.

BQL is ready to go at device setup :

__QUEUE_STATE_STACK_XOFF is not set

dql_reset() was called from dql_init(), called from
netdev_init_one_queue()





Re: [PATCH] ieee802154/adf7242: fix memory leak of firmware

2016-04-08 Thread Marcel Holtmann
Hi Sudip,

> If the firmware upload or the firmware verification fails then we
> printed the error message and exited but we missed releasing the
> firmware.
> 
> Signed-off-by: Sudip Mukherjee 
> ---
> drivers/net/ieee802154/adf7242.c | 2 ++
> 1 file changed, 2 insertions(+)

patch has been applied to bluetooth-next tree.

Regards

Marcel



Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter

2016-04-08 Thread Alexei Starovoitov
On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> 
> On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer  
> wrote:
> 
> > > +/* user return codes for PHYS_DEV prog type */
> > > +enum bpf_phys_dev_action {
> > > + BPF_PHYS_DEV_DROP,
> > > + BPF_PHYS_DEV_OK,
> > > +};  
> > 
> > I can imagine these extra return codes:
> > 
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN, /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED, /* Queue for async processing, e.g. tcpdump 
> > use-case */
> > 
> > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > which we can look at when we get that far...
> 
> I want to point out something which is quite FUNDAMENTAL, for
> understanding these return codes (and network stack).
> 
> 
> At driver RX time, the network stack basically have two ways of
> building an SKB, which is send up the stack.
> 
> Option-A (fastest): The packet page is writable. The SKB can be
> allocated and skb->data/head can point directly to the page.  And
> we place/write skb_shared_info in the end/tail-room. (This is done by
> calling build_skb()).
> 
> Option-B (slower): The packet page is read-only.  The SKB cannot point
> skb->data/head directly to the page, because skb_shared_info need to be
> written into skb->end (slightly hidden via skb_shinfo() casting).  To
> get around this, a separate piece of memory is allocated (speedup by
> __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> be written. (This is done when calling netdev/napi_alloc_skb()).
>   Drivers then need to copy over packet headers, and assign + adjust
> skb_shinfo(skb)->frags[0] offset to skip copied headers.
> 
> 
> Unfortunately most drivers use option-B.  Due to cost of calling the
> page allocator.  It is only slightly most expensive to get a larger
> compound page from the page allocator, which then can be partitioned into
> page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> cost is added later, when constructing the SKB.
>  Another reason for option-B, is that archs with expensive IOMMU
> requirements (like PowerPC), don't need to dma_unmap on every packet,
> but only on the compound page level.
> 
> Side-note: Most drivers have a "copy-break" optimization.  Especially
> for option-B, when copying header data anyhow. For small packet, one
> might as well free (or recycle) the RX page, if header size fits into
> the newly allocated memory (for skb_shared_info).

I think you guys are going into overdesign territory, so
. nack on read-only pages
. nack on copy-break approach
. nack on per-ring programs
. nack on modified/stolen/shared return codes

The whole thing must be dead simple to use. Above is not simple by any means.
The programs must see writeable pages only and return codes:
drop, pass to stack, redirect to xmit.
If program wishes to modify packets before passing it to stack, it
shouldn't need to deal with different return values.
No special things to deal with small or large packets. No header splits.
Program must not be aware of any such things.
Drivers can use DMA_BIDIRECTIONAL to allow received page to be
modified by the program and immediately sent to xmit. 
No dma map/unmap/sync per packet. If some odd architectures/dma setups
cannot do it, then XDP will not be applicable there.
We are not going to sacrifice performance for generality.



Re: [PATCH] net: thunderx: Fix broken of_node_put() code.

2016-04-08 Thread David Daney
Due to mail server malfunction, this patch was sent twice.  Please 
ignore this duplicate.


Thanks,
David Daney


On 03/31/2016 06:01 PM, David Daney wrote:

From: David Daney 

commit b7d3e3d3d21a ("net: thunderx: Don't leak phy device references
on -EPROBE_DEFER condition.") incorrectly moved the call to
of_node_put() outside of the loop.  Under normal loop exit, the node
has already had of_node_put() called, so the extra call results in:

[8.228020] ERROR: Bad of_node_put() on 
/soc@0/pci@8480/mrml-bridge0@1,0/bgx0/xlaui00
[8.239433] CPU: 16 PID: 608 Comm: systemd-udevd Not tainted 4.6.0-rc1-numa+ 
#157
[8.247380] Hardware name: www.cavium.com EBB8800/EBB8800, BIOS 0.3 Mar  2 
2016
[8.273541] Call trace:
[8.273550] [] dump_backtrace+0x0/0x210
[8.273557] [] show_stack+0x24/0x2c
[8.273560] [] dump_stack+0x8c/0xb4
[8.273566] [] of_node_release+0xa8/0xac
[8.273570] [] kobject_cleanup+0x8c/0x194
[8.273573] [] kobject_put+0x44/0x6c
[8.273576] [] of_node_put+0x24/0x30
[8.273587] [] bgx_probe+0x17c/0xcd8 [thunder_bgx]
[8.273591] [] pci_device_probe+0xa0/0x114
[8.273596] [] driver_probe_device+0x178/0x418
[8.273599] [] __driver_attach+0x100/0x118
[8.273602] [] bus_for_each_dev+0x6c/0xac
[8.273605] [] driver_attach+0x30/0x38
[8.273608] [] bus_add_driver+0x1f8/0x29c
[8.273611] [] driver_register+0x70/0x110
[8.273617] [] __pci_register_driver+0x60/0x6c
[8.273623] [] bgx_init_module+0x40/0x48 [thunder_bgx]
[8.273626] [] do_one_initcall+0xcc/0x1c0
[8.273631] [] do_init_module+0x68/0x1c8
[8.273635] [] load_module+0xf44/0x11f4
[8.273638] [] SyS_finit_module+0xb8/0xe0
[8.273641] [] el0_svc_naked+0x24/0x28

Go back to the previous (correct) code that only did the extra
of_node_put() call on early exit from the loop.

Signed-off-by: David Daney 
---
  drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c 
b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 9679515..d20539a 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -1011,10 +1011,11 @@ static int bgx_init_of_phy(struct bgx *bgx)
}

lmac++;
-   if (lmac == MAX_LMAC_PER_BGX)
+   if (lmac == MAX_LMAC_PER_BGX) {
+   of_node_put(node);
break;
+   }
}
-   of_node_put(node);
return 0;

  defer:





Re: [patch net-next 0/5] mlxsw: small driver update

2016-04-08 Thread David Miller
From: Jiri Pirko 
Date: Fri, 8 Apr 2016 17:51:55 +0200

> Fri, Apr 08, 2016 at 05:45:20PM CEST, j...@resnulli.us wrote:
>>From: Jiri Pirko 
>>
>>Cosmetics, in preparation to sharedbuffer patchset.
> 
> Dave, I just realized there is dependency on:
> "devlink: remove implicit type set in port register" which I sent couple
> of minutes after this patchset. I can either resend in bulk, or if you
> could apply in order, that would be great.

The devlink series also lacked a header posting.  Can you just sort this
all out properly and respin everything?

Thanks.

> Thanks and sorry, owe you another beer :)

:-)


Re: [PATCH net-next] ipv6, token: allow for clearing the current device token

2016-04-08 Thread Daniel Borkmann

On 04/08/2016 05:36 PM, Hannes Frederic Sowa wrote:

On 08.04.2016 17:25, Bjørn Mork wrote:

Hannes Frederic Sowa  writes:

On Fri, Apr 8, 2016, at 16:18, Bjørn Mork wrote:

Daniel Borkmann  writes:


  if (!token)
  return -EINVAL;
-if (ipv6_addr_any(token))
-return -EINVAL;
  if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
  return -EINVAL;


Not directly related to the patch in question.  It just made me aware of
this restriction...

I realize that I'm a few years late here, but what's with the IFF_NOARP?
Is that just because we can't do DAD for the token based addresses?  How
is that different from manually configuring the whole address?


IFF_NOARP is kind of the equivalent to no neighbor discovery. If you set
a token and never get in a router advertisement you never create a
tokenized ip address, thus the feature is useless.


You can get router advertisements with IFF_NOARP. You cannot lookup L2
addresses, but the L3 prefix info is still as useful as with any other
interface.


Of course router advertisements can be sent and received with IFF_NOARP and 
probably we act on them as usual, as you showed. Looking at the source, we don't 
really specify what those flags mean/do for IPv6. So I think you can assume 
that it is in there because of history.

I would absolutely not mind if you remove the limitation for IFF_NOARP.


Agreed me neither, the code should be able to handle it as far as I see.

Thanks,
Daniel


  1   2   >