[PATCH net-next] selftests: net: Clean up an unused variable

2018-10-05 Thread Jakub Sitnicki
Address compiler warning:

ip_defrag.c: In function 'send_udp_frags':
ip_defrag.c:206:16: warning: unused variable 'udphdr' [-Wunused-variable]
  struct udphdr udphdr;
^~

Signed-off-by: Jakub Sitnicki 
---
 tools/testing/selftests/net/ip_defrag.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/testing/selftests/net/ip_defrag.c 
b/tools/testing/selftests/net/ip_defrag.c
index 2366dc6bce71..61ae2782388e 100644
--- a/tools/testing/selftests/net/ip_defrag.c
+++ b/tools/testing/selftests/net/ip_defrag.c
@@ -203,7 +203,6 @@ static void send_udp_frags(int fd_raw, struct sockaddr 
*addr,
 {
struct ip *iphdr = (struct ip *)ip_frame;
struct ip6_hdr *ip6hdr = (struct ip6_hdr *)ip_frame;
-   struct udphdr udphdr;
int res;
int offset;
int frag_len;
-- 
2.17.1



[PATCH net-next] socket: Tighten no-error check in bind()

2018-10-04 Thread Jakub Sitnicki
move_addr_to_kernel() returns only negative values on error, or zero on
success. Rewrite the error check to an idiomatic form to avoid confusing
the reader.

Signed-off-by: Jakub Sitnicki 
---
 net/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/socket.c b/net/socket.c
index 01f3f8f32d6f..713dc4833d40 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1475,7 +1475,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, 
int addrlen)
sock = sockfd_lookup_light(fd, , _needed);
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, );
-   if (err >= 0) {
+   if (!err) {
err = security_socket_bind(sock,
   (struct sockaddr *),
   addrlen);
-- 
2.17.1



Re: [RFC net-next] ipv4: Don't promote secondaries when flushing addresses

2018-06-07 Thread Jakub Sitnicki
On Thu, 7 Jun 2018 13:00:29 +0200
Michal Kubecek  wrote:

> On Thu, Jun 07, 2018 at 12:13:01PM +0200, Jakub Sitnicki wrote:
> > Promoting secondary addresses on address removal makes flushing all
> > addresses from a device with 1000's of them slow. This is because we
> > cannot take down the secondary addresses when we are removing the
> > primary one, which would make it faster.
> > 
> > However, the userspace, when performing a flush, will in the end remove
> > all the addresses regardless of secondary address promotion taking
> > place. Unfortunately the kernel currently cannot distinguish between a
> > single address removal and a flush of all addresses.
> > 
> > To help with this case introduce a IFA_F_FLUSH flag that can be used by
> > userspace to signal that a removal operation is being done because of a
> > flush. When the flag is set, don't bother with secondary address
> > promotion as we expect that secondary addresses will be removed soon as
> > well.  
> 
> Unless you intend to use the flag to allow deleting a specific address
> with its secondaries (overriding promote_secondaries), maybe it would
> be more practical to go even further and delete all addresses on the
> interface if IFA_F_FLUSH is set so that userspace could delete all
> addresses with one request.

Thanks for input, Michal. The intend as I understand it is to make
flushing all the addresses fast(er). Let me see if I can rework it
according to your suggestion. It does make more sense to do it like
that to me too.

Thanks,
Jakub



[RFC net-next] ipv4: Don't promote secondaries when flushing addresses

2018-06-07 Thread Jakub Sitnicki
Promoting secondary addresses on address removal makes flushing all
addresses from a device with 1000's of them slow. This is because we
cannot take down the secondary addresses when we are removing the
primary one, which would make it faster.

However, the userspace, when performing a flush, will in the end remove
all the addresses regardless of secondary address promotion taking
place. Unfortunately the kernel currently cannot distinguish between a
single address removal and a flush of all addresses.

To help with this case introduce a IFA_F_FLUSH flag that can be used by
userspace to signal that a removal operation is being done because of a
flush. When the flag is set, don't bother with secondary address
promotion as we expect that secondary addresses will be removed soon as
well.

Signed-off-by: Jakub Sitnicki 

---

A benchmark involving a flush of 40,000 addresses from a dummy device
shows a x4 speed-up of the 'flush' operation. 'ip' had to be modified to
set the IFA_F_FLUSH flag for RTM_DELADDR requests issued for the
'flush':

  # time $IP -stats addr flush dev dum0

Before:

  real0m30.596s
  user0m0.000s
  sys 0m30.567s

After:

  real0m7.601s
  user0m0.000s
  sys 0m7.569s

It's also worth noting that promote_secondaries sysctl param is enabled by
default since systemd 216 thus making it the new "normal" on some distros.


 include/uapi/linux/if_addr.h |  1 +
 net/ipv4/devinet.c   | 14 ++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index ebaf5701c9db..19aab9a9cec5 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -54,6 +54,7 @@ enum {
 #define IFA_F_NOPREFIXROUTE0x200
 #define IFA_F_MCAUTOJOIN   0x400
 #define IFA_F_STABLE_PRIVACY   0x800
+#define IFA_F_FLUSH0x1000

 struct ifa_cacheinfo {
__u32   ifa_prefered;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..1f436e1e5222 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -331,13 +331,14 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, 
__be32 b)
 }

 static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
-int destroy, struct nlmsghdr *nlh, u32 portid)
+  int destroy, struct nlmsghdr *nlh, u32 portid,
+  bool flush)
 {
struct in_ifaddr *promote = NULL;
struct in_ifaddr *ifa, *ifa1 = *ifap;
struct in_ifaddr *last_prim = in_dev->ifa_list;
struct in_ifaddr *prev_prom = NULL;
-   int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
+   int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev) && !flush;

ASSERT_RTNL();

@@ -437,7 +438,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct 
in_ifaddr **ifap,
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 int destroy)
 {
-   __inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
+   __inet_del_ifa(in_dev, ifap, destroy, NULL, 0, false);
 }

 static void check_lifetime(struct work_struct *work);
@@ -607,6 +608,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct 
nlmsghdr *nlh,
struct in_device *in_dev;
struct ifaddrmsg *ifm;
struct in_ifaddr *ifa, **ifap;
+   bool flush = false;
int err = -EINVAL;

ASSERT_RTNL();
@@ -623,6 +625,9 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct 
nlmsghdr *nlh,
goto errout;
}

+   if (tb[IFA_FLAGS])
+   flush = !!(nla_get_u32(tb[IFA_FLAGS]) & IFA_F_FLUSH);
+
for (ifap = _dev->ifa_list; (ifa = *ifap) != NULL;
 ifap = >ifa_next) {
if (tb[IFA_LOCAL] &&
@@ -639,7 +644,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct 
nlmsghdr *nlh,

if (ipv4_is_multicast(ifa->ifa_address))
ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
-   __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
+   __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid,
+  flush);
return 0;
}

--
2.14.4


Re: [PATCH net] ipv6: fix uninit-value in ip6_multipath_l3_keys()

2018-04-30 Thread Jakub Sitnicki
On Sun, 29 Apr 2018 09:54:59 -0700
Eric Dumazet <eduma...@google.com> wrote:

> syzbot/KMSAN reported an uninit-value in ip6_multipath_l3_keys(),
> root caused to a bad assumption of ICMP header being already
> pulled in skb->head
> 
> ip_multipath_l3_keys() does the correct thing, so it is an IPv6 only bug.
> 
> BUG: KMSAN: uninit-value in ip6_multipath_l3_keys net/ipv6/route.c:1830 
> [inline]
> BUG: KMSAN: uninit-value in rt6_multipath_hash+0x5c4/0x640 
> net/ipv6/route.c:1858
> CPU: 0 PID: 4507 Comm: syz-executor661 Not tainted 4.16.0+ #87
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x185/0x1d0 lib/dump_stack.c:53
>  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
>  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
>  ip6_multipath_l3_keys net/ipv6/route.c:1830 [inline]
>  rt6_multipath_hash+0x5c4/0x640 net/ipv6/route.c:1858
>  ip6_route_input+0x65a/0x920 net/ipv6/route.c:1884
>  ip6_rcv_finish+0x413/0x6e0 net/ipv6/ip6_input.c:69
>  NF_HOOK include/linux/netfilter.h:288 [inline]
>  ipv6_rcv+0x1e16/0x2340 net/ipv6/ip6_input.c:208
>  __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4562
>  __netif_receive_skb net/core/dev.c:4627 [inline]
>  netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701
>  netif_receive_skb+0x230/0x240 net/core/dev.c:4725
>  tun_rx_batched drivers/net/tun.c:1555 [inline]
>  tun_get_user+0x740f/0x7c60 drivers/net/tun.c:1962
>  tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
>  call_write_iter include/linux/fs.h:1782 [inline]
>  new_sync_write fs/read_write.c:469 [inline]
>  __vfs_write+0x7fb/0x9f0 fs/read_write.c:482
>  vfs_write+0x463/0x8d0 fs/read_write.c:544
>  SYSC_write+0x172/0x360 fs/read_write.c:589
>  SyS_write+0x55/0x80 fs/read_write.c:581
>  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> 
> Fixes: 23aebdacb05d ("ipv6: Compute multipath hash for ICMP errors from 
> offending packet")
> Signed-off-by: Eric Dumazet <eduma...@google.com>
> Reported-by: syzbot <syzkal...@googlegroups.com>
> Cc: Jakub Sitnicki <j...@redhat.com>
> ---
>  net/ipv6/route.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 
> cde7d8251377c1a115e02c46843d361d3c0b4313..f4d61736c41abe8cd7f439c4a37100e90c1eacca
>  100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1835,11 +1835,16 @@ static void ip6_multipath_l3_keys(const struct 
> sk_buff *skb,
>   const struct ipv6hdr *inner_iph;
>   const struct icmp6hdr *icmph;
>   struct ipv6hdr _inner_iph;
> + struct icmp6hdr _icmph;
>  
>   if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
>   goto out;
>  
> - icmph = icmp6_hdr(skb);
> + icmph = skb_header_pointer(skb, skb_transport_offset(skb),
> +sizeof(_icmph), &_icmph);
> + if (!icmph)
> +     goto out;
> +
>   if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
>   icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
>   icmph->icmp6_type != ICMPV6_TIME_EXCEED &&

Thank you for the fix. I was not being careful there.

Acked-by: Jakub Sitnicki <j...@redhat.com>


[iproute PATCH] iproute: Abort if nexthop cannot be parsed

2018-04-11 Thread Jakub Sitnicki
Attempt to add a multipath route where a nexthop definition refers to a
non-existent device causes 'ip' to crash and burn due to stack buffer
overflow:

  # ip -6 route add fd00::1/64 nexthop dev fake1
  Cannot find device "fake1"
  Cannot find device "fake1"
  Cannot find device "fake1"
  ...
  Segmentation fault (core dumped)

Don't ignore errors from the helper routine that parses the nexthop
definition, and abort immediately if parsing fails.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 ip/iproute.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 1d8fd815..44351bc5 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -1038,7 +1038,10 @@ static int parse_nexthops(struct nlmsghdr *n, struct 
rtmsg *r,
memset(rtnh, 0, sizeof(*rtnh));
rtnh->rtnh_len = sizeof(*rtnh);
rta->rta_len += rtnh->rtnh_len;
-   parse_one_nh(n, r, rta, rtnh, , );
+   if (parse_one_nh(n, r, rta, rtnh, , )) {
+   fprintf(stderr, "Error: cannot parse nexthop\n");
+   exit(-1);
+   }
rtnh = RTNH_NEXT(rtnh);
}
 
-- 
2.14.3



Re: [PATCH net] geneve: Fix function matching VNI and tunnel ID on big-endian

2017-10-19 Thread Jakub Sitnicki
On Thu, 19 Oct 2017 13:31:28 +0200
Stefano Brivio <sbri...@redhat.com> wrote:

> On big-endian machines, functions converting between tunnel ID
> and VNI use the three LSBs of tunnel ID storage to map VNI.
> 
> The comparison function eq_tun_id_and_vni(), on the other hand,
> attempted to map the VNI from the three MSBs. Fix it by using
> the same check implemented on LE, which maps VNI from the three
> LSBs of tunnel ID.
> 
> Fixes: 2e0b26e10352 ("geneve: Optimize geneve device lookup.")
> Signed-off-by: Stefano Brivio <sbri...@redhat.com>

Thanks, Stefano.

Reviewed-by: Jakub Sitnicki <j...@redhat.com>


[PATCH net-next v3 4/4] ipv6: Use multipath hash from flow info if available

2017-08-23 Thread Jakub Sitnicki
Allow our callers to influence the choice of ECMP link by honoring the
hash passed together with the flow info. This allows for special
treatment of ICMP errors which we would like to route over the same path
as the IPv6 datagram that triggered the error.

Also go through rt6_multipath_hash(), in the usual case when we aren't
dealing with an ICMP error, so that there is one central place where
multipath hash is computed.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/route.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 246e7d7..4d02734 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -452,7 +452,13 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
struct rt6_info *sibling, *next_sibling;
int route_choosen;
 
-   route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
+   /* We might have already computed the hash for ICMPv6 errors. In such
+* case it will always be non-zero. Otherwise now is the time to do it.
+*/
+   if (!fl6->mp_hash)
+   fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
+
+   route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
 * (siblings does not include ourself)
 */
-- 
2.9.4



[PATCH net-next v3 0/4] Route ICMPv6 errors with the flow when ECMP in use

2017-08-23 Thread Jakub Sitnicki
This patch set is another take at making Path MTU Discovery work when
server nodes are behind a router employing multipath routing in a
load-balance or anycast setup (that is, when not every end-node can be
reached by every path). The problem has been well described in RFC 7690
[1], but in short - in such setups ICMPv6 PTB errors are not guaranteed
to be routed back to the server node that sent a reply that exceeds path
MTU.

The proposed solution is two-fold:

 (1) on the server side - reflect the Flow Label [2]. This can be done
 without modifying the application using a new per-netns sysctl knob
 that has been proposed independently of this patchset in the patch
 entitled "ipv6: Add sysctl for per namespace flow label
 reflection" [3].

 (2) on the ECMP router - make the ipv6 routing subsystem look into the
 ICMPv6 error packets and compute the flow-hash from its payload,
 i.e. the offending packet that triggered the error. This is the
 same behavior as ipv4 stack has already.

With both parts in place Path MTU Discovery can work past the ECMP
router when using IPv6.

[1] https://tools.ietf.org/html/rfc7690
[2] https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
[3] http://patchwork.ozlabs.org/patch/804870/

v1 -> v2:
 - don't use "extern" in external function declaration in header file
 - style change, put as many arguments as possible on the first line of
   a function call, and align consecutive lines to the first argument
 - expand the cover letter based on the feedback

v2 -> v3:
 - switch to computing flow-hash using flow dissector to align with
   recent changes to multipath routing in ipv4 stack
 - add a sysctl knob for enabling flow label reflection per netns

---

Testing has covered multipath routing of ICMPv6 PTB errors in forward
and local output path in a simple use-case of an HTTP server sending a
reply which is over the path MTU size [3]. I have also checked if the
flows get evenly spread over multiple paths (i.e. if there are no
regressions) [4].

[3] https://github.com/jsitnicki/tools/tree/master/net/tests/ecmp/pmtud
[4] https://github.com/jsitnicki/tools/tree/master/net/tests/ecmp/load-balance


Jakub Sitnicki (4):
  net: Extend struct flowi6 with multipath hash
  ipv6: Compute multipath hash for ICMP errors from offending packet
  ipv6: Fold rt6_info_hash_nhsfn() into its only caller
  ipv6: Use multipath hash from flow info if available

 include/net/flow.h  |  1 +
 include/net/ip6_route.h |  1 +
 net/ipv6/icmp.c |  1 +
 net/ipv6/route.c| 68 +
 4 files changed, 60 insertions(+), 11 deletions(-)

-- 
2.9.4



[PATCH net-next v3 3/4] ipv6: Fold rt6_info_hash_nhsfn() into its only caller

2017-08-23 Thread Jakub Sitnicki
Commit 644d0e656958 ("ipv6 Use get_hash_from_flowi6 for rt6 hash") has
turned rt6_info_hash_nhsfn() into a one-liner, so it no longer makes
sense to keep it around. Also remove the accompanying comment that has
become outdated.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/route.c | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6c4dd57..246e7d7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -445,16 +445,6 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-  const struct flowi6 *fl6)
-{
-   return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 struct flowi6 *fl6, int oif,
 int strict)
@@ -462,7 +452,7 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
struct rt6_info *sibling, *next_sibling;
int route_choosen;
 
-   route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+   route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
 * (siblings does not include ourself)
 */
-- 
2.9.4



[PATCH net-next v3 1/4] net: Extend struct flowi6 with multipath hash

2017-08-23 Thread Jakub Sitnicki
Allow for functions that fill out the IPv6 flow info to also pass a hash
computed over the skb contents. The hash value will drive the multipath
routing decisions.

This is intended for special treatment of ICMPv6 errors, where we would
like to make a routing decision based on the flow identifying the
offending IPv6 datagram that triggered the error, rather than the flow
of the ICMP error itself.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 include/net/flow.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/flow.h b/include/net/flow.h
index f3dc61b..eb60cee3 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -149,6 +149,7 @@ struct flowi6 {
 #define fl6_ipsec_spi  uli.spi
 #define fl6_mh_typeuli.mht.type
 #define fl6_gre_keyuli.gre_key
+   __u32   mp_hash;
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 struct flowidn {
-- 
2.9.4



[PATCH net-next v3 2/4] ipv6: Compute multipath hash for ICMP errors from offending packet

2017-08-23 Thread Jakub Sitnicki
When forwarding or sending out an ICMPv6 error, look at the embedded
packet that triggered the error and compute a flow hash over its
headers.

This let's us route the ICMP error together with the flow it belongs to
when multipath (ECMP) routing is in use, which in turn makes Path MTU
Discovery work in ECMP load-balanced or anycast setups (RFC 7690).

Granted, end-hosts behind the ECMP router (aka servers) need to reflect
the IPv6 Flow Label for PMTUD to work.

The code is organized to be in parallel with ipv4 stack:

  ip_multipath_l3_keys -> ip6_multipath_l3_keys
  fib_multipath_hash   -> rt6_multipath_hash

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 include/net/ip6_route.h |  1 +
 net/ipv6/icmp.c |  1 +
 net/ipv6/route.c| 50 +
 3 files changed, 52 insertions(+)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 907d39a..882bc3c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -115,6 +115,7 @@ static inline int ip6_route_get_saddr(struct net *net, 
struct rt6_info *rt,
 
 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif, int flags);
+u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb);
 
 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 4f82830..dd7608c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -519,6 +519,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 
code, __u32 info,
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
fl6.flowi6_uid = sock_net_uid(net, NULL);
+   fl6.mp_hash = rt6_multipath_hash(, skb);
security_skb_classify_flow(skb, flowi6_to_flowi());
 
sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9b02064..6c4dd57 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1214,6 +1214,54 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
 }
 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
+static void ip6_multipath_l3_keys(const struct sk_buff *skb,
+ struct flow_keys *keys)
+{
+   const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
+   const struct ipv6hdr *key_iph = outer_iph;
+   const struct ipv6hdr *inner_iph;
+   const struct icmp6hdr *icmph;
+   struct ipv6hdr _inner_iph;
+
+   if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
+   goto out;
+
+   icmph = icmp6_hdr(skb);
+   if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
+   icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
+   icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
+   icmph->icmp6_type != ICMPV6_PARAMPROB)
+   goto out;
+
+   inner_iph = skb_header_pointer(skb,
+  skb_transport_offset(skb) + 
sizeof(*icmph),
+  sizeof(_inner_iph), &_inner_iph);
+   if (!inner_iph)
+   goto out;
+
+   key_iph = inner_iph;
+out:
+   memset(keys, 0, sizeof(*keys));
+   keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+   keys->addrs.v6addrs.src = key_iph->saddr;
+   keys->addrs.v6addrs.dst = key_iph->daddr;
+   keys->tags.flow_label = ip6_flowinfo(key_iph);
+   keys->basic.ip_proto = key_iph->nexthdr;
+}
+
+/* if skb is set it will be used and fl6 can be NULL */
+u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
+{
+   struct flow_keys hash_keys;
+
+   if (skb) {
+   ip6_multipath_l3_keys(skb, _keys);
+   return flow_hash_from_keys(_keys);
+   }
+
+   return get_hash_from_flowi6(fl6);
+}
+
 void ip6_route_input(struct sk_buff *skb)
 {
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -1232,6 +1280,8 @@ void ip6_route_input(struct sk_buff *skb)
tun_info = skb_tunnel_info(skb);
if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+   if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
+   fl6.mp_hash = rt6_multipath_hash(, skb);
skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, , flags));
 }
-- 
2.9.4



[PATCH net-next] ipv6: Add sysctl for per namespace flow label reflection

2017-08-23 Thread Jakub Sitnicki
Reflecting IPv6 Flow Label at server nodes is useful in environments
that employ multipath routing to load balance the requests. As "IPv6
Flow Label Reflection" standard draft [1] points out - ICMPv6 PTB error
messages generated in response to a downstream packets from the server
can be routed by a load balancer back to the original server without
looking at transport headers, if the server applies the flow label
reflection. This enables the Path MTU Discovery past the ECMP router in
load-balance or anycast environments where each server node is reachable
by only one path.

Introduce a sysctl to enable flow label reflection per net namespace for
all newly created sockets. Same could be earlier achieved only per
socket by setting the IPV6_FL_F_REFLECT flag for the IPV6_FLOWLABEL_MGR
socket option.

[1] https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 Documentation/networking/ip-sysctl.txt | 9 +
 include/net/netns/ipv6.h   | 1 +
 net/ipv6/af_inet6.c| 1 +
 net/ipv6/sysctl_net_ipv6.c | 8 
 4 files changed, 19 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 84c9b8c..6b0bc0f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1350,6 +1350,15 @@ flowlabel_state_ranges - BOOLEAN
FALSE: disabled
Default: true
 
+flowlabel_reflect - BOOLEAN
+   Automatically reflect the flow label. Needed for Path MTU
+   Discovery to work with Equal Cost Multipath Routing in anycast
+   environments. See RFC 7690 and:
+   https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
+   TRUE: enabled
+   FALSE: disabled
+   Default: FALSE
+
 anycast_src_echo_reply - BOOLEAN
Controls the use of anycast addresses as source addresses for ICMPv6
echo reply
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 0e50bf3..2544f97 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
int idgen_retries;
int idgen_delay;
int flowlabel_state_ranges;
+   int flowlabel_reflect;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3b58ee7..fe5262f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -211,6 +211,7 @@ static int inet6_create(struct net *net, struct socket 
*sock, int protocol,
np->mc_loop = 1;
np->pmtudisc= IPV6_PMTUDISC_WANT;
np->autoflowlabel = ip6_default_np_autolabel(net);
+   np->repflow = net->ipv6.sysctl.flowlabel_reflect;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
 
/* Init the ipv4 part of the socket since we can have sockets
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 69c50e7..6fbf8ae 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -90,6 +90,13 @@ static struct ctl_table ipv6_table_template[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec
},
+   {
+   .procname   = "flowlabel_reflect",
+   .data   = _net.ipv6.sysctl.flowlabel_reflect,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   },
{ }
 };
 
@@ -149,6 +156,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
ipv6_table[6].data = >ipv6.sysctl.idgen_delay;
ipv6_table[7].data = >ipv6.sysctl.flowlabel_state_ranges;
ipv6_table[8].data = >ipv6.sysctl.ip_nonlocal_bind;
+   ipv6_table[9].data = >ipv6.sysctl.flowlabel_reflect;
 
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
-- 
2.9.4



Re: [PATCH net] xfrm: Clear sk_dst_cache when applying per-socket policy.

2017-08-16 Thread Jakub Sitnicki
On Wed, 16 Aug 2017 03:43:54 -0700
Eric Dumazet <eric.duma...@gmail.com> wrote:

> On Wed, 2017-08-16 at 11:03 +0200, Jakub Sitnicki wrote:
> > On Tue, 15 Aug 2017 15:25:10 -0700
> > Jonathan Basseri <misterik...@google.com> wrote:
> >   
> > > If an IPv6 socket has a valid dst cache, then xfrm_lookup_route will get
> > > skipped. However, the cache is not invalidated when applying policy to a
> > > socket (i.e. IPV6_XFRM_POLICY). The result is that new policies are
> > > sometimes ignored on those sockets.
> > > 
> > > This can be demonstrated like so,
> > > 1. Create UDPv6 socket.
> > > 2. connect() the socket.
> > > 3. Apply an outbound XFRM policy to the socket.
> > > 4. send() data on the socket.
> > > 
> > > Packets will continue to be sent in the clear instead of matching an
> > > xfrm or returning a no-match error (EAGAIN). This affects calls to
> > > send() and not sendto().
> > > 
> > > Note: Creating normal XFRM policies should have a similar effect on
> > > sk_dst_cache entries that match the policy, but that is not fixed in
> > > this patch.
> > > 
> > > Fixes: 00bc0ef5880d ("ipv6: Skip XFRM lookup if dst_entry in socket cache 
> > > is valid")
> > > Tested: https://android-review.googlesource.com/418659
> > > Signed-off-by: Jonathan Basseri <misterik...@google.com>
> > > ---  
> > 
> > Thank you for the fix.
> > 
> > Acked-by: Jakub Sitnicki <j...@redhat.com>  
> 
> I do not believe this fix is correct.
> 
> What happens if the socket is TCP ?
> 
> sk_dst_reset(sk) is not safe for them.
> 
> This might add use-after-free, and eventually crash.

You are right. I see that RCU-variant __sk_dst_reset() is used
throughout TCP stack. Thank you for pointing it out.

Please disregard my earlier ACK.

-Jakub


Re: [PATCH net] xfrm: Clear sk_dst_cache when applying per-socket policy.

2017-08-16 Thread Jakub Sitnicki
On Tue, 15 Aug 2017 15:25:10 -0700
Jonathan Basseri <misterik...@google.com> wrote:

> If an IPv6 socket has a valid dst cache, then xfrm_lookup_route will get
> skipped. However, the cache is not invalidated when applying policy to a
> socket (i.e. IPV6_XFRM_POLICY). The result is that new policies are
> sometimes ignored on those sockets.
> 
> This can be demonstrated like so,
> 1. Create UDPv6 socket.
> 2. connect() the socket.
> 3. Apply an outbound XFRM policy to the socket.
> 4. send() data on the socket.
> 
> Packets will continue to be sent in the clear instead of matching an
> xfrm or returning a no-match error (EAGAIN). This affects calls to
> send() and not sendto().
> 
> Note: Creating normal XFRM policies should have a similar effect on
> sk_dst_cache entries that match the policy, but that is not fixed in
> this patch.
> 
> Fixes: 00bc0ef5880d ("ipv6: Skip XFRM lookup if dst_entry in socket cache is 
> valid")
> Tested: https://android-review.googlesource.com/418659
> Signed-off-by: Jonathan Basseri <misterik...@google.com>
> ---

Thank you for the fix.

Acked-by: Jakub Sitnicki <j...@redhat.com>


[PATCH net-next] rtnelink: Move link dump consistency check out of the loop

2017-08-09 Thread Jakub Sitnicki
Calls to rtnl_dump_ifinfo() are protected by RTNL lock. So are the
{list,unlist}_netdevice() calls where we bump the net->dev_base_seq
number.

For this reason net->dev_base_seq can't change under out feet while
we're looping over links in rtnl_dump_ifinfo(). So move the check for
net->dev_base_seq change (since the last time we were called) out of the
loop.

This way we avoid giving a wrong impression that there are concurrent
updates to the link list going on while we're iterating over them.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/core/rtnetlink.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9201e36..6b7888e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1644,8 +1644,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct 
netlink_callback *cb)
s_h = cb->args[0];
s_idx = cb->args[1];
 
-   cb->seq = net->dev_base_seq;
-
/* A hack to preserve kernel<->userspace interface.
 * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
 * However, before Linux v3.9 the code here assumed rtgenmsg and that's
@@ -1691,8 +1689,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct 
netlink_callback *cb)
 
goto out_err;
}
-
-   nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 cont:
idx++;
}
@@ -1702,6 +1698,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct 
netlink_callback *cb)
 out_err:
cb->args[1] = idx;
cb->args[0] = h;
+   cb->seq = net->dev_base_seq;
+   nl_dump_check_consistent(cb, nlmsg_hdr(skb));
 
return err;
 }
-- 
2.9.4



[PATCH net-next] ipv6: Avoid going through ->sk_net to access the netns

2017-07-31 Thread Jakub Sitnicki
There is no need to go through sk->sk_net to access the net namespace
and its sysctl variables because we allocate the sock and initialize
sk_net just a few lines earlier in the same routine.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/af_inet6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index a88b5b5..0a7c740 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -210,7 +210,7 @@ static int inet6_create(struct net *net, struct socket 
*sock, int protocol,
np->mcast_hops  = IPV6_DEFAULT_MCASTHOPS;
np->mc_loop = 1;
np->pmtudisc= IPV6_PMTUDISC_WANT;
-   np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk));
+   np->autoflowlabel = ip6_default_np_autolabel(net);
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
 
/* Init the ipv4 part of the socket since we can have sockets
-- 
2.9.4



[iproute PATCH] iproute: Remove useless check for nexthop keyword when setting RTA_OIF

2017-06-07 Thread Jakub Sitnicki
When modifying a route we set the RTA_OIF attribute only if a device was
specified with "dev" or "oif" keyword. But for some unknown reason we
earlier alternatively check also for the presence of "nexthop" keyword,
even though it has no effect. So remove the pointless check.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 ip/iproute.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index b4ca291..4fd36a1 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -1241,16 +1241,14 @@ static int iproute_modify(int cmd, unsigned int flags, 
int argc, char **argv)
if (!dst_ok)
usage();
 
-   if (d || nhs_ok)  {
+   if (d) {
int idx;
 
-   if (d) {
-   if ((idx = ll_name_to_index(d)) == 0) {
-   fprintf(stderr, "Cannot find device \"%s\"\n", 
d);
-   return -1;
-   }
-   addattr32(, sizeof(req), RTA_OIF, idx);
+   if ((idx = ll_name_to_index(d)) == 0) {
+   fprintf(stderr, "Cannot find device \"%s\"\n", d);
+   return -1;
}
+   addattr32(, sizeof(req), RTA_OIF, idx);
}
 
if (mxrta->rta_len > RTA_LENGTH(0)) {
-- 
2.9.4



Re: [PATCH] [PATCH net] net: Do not hold the reference for the same sk_rx_dst.

2017-03-16 Thread Jakub Sitnicki
On Thu, Mar 16, 2017 at 10:12 AM GMT, Kevin Xu wrote:
> Do you mean the message looping endlessly?

No, the message is emitted just once. Around 100 seconds after
destroying a few namespaces. Occurs not so often, maybe once per ten
runs.

-Jakub

> If so, then I suppose it's a different bug.
>
> Kevin
>
>> On Mar 16, 2017, at 3:01 AM, Jakub Sitnicki <j...@redhat.com> wrote:
>>
>>> On Thu, Mar 16, 2017 at 08:08 AM GMT, Kevin Xu wrote:
>>> In some rare cases, inet_sk_rx_dst_set() may be called multiple times
>>> on the same dst, causing reference count leakage. Eventually, it
>>> prevents net_device to be destroyed. The bug then manifested as
>>>
>>> unregister_netdevice: waiting for lo to become free. Usage count = 1
>>>
>>> in the kernel log, preventing new network namespace creation.
>>>
>>> The patch works around the issue by checking whether the socket already
>>> has the same dst set.
>>>
>>> Signed-off-by: Kevin Xu <kaiwen...@hulu.com>
>>> ---
>>
>> FWIW, with this patch applied I'm still sometimes seeing:
>>
>> [  125.928095] unregister_netdevice: waiting for lo to become free. Usage 
>> count = 1
>>
>> -Jakub


Re: [PATCH] [PATCH net] net: Do not hold the reference for the same sk_rx_dst.

2017-03-16 Thread Jakub Sitnicki
On Thu, Mar 16, 2017 at 08:08 AM GMT, Kevin Xu wrote:
> In some rare cases, inet_sk_rx_dst_set() may be called multiple times
> on the same dst, causing reference count leakage. Eventually, it
> prevents net_device to be destroyed. The bug then manifested as
>
> unregister_netdevice: waiting for lo to become free. Usage count = 1
>
> in the kernel log, preventing new network namespace creation.
>
> The patch works around the issue by checking whether the socket already
> has the same dst set.
>
> Signed-off-by: Kevin Xu 
> ---

FWIW, with this patch applied I'm still sometimes seeing:

[  125.928095] unregister_netdevice: waiting for lo to become free. Usage count 
= 1

-Jakub


Re: [PATCH net-next v3] net: ipv4: add support for ECMP hash policy choice

2017-03-15 Thread Jakub Sitnicki
On Tue, Mar 14, 2017 at 03:36 PM GMT, Nikolay Aleksandrov wrote:
> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3 (default)
>  1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated (currently only for L4).
> In L3 mode we always calculate the hash due to the ICMP error special
> case, the flow dissector's field consistentification should handle the
> address order thus we can remove the address reversals.
>
> Signed-off-by: Nikolay Aleksandrov 
> ---
> v3:
>  - keep the ICMP error special handling and always calc L3 hash
>Jakub, could you please run your tests with this version ?
>
> v2:
>  - removed the output_key_hash as it's not needed anymore
>  - reverted to my original/internal patch with L3 as default hash
>
>  Documentation/networking/ip-sysctl.txt |  8 +++
>  include/net/ip_fib.h   | 14 ++
>  include/net/netns/ipv4.h   |  1 +
>  include/net/route.h|  9 +---
>  net/ipv4/fib_semantics.c   | 11 ++--
>  net/ipv4/icmp.c| 19 +--
>  net/ipv4/route.c   | 92 
> ++
>  net/ipv4/sysctl_net_ipv4.c |  9 
>  8 files changed, 98 insertions(+), 65 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.txt 
> b/Documentation/networking/ip-sysctl.txt
> index fc73eeb7b3b8..558e19653106 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -73,6 +73,14 @@ fib_multipath_use_neigh - BOOLEAN
>   0 - disabled
>   1 - enabled
>  
> +fib_multipath_hash_policy - INTEGER
> + Controls which hash policy to use for multipath routes. Only valid
> + for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
> + Default: 0 (Layer 3)
> + Possible values:
> + 0 - Layer 3
> + 1 - Layer 4
> +
>  route/max_size - INTEGER
>   Maximum number of routes allowed in the kernel.  Increase
>   this when using large numbers of interfaces and/or routes.
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index d9cee9659978..8c608a17bd9a 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -383,17 +383,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned 
> long event, bool force);
>  int fib_sync_down_addr(struct net_device *dev, __be32 local);
>  int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
>  
> -extern u32 fib_multipath_secret __read_mostly;
> -
> -static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
> -{
> - return jhash_2words((__force u32)saddr, (__force u32)daddr,
> - fib_multipath_secret) >> 1;
> -}
> -
> +#ifdef CONFIG_IP_ROUTE_MULTIPATH
> +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
> +const struct sk_buff *skb);
> +#endif
>  void fib_select_multipath(struct fib_result *res, int hash);
>  void fib_select_path(struct net *net, struct fib_result *res,
> -  struct flowi4 *fl4, int mp_hash);
> +  struct flowi4 *fl4);
>  
>  /* Exported by fib_trie.c */
>  void fib_trie_init(void);
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 622d2da27135..70a1d4251790 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -152,6 +152,7 @@ struct netns_ipv4 {
>  #endif
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   int sysctl_fib_multipath_use_neigh;
> + int sysctl_fib_multipath_hash_policy;
>  #endif
>  
>   unsigned intfib_seq;/* protected by rtnl_mutex */
> diff --git a/include/net/route.h b/include/net/route.h
> index c0874c87c173..32412c91c222 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -113,14 +113,7 @@ struct in_device;
>  int ip_rt_init(void);
>  void rt_cache_flush(struct net *net);
>  void rt_flush_dev(struct net_device *dev);
> -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
> -   int mp_hash);
> -
> -static inline struct rtable *__ip_route_output_key(struct net *net,
> -struct flowi4 *flp)
> -{
> - return __ip_route_output_key_hash(net, flp, -1);
> -}
> +struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp);
>  
>  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
>   const struct sock *sk);
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 317026a39cfa..6601bd9744c9 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt;
>  static struct hlist_head 

Re: [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice

2017-03-08 Thread Jakub Sitnicki
On Wed, Mar 08, 2017 at 12:43 PM GMT, Nikolay Aleksandrov wrote:
> On 08/03/17 14:05, Jakub Sitnicki wrote:
>> On Tue, Mar 07, 2017 at 11:01 AM GMT, Nikolay Aleksandrov wrote:
>>> This patch adds support for ECMP hash policy choice via a new sysctl
>>> called fib_multipath_hash_policy and also adds support for L4 hashes.
>>> The current values for fib_multipath_hash_policy are:
>>>  0 - layer 3 (default)
>>>  1 - layer 4
>>> If there's an skb hash already set and it matches the chosen policy then it
>>> will be used instead of being calculated. The ICMP inner IP addresses use
>>> is removed.
>>>
>>> Signed-off-by: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
>>> ---
>>> v2:
>>>  - removed the output_key_hash as it's not needed anymore
>>>  - reverted to my original/internal patch with L3 as default hash
>> 
>> What about ICMP PTB (Fragmentation Needed) forwarding that makes PMTUD
>> work with ECMP in setups like described in RFC7690 [1]?
>> 
>>   ptb -> router ecmp -> next hop L4/L7 load balancer -> destination
>> 
>>router --> load balancer 1 --->
>> \\--> load balancer 2 ---> load-balanced service
>>  \--> load balancer N --->
>> 
>> Removing special treatment of ICMP errors will break it, won't it?
>> 
>
> Yes, I am aware and this decision was made with that in mind.
> We'd like to use the HW hash when available and IIRC that doesn't play well 
> with
> special-casing ICMP errors for anycast as it may not match also. Another 
> thing,
> again if I remember correctly, was that this behaviour is closer to how 
> hardware
> handles ECMP.

OK, I wanted to make sure that is not an oversight that ECMP routing in
ipv4 stack is to be dumbed down to match the hardware behavior. I
thought that it was an advantage that we want to have over hardware
routers. (To be fair, I should mention that we don't have it in ipv6
stack ATM.)

>
> One thing we can do is leave the current L3 behaviour with ICMP error handling
> and add a new L3 mode that tries to use the skb hash when available and 
> doesn't
> care about the packet type.
>
> What do you think ?

Sounds good to me. Would be good to hear other opinions also.

Thanks,
Jakub


Re: [PATCH net-next v2] net: ipv4: add support for ECMP hash policy choice

2017-03-08 Thread Jakub Sitnicki
Hi Nikolay,

On Tue, Mar 07, 2017 at 11:01 AM GMT, Nikolay Aleksandrov wrote:
> This patch adds support for ECMP hash policy choice via a new sysctl
> called fib_multipath_hash_policy and also adds support for L4 hashes.
> The current values for fib_multipath_hash_policy are:
>  0 - layer 3 (default)
>  1 - layer 4
> If there's an skb hash already set and it matches the chosen policy then it
> will be used instead of being calculated. The ICMP inner IP addresses use
> is removed.
>
> Signed-off-by: Nikolay Aleksandrov 
> ---
> v2:
>  - removed the output_key_hash as it's not needed anymore
>  - reverted to my original/internal patch with L3 as default hash

What about ICMP PTB (Fragmentation Needed) forwarding that makes PMTUD
work with ECMP in setups like described in RFC7690 [1]?

  ptb -> router ecmp -> next hop L4/L7 load balancer -> destination

   router --> load balancer 1 --->
\\--> load balancer 2 ---> load-balanced service
 \--> load balancer N --->

Removing special treatment of ICMP errors will break it, won't it?

FWIW, I gave a run to your patch (default settings, L3 hash) with a test
script [2] that simulates such a setup and it confirmed my worries - PTB
errors don't travel back to the source host any more.

-Jakub

[1] https://tools.ietf.org/html/rfc7690#section-2
[2] 
https://github.com/jsitnicki/tools/commit/ccb85e68421df4ffd8b7abf00f6f5fe1c6a5af76


[PATCH net-next] gre6: Clean up unused struct ipv6_tel_txoption definition

2017-01-20 Thread Jakub Sitnicki
Commit b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE
functions") removed the ip6gre specific transmit function, but left the
struct ipv6_tel_txoption definition. Clean it up.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/ip6_gre.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 75b6108..65bdfd1 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -484,11 +484,6 @@ static int gre_rcv(struct sk_buff *skb)
return 0;
 }
 
-struct ipv6_tel_txoption {
-   struct ipv6_txoptions ops;
-   __u8 dst_opt[8];
-};
-
 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 {
return iptunnel_handle_offloads(skb,
-- 
2.7.4



[PATCH net] ip6_tunnel: Account for tunnel header in tunnel MTU

2017-01-13 Thread Jakub Sitnicki
With ip6gre we have a tunnel header which also makes the tunnel MTU
smaller. We need to reserve room for it. Previously we were using up
space reserved for the Tunnel Encapsulation Limit option
header (RFC 2473).

Also, after commit b05229f44228 ("gre6: Cleanup GREv6 transmit path,
call common GRE functions") our contract with the caller has
changed. Now we check if the packet length exceeds the tunnel MTU after
the tunnel header has been pushed, unlike before.

This is reflected in the check where we look at the packet length minus
the size of the tunnel header, which is already accounted for in tunnel
MTU.

Fixes: b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE 
functions")
Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/ip6_tunnel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 36d2921..753d6d0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1108,7 +1108,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
 t->parms.name);
goto tx_err_dst_release;
}
-   mtu = dst_mtu(dst) - psh_hlen;
+   mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
@@ -1117,7 +1117,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
mtu = IPV6_MIN_MTU;
if (skb_dst(skb) && !t->parms.collect_md)
skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-   if (skb->len > mtu && !skb_is_gso(skb)) {
+   if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
err = -EMSGSIZE;
goto tx_err_dst_release;
-- 
2.7.4



Re: [PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-11-01 Thread Jakub Sitnicki
On Tue, Nov 01, 2016 at 03:35 PM GMT, David Miller wrote:
> From: Jakub Sitnicki <j...@redhat.com>
> Date: Tue, 01 Nov 2016 16:13:51 +0100
>
>> On Mon, Oct 31, 2016 at 07:15 PM GMT, David Miller wrote:
>>> From: Jakub Sitnicki <j...@redhat.com>
>>> Date: Sun, 30 Oct 2016 14:03:11 +0100
>>>
>>>> 2) ensure the flow labels used in both directions are the same (either
>>>>reflected by one side, or fixed, e.g. not used and set to 0), so that
>>>>the 4-tuple we hash over when forwarding, >>>label, next hdr>, is the same both ways, modulo the order of
>>>>addresses.
>>>
>>> Even Linux, by default, does not do reflection.
>>>
>>> See the flowlabel_consistency sysctl, which we set by default to '1'.
>> 
>> Yes, unfortunately, if Linux-based hosts are used as sending/receiving
>> IPv6, ICMP error forwarding will not work out of the box. Users will be
>> burdened with adjusting the runtime network stack config, as you point
>> out, or otherwise instructing the apps to set the flow label,
>> e.g. traceroute6 -I  ...
>
> I'm pretty sure that sysctl default was choosen intentionally, and we
> actively are _encouraging_ the world to not depend upon reflection in
> any way, shape, or form.
>
> And it's kind of pointless to suggest a facility that can't work with
> Linux endpoints out of the box.
>
> This was the point I'm trying to make.
>
> If the intentions of that sysctl default does pan out, the idea is for
> the world to move towards arbitrary flow label settings, even perhaps
> changing over time.  The intention is to make this more, not less,
> common.  And the idea is to give maximum flexibility for endpoints to
> set these flow labels, in order to increase entropy wherever possible.
>
> I have a really hard time accepting a "fix" that depends upon behavior
> that the Linux ipv6 stack doesn't even have.

Fair enough. I'm not questioning the defaults or the benefits of
widespread use of flow labels.

I was trying to do this without changing as to how we hash the packets
and balance traffic over multiple paths, but that does yield a solution
that does not work out of the box with Linux endpoints. Hard to sell, I
agree.

As a potential way out, I can rework it so that we exclude the flow
label from the multipath hash. That way we lose some entropy (not worse
than IPv4), but do not depend any more on how flow labels are set
(flexible). This could be made runtime configurable, as it changes
existing behavior.

Thanks,
Jakub


Re: [PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-11-01 Thread Jakub Sitnicki
On Mon, Oct 31, 2016 at 07:25 PM GMT, Tom Herbert wrote:
> On Sun, Oct 30, 2016 at 6:03 AM, Jakub Sitnicki <j...@redhat.com> wrote:
>> On Fri, Oct 28, 2016 at 02:25 PM GMT, Tom Herbert wrote:
>>>
>>> If this patch is being done to be compatible with IPv4 I guess that's
>>> okay, but it would be false advertisement to say this makes ICMP
>>> follow the same path as the flow being targeted in an error.
>>> Fortunately, I doubt anyone can have a dependency on this for ICMP.
>>
>> I wouldn't want to propose anything that would be useless. If you think
>> that this is the case here, I would very much like to understand what
>> and why cannot work in practice.
>>
> The normal hash for TCP or UDP using ECMP is over <protocol, srcIP,
> dstIP, srcPort, dstPort>. For an ICMP packet ECMP would most likely be
> done over <protocol, srcIP, dstIP>. There really is no way to ensure
> that an ICMP packet will follow the same path as TCP or any other
> protocol. Fortunately, this is really isn't so terrible. The Internet
> has worked this way ever since routers started using ports as input to
> ECMP and that hasn't caused any major meltdown.

Ahh, I see the problem now. Thank you for clearing it up for me.

You are right, for locally generated TCP/UDP traffic we are computing an
L4 hash (over the 5-tuple that you mentioned) that drives the multipath
routing. While when sending locally generated ICMP errors we are only
computing an L3 hash (over the mentioned 3-tuple).

I believe that is a problem affects both IPv6 and IPv4, and manifests
itself only when the offending packet that triggers the error is
destined for the ECMP router.

When an ECMP router is: (i) sending an ICMP error in reaction to a
packet that was to be forwarded, or (ii) forwarding an ICMP error,
everything works as expected. That is because when forwarding traffic we
limit ourselves to computing an L3 hash so that the IP fragments are
routed together. Right?

So, my understanding is that, with these changes, things are not perfect
but we are not worse than IPv4 right now.

Would you be okay with this series if I update the patch 4/5 description
to highlight the existing problem that you point out? A fix for this
IPv4/6 common issue could follow afterwards.

Thanks,
Jakub


Re: [PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-11-01 Thread Jakub Sitnicki
On Mon, Oct 31, 2016 at 07:15 PM GMT, David Miller wrote:
> From: Jakub Sitnicki <j...@redhat.com>
> Date: Sun, 30 Oct 2016 14:03:11 +0100
>
>> 2) ensure the flow labels used in both directions are the same (either
>>reflected by one side, or fixed, e.g. not used and set to 0), so that
>>the 4-tuple we hash over when forwarding, >label, next hdr>, is the same both ways, modulo the order of
>>addresses.
>
> Even Linux, by default, does not do reflection.
>
> See the flowlabel_consistency sysctl, which we set by default to '1'.

Yes, unfortunately, if Linux-based hosts are used as sending/receiving
IPv6, ICMP error forwarding will not work out of the box. Users will be
burdened with adjusting the runtime network stack config, as you point
out, or otherwise instructing the apps to set the flow label,
e.g. traceroute6 -I  ...

> I think we need to think a lot more about how systems actually set and
> use flowlabels.

The only alternative I can think of, only for ECMP routing, is having a
toggle option that would exclude the flow label from the input to the
multipath hash.

We would be sacrificing the entropy that potentially comes from hashing
over the flow label, leading to better flow balancing. But we wouldn't
be making IPv6 multipath routing any worse than IPv4 is in that regard.
And user-space apps wouldn't need to resort to reflecting/setting the
label, just like with IPv4.

Is that something that you would consider a viable option?

> Also, one issue I also had with this series was adding a new member
> to the flow label.  Is it possible to implement this like the ipv4
> side did, by simply passing a new parameter around to the necessary
> functions?

This was my initial approach, i.e. to mimic the IPv4 and pass the
multipath hash down the call chain via a parameter. However, I gave up
on it, thinking it will cause too much disturbance in the involved
functions' interfaces, when I realized that one of the call paths the
multipath hash would have to also be passed through is:

  ip6_route_input_lookup
fib6_rule_lookup
  fib_rules_lookup
fib6_rule_action
  ip6_pol_route_input

To be honest, I was thinking that if extending flowi6 structure would
find acceptance, then maybe the new member could be at some point moved
to flowi_common and also used by IPv4 to avoid parameter passing there
as well.

Thanks,
Jakub



Re: [PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-10-30 Thread Jakub Sitnicki
On Fri, Oct 28, 2016 at 02:25 PM GMT, Tom Herbert wrote:
> On Fri, Oct 28, 2016 at 1:32 AM, Jakub Sitnicki <j...@redhat.com> wrote:
>> On Thu, Oct 27, 2016 at 10:35 PM GMT, Tom Herbert wrote:
>>> On Mon, Oct 24, 2016 at 2:28 AM, Jakub Sitnicki <j...@redhat.com> wrote:
>>>> Same as for the transmit path, let's do our best to ensure that received
>>>> ICMP errors that may be subject to forwarding will be routed the same
>>>> path as flow that triggered the error, if it was going in the opposite
>>>> direction.
>>>>
>>> Unfortunately our ability to do this is generally quite limited. This
>>> patch will select the route for multipath, but I don't believe sets
>>> the same link in LAG and definitely can't help switches doing ECMP to
>>> route the ICMP packet in the same way as the flow would be. Did you
>>> see a problem that warrants solving this case?
>>
>> The motivation here is to bring IPv6 ECMP routing on par with IPv4 to
>> enable its wider use, targeting anycast services. Forwarding ICMP errors
>> back to the source host, at the L3 layer, is what we thought would be a
>> step forward.
>>
>> Similar to change in IPv4 routing introduced in commit 79a131592dbb
>> ("ipv4: ICMP packet inspection for multipath", [1]) we do our best at
>> L3, leaving any potential problems with LAG at lower layer (L2)
>> unaddressed.
>>
> ICMP will almost certainly take a different path in the network than
> TCP or UDP due to ECMP. If we ever get proper flow label support for
> ECMP then that could solve the problem if all the devices do a hash
> just on <srcIP, destIP, FlowLabel>.

Sorry for my late reply, I have been traveling.

I think that either I am missing something here, or the proposed changes
address just the problem that you have described.

Yes, if we compute the hash that drives the route choice over the IP
header of the ICMP error, then there is no guarantee it will travel back
to the sender of the offending packet that triggered the error.

That is why, we look at the offending packet carried by an ICMP error
and hash over its fields, instead. We need, however, to take care of two
things:

1) swap the source with the destination address, because we are
   forwarding the ICMP error in the opposite direction than the
   offending packet was going (see icmpv6_multipath_hash() introduced in
   patch 4/5); and

2) ensure the flow labels used in both directions are the same (either
   reflected by one side, or fixed, e.g. not used and set to 0), so that
   the 4-tuple we hash over when forwarding, , is the same both ways, modulo the order of
   addresses.

> If this patch is being done to be compatible with IPv4 I guess that's
> okay, but it would be false advertisement to say this makes ICMP
> follow the same path as the flow being targeted in an error.
> Fortunately, I doubt anyone can have a dependency on this for ICMP.

I wouldn't want to propose anything that would be useless. If you think
that this is the case here, I would very much like to understand what
and why cannot work in practice.

Thanks for reviewing this series,
Jakub


[PATCH net-next v2 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-10-28 Thread Jakub Sitnicki
Same as for the transmit path, let's do our best to ensure that received
ICMP errors that may be subject to forwarding will be routed the same
path as flow that triggered the error, if it was going in the opposite
direction.

v1 -> v2:
 - style change, put as many arguments as possible on the first line of
   a function call, and align consecutive lines to the first argument,
   pointed out by David Miller

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/route.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1184c2b..269e30d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1150,6 +1150,30 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
 }
 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
+static u32 ip6_multipath_icmp_hash(const struct sk_buff *skb)
+{
+   const struct icmp6hdr *icmph = icmp6_hdr(skb);
+   const struct ipv6hdr *inner_iph;
+   struct ipv6hdr _inner_iph;
+
+   if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
+   icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
+   icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
+   icmph->icmp6_type != ICMPV6_PARAMPROB)
+   goto standard_hash;
+
+   inner_iph = skb_header_pointer(skb,
+  skb_transport_offset(skb) + 
sizeof(*icmph),
+  sizeof(_inner_iph), &_inner_iph);
+   if (!inner_iph)
+   goto standard_hash;
+
+   return icmpv6_multipath_hash(inner_iph);
+
+standard_hash:
+   return 0; /* compute it later, if needed */
+}
+
 void ip6_route_input(struct sk_buff *skb)
 {
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -1168,6 +1192,8 @@ void ip6_route_input(struct sk_buff *skb)
tun_info = skb_tunnel_info(skb);
if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+   if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
+   fl6.mp_hash = ip6_multipath_icmp_hash(skb);
skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, , flags));
 }
-- 
2.7.4



[PATCH net-next v2 4/5] ipv6: Compute multipath hash for sent ICMP errors from offending packet

2016-10-28 Thread Jakub Sitnicki
Improve debuggability with tools like traceroute and make PMTUD work in
setups that make use of ECMP routing by sending ICMP errors down the
same path as the offending packet would travel, if it was going in the
opposite direction.

There is a caveat, flows in both directions need use the same
label. Otherwise packets from flow in the opposite direction and ICMP
errors will not be routed over the same ECMP link.

Export the function for calculating the multipath hash so that we can
use it also on receive side, when forwarding ICMP errors.

v1 -> v2:
 - don't use "extern" in external function declaration in header file,
   pointed out by David Miller

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 include/linux/icmpv6.h |  2 ++
 net/ipv6/icmp.c| 21 +
 2 files changed, 23 insertions(+)

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 57086e9..803c241 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -45,4 +45,6 @@ extern void   icmpv6_flow_init(struct 
sock *sk,
 const struct in6_addr 
*saddr,
 const struct in6_addr 
*daddr,
 int oif);
+struct ipv6hdr;
+u32icmpv6_multipath_hash(const struct 
ipv6hdr *iph);
 #endif
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index bd59c34..ab376b3d1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -385,6 +385,26 @@ static struct dst_entry *icmpv6_route_lookup(struct net 
*net,
return ERR_PTR(err);
 }
 
+u32 icmpv6_multipath_hash(const struct ipv6hdr *iph)
+{
+   struct flowi6 fl6;
+
+   /* Calculate the multipath hash from the offending IP datagram that
+* triggered the ICMP error. The source and destination addresses are
+* swapped as we do our best to route the ICMP message together with the
+* flow it belongs to. However, flows in both directions have to have
+* the same label (e.g. by using flow label reflection) for it to
+* happen.
+*/
+   memset(, 0, sizeof(fl6));
+   fl6.daddr = iph->saddr;
+   fl6.saddr = iph->daddr;
+   fl6.flowlabel = ip6_flowinfo(iph);
+   fl6.flowi6_proto = iph->nexthdr;
+
+   return get_hash_from_flowi6();
+}
+
 /*
  * Send an ICMP message in response to a packet in error
  */
@@ -484,6 +504,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 
code, __u32 info,
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+   fl6.mp_hash = icmpv6_multipath_hash(hdr);
security_skb_classify_flow(skb, flowi6_to_flowi());
 
sk = icmpv6_xmit_lock(net);
-- 
2.7.4



[PATCH net-next v2 0/5] Route ICMPv6 errors with the flow when ECMP in use

2016-10-28 Thread Jakub Sitnicki
The motivation for this series is to route ICMPv6 error messages
together with the flow they belong to when multipath routing is in
use. It intends to bring the ECMP routing in IPv6 stack on par with
IPv4.

This enables the use of tools that rely on ICMP error messages such as
traceroute and makes PMTU discovery work both ways. However, for it to
work IPv6 flow labels have to be same in both directions
(i.e. reflected) or need to be chosen in a manner that ensures that
the flow going in the opposite direction would actually be routed to a
given path.

Even though we generally don't expect this, as receiving and sending
IPv6 are free to choose flow labels at will, we make an assumption
here that the enitity in charge of configuring ECMP routing will also
be in control of the server hosts, and can set up flow label
reflection. However, if this is not the case, the patchset doesn't
make the situation worse.

One potential user of the changes here would be an anycast service
hosted behind an ECMP router(s).

Changes have been tested in a virtual setup with a topology as below:

  Re1 --- Hs1
 /
 Hc --- Ri --- Rc
 \
  Re1 --- Hs2

 Hc  - client host
 HsX - server host
 Rc  - core router
 ReX - edge router
 Ri  - intermediate router

To test the changes, traceroute in UDP mode to the client host, with
flow label set, has been run from one of the server hosts. Full test
is available at [1].

-Jakub

[1] 
https://github.com/jsitnicki/tools/blob/master/net/tests/ecmp/test-ecmp-icmpv6-error-routing.sh

v1 -> v2:
 - don't use "extern" in external function declaration in header file,
   pointed out by David Miller;
 - style change, put as many arguments as possible on the first line of
   a function call, and align consecutive lines to the first argument,
   pointed out by David Miller;
 - expand the cover letter based on the feedback from David Miller and
   Hannes Sowa;

Jakub Sitnicki (5):
  ipv6: Fold rt6_info_hash_nhsfn() into its only caller
  net: Extend struct flowi6 with multipath hash
  ipv6: Use multipath hash from flow info if available
  ipv6: Compute multipath hash for sent ICMP errors from offending
packet
  ipv6: Compute multipath hash for forwarded ICMP errors from offending
packet

 include/linux/icmpv6.h |  2 ++
 include/net/flow.h |  1 +
 net/ipv6/icmp.c| 21 +
 net/ipv6/route.c   | 40 +---
 4 files changed, 53 insertions(+), 11 deletions(-)

-- 
2.7.4



[PATCH net-next v2 2/5] net: Extend struct flowi6 with multipath hash

2016-10-28 Thread Jakub Sitnicki
Allow for functions that fill out the IPv6 flow info to also pass a hash
computed over the skb contents. The hash value will drive the multipath
routing decisions.

This is intended for special treatment of ICMPv6 errors, where we would
like to make a routing decision based on the flow identifying the
offending IPv6 datagram that triggered the error, rather than the flow
of the ICMP error itself.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 include/net/flow.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa77..73ee3aa 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -143,6 +143,7 @@ struct flowi6 {
 #define fl6_ipsec_spi  uli.spi
 #define fl6_mh_typeuli.mht.type
 #define fl6_gre_keyuli.gre_key
+   __u32   mp_hash;
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 struct flowidn {
-- 
2.7.4



[PATCH net-next v2 3/5] ipv6: Use multipath hash from flow info if available

2016-10-28 Thread Jakub Sitnicki
Allow our callers to influence the choice of ECMP link by honoring the
hash passed together with the flow info. This will allow for special
treatment of ICMP errors which we would like to route over the same link
as the IP datagram that triggered the error.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0514b35..1184c2b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -430,9 +430,11 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
 int strict)
 {
struct rt6_info *sibling, *next_sibling;
+   unsigned int hash;
int route_choosen;
 
-   route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
+   hash = fl6->mp_hash ? : get_hash_from_flowi6(fl6);
+   route_choosen = hash % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
 * (siblings does not include ourself)
 */
-- 
2.7.4



[PATCH net-next v2 1/5] ipv6: Fold rt6_info_hash_nhsfn() into its only caller

2016-10-28 Thread Jakub Sitnicki
Commit 644d0e656958 ("ipv6 Use get_hash_from_flowi6 for rt6 hash") has
turned rt6_info_hash_nhsfn() into a one-liner, so it no longer makes
sense to keep it around.

Also the accompanying documentation comment has become outdated, so just
remove it altogether.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/route.c | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bdbc38e..0514b35 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -425,16 +425,6 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-  const struct flowi6 *fl6)
-{
-   return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 struct flowi6 *fl6, int oif,
 int strict)
@@ -442,7 +432,7 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
struct rt6_info *sibling, *next_sibling;
int route_choosen;
 
-   route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+   route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
 * (siblings does not include ourself)
 */
-- 
2.7.4



Re: [PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-10-28 Thread Jakub Sitnicki
On Thu, Oct 27, 2016 at 10:35 PM GMT, Tom Herbert wrote:
> On Mon, Oct 24, 2016 at 2:28 AM, Jakub Sitnicki <j...@redhat.com> wrote:
>> Same as for the transmit path, let's do our best to ensure that received
>> ICMP errors that may be subject to forwarding will be routed the same
>> path as flow that triggered the error, if it was going in the opposite
>> direction.
>>
> Unfortunately our ability to do this is generally quite limited. This
> patch will select the route for multipath, but I don't believe sets
> the same link in LAG and definitely can't help switches doing ECMP to
> route the ICMP packet in the same way as the flow would be. Did you
> see a problem that warrants solving this case?

The motivation here is to bring IPv6 ECMP routing on par with IPv4 to
enable its wider use, targeting anycast services. Forwarding ICMP errors
back to the source host, at the L3 layer, is what we thought would be a
step forward.

Similar to change in IPv4 routing introduced in commit 79a131592dbb
("ipv4: ICMP packet inspection for multipath", [1]) we do our best at
L3, leaving any potential problems with LAG at lower layer (L2)
unaddressed.

>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 1184c2b..c0f38ea 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c

[...]

>> @@ -1168,6 +1192,8 @@ void ip6_route_input(struct sk_buff *skb)
>> tun_info = skb_tunnel_info(skb);
>> if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
>> fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
>> +   if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
>> +   fl6.mp_hash = ip6_multipath_icmp_hash(skb);
>
> I will point out that this is only

Sorry, looks like part of your reply got cut short. Could you repost?

-Jakub

[1] https://git.kernel.org/torvalds/c/79a131592dbb81a2dba208622a2ffbfc53f28bc0


Re: [PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-10-27 Thread Jakub Sitnicki
On Thu, Oct 27, 2016 at 03:25 PM GMT, David Miller wrote:
> From: Jakub Sitnicki <j...@redhat.com>
> Date: Mon, 24 Oct 2016 11:28:52 +0200
>
>> +inner_iph = skb_header_pointer(
>> +skb, skb_transport_offset(skb) + sizeof(*icmph),
>> +sizeof(_inner_iph), &_inner_iph);
>
> Please do not style this call like this, put as many arguments as
> you can on the first line.
>
>   inner_iph = skb_header_pointer(skb,
>  skb_transport_offset(skb) + 
> sizeof(*icmph),
>  sizeof(_inner_iph), &_inner_iph);
>
> And on the second and subsequent lines, indent to the first column after
> the openning parenthesis of the first line.

FWIW, I had it styled like that and then changed it. Will change back.

In my defense - checkpatch.pl made me do it, Your Honor! (line too long)


Re: [PATCH net-next 4/5] ipv6: Compute multipath hash for sent ICMP errors from offending packet

2016-10-27 Thread Jakub Sitnicki
On Thu, Oct 27, 2016 at 03:24 PM GMT, David Miller wrote:
> From: Jakub Sitnicki <j...@redhat.com>
> Date: Mon, 24 Oct 2016 11:28:51 +0200
>
>> diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
>> index 57086e9..6282e03 100644
>> --- a/include/linux/icmpv6.h
>> +++ b/include/linux/icmpv6.h
>> @@ -45,4 +45,6 @@ extern void
>> icmpv6_flow_init(struct sock *sk,
>>   const struct in6_addr 
>> *saddr,
>>   const struct in6_addr 
>> *daddr,
>>   int oif);
>> +struct ipv6hdr;
>> +extern u32  icmpv6_multipath_hash(const struct 
>> ipv6hdr *iph);
>>  #endif
>
> We do not use "extern" in external function declarations in header file any 
> more.

My mistake, will remote it.


Re: [PATCH] rtl8xxxu: mark symbol static where possible

2016-10-26 Thread Jakub Sitnicki
On Wed, Oct 26, 2016 at 09:32 AM GMT, Baoyou Xie wrote:
> We get 1 warning when building kernel with W=1:
> drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c:1557:6: warning: no 
> previous prototype for 'rtl8192eu_power_off' [-Wmissing-prototypes]
>
> In fact, this function is only used in the file in which it is
> declared and don't need a declaration, but can be made static.
> So this patch marks this function with 'static'.
>
> Signed-off-by: Baoyou Xie 
> ---
>  drivers/net/dsa/mv88e6xxx/chip.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c 
> b/drivers/net/dsa/mv88e6xxx/chip.c
> index 883fd98..4d975f0 100644
> --- a/drivers/net/dsa/mv88e6xxx/chip.c
> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
> @@ -2701,7 +2701,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
> *chip, int port)
>   return mv88e6xxx_port_write(chip, port, PORT_DEFAULT_VLAN, 0x);
>  }
>  
> -int mv88e6xxx_g1_set_switch_mac(struct mv88e6xxx_chip *chip, u8 *addr)
> +static int mv88e6xxx_g1_set_switch_mac(struct mv88e6xxx_chip *chip, u8 *addr)
>  {
>   int err;

Probably a mix-up - your patch doesn't match the description.

Thanks,
Jakub


[PATCH net] ipv6: Don't use ufo handling on later transformed packets

2016-10-26 Thread Jakub Sitnicki
Similar to commit c146066ab802 ("ipv4: Don't use ufo handling on later
transformed packets"), don't perform UFO on packets that will be IPsec
transformed. To detect it we rely on the fact that headerlen in
dst_entry is non-zero only for transformation bundles (xfrm_dst
objects).

Unwanted segmentation can be observed with a NETIF_F_UFO capable device,
such as a dummy device:

  DEV=dum0 LEN=1493

  ip li add $DEV type dummy
  ip addr add fc00::1/64 dev $DEV nodad
  ip link set $DEV up
  ip xfrm policy add dir out src fc00::1 dst fc00::2 \
 tmpl src fc00::1 dst fc00::2 proto esp spi 1
  ip xfrm state add src fc00::1 dst fc00::2 \
 proto esp spi 1 enc 'aes' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b

  tcpdump -n -nn -i $DEV -t &
  socat /dev/zero,readbytes=$LEN udp6:[fc00::2]:$LEN

tcpdump output before:

  IP6 fc00::1 > fc00::2: frag (0|1448) ESP(spi=0x0001,seq=0x1), length 1448
  IP6 fc00::1 > fc00::2: frag (1448|48)
  IP6 fc00::1 > fc00::2: ESP(spi=0x0001,seq=0x2), length 88

... and after:

  IP6 fc00::1 > fc00::2: frag (0|1448) ESP(spi=0x0001,seq=0x1), length 1448
  IP6 fc00::1 > fc00::2: frag (1448|80)

Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach")

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/ip6_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6001e78..59eb4ed 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1366,7 +1366,7 @@ static int __ip6_append_data(struct sock *sk,
if (((length > mtu) ||
 (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
-   (rt->dst.dev->features & NETIF_F_UFO) &&
+   (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
(sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
  hh_len, fragheaderlen, exthdrlen,
-- 
2.7.4



[PATCH net-next 0/5] Route ICMPv6 errors with the flow when ECMP in use

2016-10-24 Thread Jakub Sitnicki
The motivation for this series is to route ICMPv6 error messages
together with the flow they belong to when multipath routing is in
use. It intends to bring the ECMP routing in IPv6 stack on par with
IPv4.

This enables the use of tools that rely on ICMP error messages such as
traceroute and makes PMTU discovery work both ways. However, for it to
work IPv6 flow labels have to be same in both directions
(i.e. reflected) or need to be chosen in a manner that ensures that
the flow going in the opposite direction would actually be routed to a
given path.

Changes have been tested in a virtual setup with a topology as below:

  Re1 --- Hs1
 /
 Hc --- Ri --- Rc
 \
  Re1 --- Hs2

 Hc  - client host
 HsX - server host
 Rc  - core router
 ReX - edge router
 Ri  - intermediate router

To test the changes, traceroute in UDP mode to the client host, with
flow label set, has been run from one of the server hosts. Full test
is available at [1].

-Jakub

[1] 
https://github.com/jsitnicki/tools/blob/master/net/tests/ecmp/test-ecmp-icmpv6-error-routing.sh


Jakub Sitnicki (5):
  ipv6: Fold rt6_info_hash_nhsfn() into its only caller
  net: Extend struct flowi6 with multipath hash
  ipv6: Use multipath hash from flow info if available
  ipv6: Compute multipath hash for sent ICMP errors from offending
packet
  ipv6: Compute multipath hash for forwarded ICMP errors from offending
packet

 include/linux/icmpv6.h |  2 ++
 include/net/flow.h |  1 +
 net/ipv6/icmp.c| 21 +
 net/ipv6/route.c   | 40 +---
 4 files changed, 53 insertions(+), 11 deletions(-)

-- 
2.7.4



[PATCH net-next 1/5] ipv6: Fold rt6_info_hash_nhsfn() into its only caller

2016-10-24 Thread Jakub Sitnicki
Commit 644d0e656958 ("ipv6 Use get_hash_from_flowi6 for rt6 hash") has
turned rt6_info_hash_nhsfn() into a one-liner, so it no longer makes
sense to keep it around.

Also the accompanying documentation comment has become outdated, so just
remove it altogether.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/route.c | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bdbc38e..0514b35 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -425,16 +425,6 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-  const struct flowi6 *fl6)
-{
-   return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 struct flowi6 *fl6, int oif,
 int strict)
@@ -442,7 +432,7 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
struct rt6_info *sibling, *next_sibling;
int route_choosen;
 
-   route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+   route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
 * (siblings does not include ourself)
 */
-- 
2.7.4



[PATCH net-next 2/5] net: Extend struct flowi6 with multipath hash

2016-10-24 Thread Jakub Sitnicki
Allow for functions that fill out the IPv6 flow info to also pass a hash
computed over the skb contents. The hash value will drive the multipath
routing decisions.

This is intended for special treatment of ICMPv6 errors, where we would
like to make a routing decision based on the flow identifying the
offending IPv6 datagram that triggered the error, rather than the flow
of the ICMP error itself.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 include/net/flow.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa77..73ee3aa 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -143,6 +143,7 @@ struct flowi6 {
 #define fl6_ipsec_spi  uli.spi
 #define fl6_mh_typeuli.mht.type
 #define fl6_gre_keyuli.gre_key
+   __u32   mp_hash;
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 struct flowidn {
-- 
2.7.4



[PATCH net-next 3/5] ipv6: Use multipath hash from flow info if available

2016-10-24 Thread Jakub Sitnicki
Allow our callers to influence the choice of ECMP link by honoring the
hash passed together with the flow info. This will allow for special
treatment of ICMP errors which we would like to route over the same link
as the IP datagram that triggered the error.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0514b35..1184c2b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -430,9 +430,11 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
 int strict)
 {
struct rt6_info *sibling, *next_sibling;
+   unsigned int hash;
int route_choosen;
 
-   route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
+   hash = fl6->mp_hash ? : get_hash_from_flowi6(fl6);
+   route_choosen = hash % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0
 * (siblings does not include ourself)
 */
-- 
2.7.4



[PATCH net-next 4/5] ipv6: Compute multipath hash for sent ICMP errors from offending packet

2016-10-24 Thread Jakub Sitnicki
Improve debuggability with tools like traceroute and make PMTUD work in
setups that make use of ECMP routing by sending ICMP errors down the
same path as the offending packet would travel, if it was going in the
opposite direction.

There is a caveat, flows in both directions need use the same
label. Otherwise packets from flow in the opposite direction and ICMP
errors will not be routed over the same ECMP link.

Export the function for calculating the multipath hash so that we can
use it also on receive side, when forwarding ICMP errors.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 include/linux/icmpv6.h |  2 ++
 net/ipv6/icmp.c| 21 +
 2 files changed, 23 insertions(+)

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 57086e9..6282e03 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -45,4 +45,6 @@ extern void   icmpv6_flow_init(struct 
sock *sk,
 const struct in6_addr 
*saddr,
 const struct in6_addr 
*daddr,
 int oif);
+struct ipv6hdr;
+extern u32 icmpv6_multipath_hash(const struct 
ipv6hdr *iph);
 #endif
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index bd59c34..ab376b3d1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -385,6 +385,26 @@ static struct dst_entry *icmpv6_route_lookup(struct net 
*net,
return ERR_PTR(err);
 }
 
+u32 icmpv6_multipath_hash(const struct ipv6hdr *iph)
+{
+   struct flowi6 fl6;
+
+   /* Calculate the multipath hash from the offending IP datagram that
+* triggered the ICMP error. The source and destination addresses are
+* swapped as we do our best to route the ICMP message together with the
+* flow it belongs to. However, flows in both directions have to have
+* the same label (e.g. by using flow label reflection) for it to
+* happen.
+*/
+   memset(, 0, sizeof(fl6));
+   fl6.daddr = iph->saddr;
+   fl6.saddr = iph->daddr;
+   fl6.flowlabel = ip6_flowinfo(iph);
+   fl6.flowi6_proto = iph->nexthdr;
+
+   return get_hash_from_flowi6();
+}
+
 /*
  * Send an ICMP message in response to a packet in error
  */
@@ -484,6 +504,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 
code, __u32 info,
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
+   fl6.mp_hash = icmpv6_multipath_hash(hdr);
security_skb_classify_flow(skb, flowi6_to_flowi());
 
sk = icmpv6_xmit_lock(net);
-- 
2.7.4



[PATCH net-next 5/5] ipv6: Compute multipath hash for forwarded ICMP errors from offending packet

2016-10-24 Thread Jakub Sitnicki
Same as for the transmit path, let's do our best to ensure that received
ICMP errors that may be subject to forwarding will be routed the same
path as flow that triggered the error, if it was going in the opposite
direction.

Signed-off-by: Jakub Sitnicki <j...@redhat.com>
---
 net/ipv6/route.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1184c2b..c0f38ea 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1150,6 +1150,30 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
 }
 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
+static u32 ip6_multipath_icmp_hash(const struct sk_buff *skb)
+{
+   const struct icmp6hdr *icmph = icmp6_hdr(skb);
+   const struct ipv6hdr *inner_iph;
+   struct ipv6hdr _inner_iph;
+
+   if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
+   icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
+   icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
+   icmph->icmp6_type != ICMPV6_PARAMPROB)
+   goto standard_hash;
+
+   inner_iph = skb_header_pointer(
+   skb, skb_transport_offset(skb) + sizeof(*icmph),
+   sizeof(_inner_iph), &_inner_iph);
+   if (!inner_iph)
+   goto standard_hash;
+
+   return icmpv6_multipath_hash(inner_iph);
+
+standard_hash:
+   return 0; /* compute it later, if needed */
+}
+
 void ip6_route_input(struct sk_buff *skb)
 {
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -1168,6 +1192,8 @@ void ip6_route_input(struct sk_buff *skb)
tun_info = skb_tunnel_info(skb);
if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+   if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
+   fl6.mp_hash = ip6_multipath_icmp_hash(skb);
skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, , flags));
 }
-- 
2.7.4



Re: [PATCH net v2] flow_dissector: Check skb for VLAN only if skb specified.

2016-10-18 Thread Jakub Sitnicki
On Mon, Oct 17, 2016 at 08:30 PM GMT, Eric Garver wrote:
> Fixes a panic when calling eth_get_headlen(). Noticed on i40e driver.
>
> Fixes: d5709f7ab776 ("flow_dissector: For stripped vlan, get vlan info from 
> skb->vlan_tci")
> Signed-off-by: Eric Garver <e...@erig.me>
> ---
>  net/core/flow_dissector.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 1a7b80f73376..44e6ba9d3a6b 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -247,12 +247,10 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>   case htons(ETH_P_8021Q): {
>   const struct vlan_hdr *vlan;
>  
> - if (skb_vlan_tag_present(skb))
> + if (skb && skb_vlan_tag_present(skb))
>   proto = skb->protocol;
>  

I was a bit confused that we check skb for VLAN tag again later in the
same block but this time without a NULL check. However, this only
happens when we get called from skb_flow_dissect() or
skb_flow_dissect_flow_keys() variants, which take an skb and pass it to
us.

Feel free to add:

Reviewed-by: Jakub Sitnicki <j...@redhat.com>


Re: [PATCH v2 net-next 1/2] net: centralize net_device min/max MTU checking

2016-09-30 Thread Jakub Sitnicki
On Wed, Sep 28, 2016 at 10:20 PM GMT, Jarod Wilson wrote:
> While looking into an MTU issue with sfc, I started noticing that almost
> every NIC driver with an ndo_change_mtu function implemented almost
> exactly the same range checks, and in many cases, that was the only
> practical thing their ndo_change_mtu function was doing. Quite a few
> drivers have either 68, 64, 60 or 46 as their minimum MTU value checked,
> and then various sizes from 1500 to 65535 for their maximum MTU value. We
> can remove a whole lot of redundant code here if we simple store min_mtu
> and max_mtu in net_device, and check against those in net/core/dev.c's
> dev_set_mtu().
>
> In theory, there should be zero functional change with this patch, it just
> puts the infrastructure in place. Subsequent patches will attempt to start
> using said infrastructure, with theoretically zero change in
> functionality.
>
> CC: "David S. Miller" 
> CC: netdev@vger.kernel.org
> Signed-off-by: Jarod Wilson 
> ---

[...]

> diff --git a/net/core/dev.c b/net/core/dev.c
> index c0c291f..5343799 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6493,9 +6493,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
>   if (new_mtu == dev->mtu)
>   return 0;
>  
> - /*  MTU must be positive.*/
> - if (new_mtu < 0)
> + if (new_mtu < dev->min_mtu) {

Ouch, integral promotions. Looks like you need to keep the < 0 check.
Otherwise new_mtu gets promoted to unsigned int and negative values will
pass the check.

Thanks,
Jakub


Re: [PATCH iproute2] Enable use of extra debugging information

2016-06-22 Thread Jakub Sitnicki
Hi David,

On Wed, Jun 22, 2016 at 01:27 AM CEST, David Ahern  
wrote:
> Add -g flag to builds if DEBUG parameter is set. Improves
> debugging with gdb.
>
> Signed-off-by: David Ahern 
> ---
>  Makefile | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/Makefile b/Makefile
> index 15c81ecfdca3..8e006759079d 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -39,7 +39,11 @@ HOSTCC = gcc
>  DEFINES += -D_GNU_SOURCE
>  # Turn on transparent support for LFS
>  DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
> +ifdef DEBUG
> +CCOPTS = -g
> +else
>  CCOPTS = -O2
> +endif
>  WFLAGS := -Wall -Wstrict-prototypes  -Wmissing-prototypes
>  WFLAGS += -Wmissing-declarations -Wold-style-definition -Wformat=2

This implies a change of optimization level to -O0 when building with
DEBUG set, doesn't it? Was it intentional?

Perhaps it would be less surprising to explicitly set -O0, if that was
your intention. Just a thought.

Thanks,
Jakub


Re: [iproute PATCH v2 7/7] ip/tcp_metrics: Simplify process_msg a bit

2016-06-22 Thread Jakub Sitnicki
On Tue, Jun 21, 2016 at 06:18 PM CEST, Phil Sutter  wrote:
> By combining the attribute extraction and check for existence, the
> additional indentation level in the 'else' clause can be avoided.
>
> In addition to that, common actions for 'daddr' are combined since the
> function returns if neither of the branches are taken.
>
> Signed-off-by: Phil Sutter 
> ---
>  ip/tcp_metrics.c | 45 ++---
>  1 file changed, 18 insertions(+), 27 deletions(-)
>
> diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
> index f82604f458ada..899830c127bcb 100644
> --- a/ip/tcp_metrics.c
> +++ b/ip/tcp_metrics.c
> @@ -112,47 +112,38 @@ static int process_msg(const struct sockaddr_nl *who, 
> struct nlmsghdr *n,
>   parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
>len);
>  
> - a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
> - if (a) {
> + if ((a = attrs[TCP_METRICS_ATTR_ADDR_IPV4])) {

Copy the pointer inside the branch?

Same gain on indentation while keeping checkpatch happy.

I only compile-tested the patch below.

Thanks,
Jakub

diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
index f82604f..ac2613a 100644
--- a/ip/tcp_metrics.c
+++ b/ip/tcp_metrics.c
@@ -112,47 +112,44 @@ static int process_msg(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
 len);
 
-   a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
-   if (a) {
+   if (attrs[TCP_METRICS_ATTR_ADDR_IPV4]) {
if (f.daddr.family && f.daddr.family != AF_INET)
return 0;
+   a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
memcpy(, RTA_DATA(a), 4);
daddr.bytelen = 4;
family = AF_INET;
atype = TCP_METRICS_ATTR_ADDR_IPV4;
dlen = RTA_PAYLOAD(a);
-   } else {
-   a = attrs[TCP_METRICS_ATTR_ADDR_IPV6];
-   if (a) {
-   if (f.daddr.family && f.daddr.family != AF_INET6)
-   return 0;
-   memcpy(, RTA_DATA(a), 16);
-   daddr.bytelen = 16;
-   family = AF_INET6;
-   atype = TCP_METRICS_ATTR_ADDR_IPV6;
-   dlen = RTA_PAYLOAD(a);
-   } else
+   } else if (attrs[TCP_METRICS_ATTR_ADDR_IPV6]) {
+   if (f.daddr.family && f.daddr.family != AF_INET6)
return 0;
+   a = attrs[TCP_METRICS_ATTR_ADDR_IPV6];
+   memcpy(, RTA_DATA(a), 16);
+   daddr.bytelen = 16;
+   family = AF_INET6;
+   atype = TCP_METRICS_ATTR_ADDR_IPV6;
+   dlen = RTA_PAYLOAD(a);
+   } else {
+   return 0;
}
 
-   a = attrs[TCP_METRICS_ATTR_SADDR_IPV4];
-   if (a) {
+   if (attrs[TCP_METRICS_ATTR_SADDR_IPV4]) {
if (f.saddr.family && f.saddr.family != AF_INET)
return 0;
+   a = attrs[TCP_METRICS_ATTR_SADDR_IPV4];
memcpy(, RTA_DATA(a), 4);
saddr.bytelen = 4;
stype = TCP_METRICS_ATTR_SADDR_IPV4;
slen = RTA_PAYLOAD(a);
-   } else {
+   } else if (attrs[TCP_METRICS_ATTR_SADDR_IPV6]) {
+   if (f.saddr.family && f.saddr.family != AF_INET6)
+   return 0;
a = attrs[TCP_METRICS_ATTR_SADDR_IPV6];
-   if (a) {
-   if (f.saddr.family && f.saddr.family != AF_INET6)
-   return 0;
-   memcpy(, RTA_DATA(a), 16);
-   saddr.bytelen = 16;
-   stype = TCP_METRICS_ATTR_SADDR_IPV6;
-   slen = RTA_PAYLOAD(a);
-   }
+   memcpy(, RTA_DATA(a), 16);
+   saddr.bytelen = 16;
+   stype = TCP_METRICS_ATTR_SADDR_IPV6;
+   slen = RTA_PAYLOAD(a);
}
 
if (f.daddr.family && f.daddr.bitlen >= 0 &&


Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-22 Thread Jakub Sitnicki
On Wed, Jun 22, 2016 at 11:29 AM CEST, Phil Sutter <p...@nwl.cc> wrote:
> On Wed, Jun 22, 2016 at 11:12:14AM +0200, Jakub Sitnicki wrote:
>> On Tue, Jun 21, 2016 at 06:18 PM CEST, Phil Sutter <p...@nwl.cc> wrote:
>> > This big patch was compiled by vimgrepping for memset calls and changing
>> > to C99 initializer if applicable. One notable exception is the
>> > initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
>> > for older gcc versions (at least <=3.4.6).
>> >
>> > Calls to memset for struct rtattr pointer fields for parse_rtattr*()
>> > were just dropped since they are not needed.
>> >
>> > The changes here allowed the compiler to discover some unused variables,
>> > so get rid of them, too.
>> >
>> > Signed-off-by: Phil Sutter <p...@nwl.cc>
>> > ---
>> 
>> [...]
>> 
>> > diff --git a/bridge/fdb.c b/bridge/fdb.c
>> > index be849f980a802..a59d6a9c13018 100644
>> > --- a/bridge/fdb.c
>> > +++ b/bridge/fdb.c
>> > @@ -177,16 +177,15 @@ static int fdb_show(int argc, char **argv)
>> >struct nlmsghdr n;
>> >struct ifinfomsgifm;
>> >charbuf[256];
>> > -  } req;
>> > +  } req = {
>> > +  .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
>> > +  .ifm.ifi_family = PF_BRIDGE
>> > +  };
>> >  
>> 
>> A comma is allowed after a list of initializers (IOW, after the last
>> initializer). Having it would make for smaller/cleaner diffs in the
>> future if new structure members get added and need to be initialized.
>
> Good point! I knew about that already, but that's good reasoning why one
> would actually want to do that intentionally.

It also helps us all with the compulsive need for closure ;-)

Thanks,
Jakub


Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-22 Thread Jakub Sitnicki
Hi Phil,

On Tue, Jun 21, 2016 at 06:18 PM CEST, Phil Sutter  wrote:
> This big patch was compiled by vimgrepping for memset calls and changing
> to C99 initializer if applicable. One notable exception is the
> initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
> for older gcc versions (at least <=3.4.6).
>
> Calls to memset for struct rtattr pointer fields for parse_rtattr*()
> were just dropped since they are not needed.
>
> The changes here allowed the compiler to discover some unused variables,
> so get rid of them, too.
>
> Signed-off-by: Phil Sutter 
> ---

[...]

> diff --git a/bridge/fdb.c b/bridge/fdb.c
> index be849f980a802..a59d6a9c13018 100644
> --- a/bridge/fdb.c
> +++ b/bridge/fdb.c
> @@ -177,16 +177,15 @@ static int fdb_show(int argc, char **argv)
>   struct nlmsghdr n;
>   struct ifinfomsgifm;
>   charbuf[256];
> - } req;
> + } req = {
> + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
> + .ifm.ifi_family = PF_BRIDGE
> + };
>  

A comma is allowed after a list of initializers (IOW, after the last
initializer). Having it would make for smaller/cleaner diffs in the
future if new structure members get added and need to be initialized.

Thanks,
Jakub


[PATCH net] ipv6: Skip XFRM lookup if dst_entry in socket cache is valid

2016-06-08 Thread Jakub Sitnicki
At present we perform an xfrm_lookup() for each UDPv6 message we
send. The lookup involves querying the flow cache (flow_cache_lookup)
and, in case of a cache miss, creating an XFRM bundle.

If we miss the flow cache, we can end up creating a new bundle and
deriving the path MTU (xfrm_init_pmtu) from on an already transformed
dst_entry, which we pass from the socket cache (sk->sk_dst_cache) down
to xfrm_lookup(). This can happen only if we're caching the dst_entry
in the socket, that is when we're using a connected UDP socket.

To put it another way, the path MTU shrinks each time we miss the flow
cache, which later on leads to incorrectly fragmented payload. It can
be observed with ESPv6 in transport mode:

  1) Set up a transformation and lower the MTU to trigger fragmentation
# ip xfrm policy add dir out src ::1 dst ::1 \
  tmpl src ::1 dst ::1 proto esp spi 1
# ip xfrm state add src ::1 dst ::1 \
  proto esp spi 1 enc 'aes' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b
# ip link set dev lo mtu 1500

  2) Monitor the packet flow and set up an UDP sink
# tcpdump -ni lo -ttt &
# socat udp6-listen:12345,fork /dev/null &

  3) Send a datagram that needs fragmentation with a connected socket
# perl -e 'print "@" x 1470 | socat - udp6:[::1]:12345
2016/06/07 18:52:52 socat[724] E read(3, 0x555bb3d5ba00, 8192): Protocol 
error
00:00:00.00 IP6 ::1 > ::1: frag (0|1448) ESP(spi=0x0001,seq=0x2), 
length 1448
00:00:00.14 IP6 ::1 > ::1: frag (1448|32)
00:00:00.50 IP6 ::1 > ::1: ESP(spi=0x0001,seq=0x3), length 1272
(^ ICMPv6 Parameter Problem)
00:00:00.22 IP6 ::1 > ::1: ESP(spi=0x0001,seq=0x5), length 136

  4) Compare it to a non-connected socket
# perl -e 'print "@" x 1500' | socat - udp6-sendto:[::1]:12345
00:00:40.535488 IP6 ::1 > ::1: frag (0|1448) ESP(spi=0x0001,seq=0x6), 
length 1448
00:00:00.10 IP6 ::1 > ::1: frag (1448|64)

What happens in step (3) is:

  1) when connecting the socket in __ip6_datagram_connect(), we
 perform an XFRM lookup, miss the flow cache, create an XFRM
 bundle, and cache the destination,

  2) afterwards, when sending the datagram, we perform an XFRM lookup,
 again, miss the flow cache (due to mismatch of flowi6_iif and
 flowi6_oif, which is an issue of its own), and recreate an XFRM
 bundle based on the cached (and already transformed) destination.

To prevent the recreation of an XFRM bundle, avoid an XFRM lookup
altogether whenever we already have a destination entry cached in the
socket. This prevents the path MTU shrinkage and brings us on par with
UDPv4.

The fix also benefits connected PINGv6 sockets, another user of
ip6_sk_dst_lookup_flow(), who also suffer messages being transformed
twice.

Joint work with Hannes Frederic Sowa.

Reported-by: Jan Tluka <jtl...@redhat.com>
Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/ip6_output.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6b2f60a..fd32175 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1071,17 +1071,12 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock 
*sk, struct flowi6 *fl6,
 const struct in6_addr *final_dst)
 {
struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
-   int err;
 
dst = ip6_sk_dst_check(sk, dst, fl6);
+   if (!dst)
+   dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
 
-   err = ip6_dst_lookup_tail(sock_net(sk), sk, , fl6);
-   if (err)
-   return ERR_PTR(err);
-   if (final_dst)
-   fl6->daddr = *final_dst;
-
-   return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 
0);
+   return dst;
 }
 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
 
-- 
2.5.5



Re: [PATCHv3 net-next] net: use jiffies_to_msecs to replace EXPIRES_IN_MS in inet/sctp_diag

2016-04-19 Thread Jakub Sitnicki
On Tue, Apr 19, 2016 at 09:10 AM CEST, Xin Long <lucien@gmail.com> wrote:
> EXPIRES_IN_MS macro comes from net/ipv4/inet_diag.c and dates
> back to before jiffies_to_msecs() has been introduced.
>
> Now we can remove it and use jiffies_to_msecs().
>
> Signed-off-by: Xin Long <lucien@gmail.com>
> ---

Acked-by: Jakub Sitnicki <j...@redhat.com>


Re: [PATCHv3 net-next 4/6] sctp: add the sctp_diag.c file

2016-04-14 Thread Jakub Sitnicki
Hi Xin,

On Thu, Apr 14, 2016 at 09:35 AM CEST, Xin Long  wrote:
> This one will implement all the interface of inet_diag, inet_diag_handler.
> which includes sctp_diag_dump, sctp_diag_dump_one and sctp_diag_get_info.
>
> It will work as a module, and register inet_diag_handler when loading.
>
> v2->v3:
> - fix the mistake in inet_assoc_attr_size().
>
> - change inet_diag_msg_laddrs_fill() name to inet_diag_msg_sctpladdrs_fill.
>
> - change inet_diag_msg_paddrs_fill() name to inet_diag_msg_sctpaddrs_fill.
>
> - add inet_diag_msg_sctpinfo_fill() to make asoc/ep fill code clearer.
>
> - add inet_diag_msg_sctpasoc_fill() to make asoc fill code clearer.
>
> - merge inet_asoc_diag_fill() and inet_ep_diag_fill() to
>   inet_sctp_diag_fill().
>
> - call sctp_diag_get_info() directly, instead by handler, cause the caller
>   is in the same file with it.
>
> - call lock_sock in sctp_tsp_dump_one() to make sure we call get sctp info
>   safely.
>
> - after lock_sock(sk), we should check sk != assoc->base.sk.
>
> - change mem[SK_MEMINFO_WMEM_ALLOC] to asoc->sndbuf_used for asoc dump when
>   asoc->ep->sndbuf_policy is set. don't use INET_DIAG_MEMINFO attr any more.
>
> Signed-off-by: Xin Long 
> ---

[...]

> diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
> new file mode 100644
> index 000..98ecd16
> --- /dev/null
> +++ b/net/sctp/sctp_diag.c
> @@ -0,0 +1,497 @@

[...]

> +#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
> + r->idiag_expires =
> + EXPIRES_IN_MS(asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX]);
> +#undef EXPIRES_IN_MS

How about using jiffies_to_msecs() here? It's already being used
elsewhere in net/sctp.

AFAICT EXPIRES_IN_MS macro comes from net/ipv4/inet_diag.c and dates
back to before jiffies_to_msecs() has been introduced. Perhaps that's
why it's still used in inet_diag.c. But this is new code.

Thanks,
Jakub


Re: [PATCH v2 2/2] sctp: delay calls to sk_data_ready() as much as possible

2016-04-07 Thread Jakub Sitnicki
On Wed, Apr 06, 2016 at 07:53 PM CEST, Marcelo Ricardo Leitner 
 wrote:
> Currently, the processing of multiple chunks in a single SCTP packet
> leads to multiple calls to sk_data_ready, causing multiple wake up
> signals which are costly and doesn't make it wake up any faster.
>
> With this patch it will notice that the wake up is pending and will do it
> before leaving the state machine interpreter, latest place possible to
> do it realiably and cleanly.
>
> Note that sk_data_ready events are not dependent on asocs, unlike waking
> up writers.
>
> Signed-off-by: Marcelo Ricardo Leitner 
> ---

[...]

> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> index 
> 7fe56d0acabf66cfd8fe29dfdb45f7620b470ac7..e7042f9ce63b0cfca50cae252f51b60b68cb5731
>  100644
> --- a/net/sctp/sm_sideeffect.c
> +++ b/net/sctp/sm_sideeffect.c
> @@ -1742,6 +1742,11 @@ out:
>   error = sctp_outq_uncork(>outqueue, gfp);
>   } else if (local_cork)
>   error = sctp_outq_uncork(>outqueue, gfp);
> +
> + if (sctp_sk(ep->base.sk)->pending_data_ready) {
> + ep->base.sk->sk_data_ready(ep->base.sk);
> + sctp_sk(ep->base.sk)->pending_data_ready = 0;
> + }
>   return error;
>  nomem:
>   error = -ENOMEM;

Would it make sense to introduce a local variable for ep->base.sk (and
make this function 535+1 lines long ;-)

  struct sock *sk = ep->base.sk;

... like sctp_ulpq_tail_event() does?

Thanks,
Jakub


[PATCH net] ipv6: Count in extension headers in skb->network_header

2016-04-05 Thread Jakub Sitnicki
When sending a UDPv6 message longer than MTU, account for the length
of fragmentable IPv6 extension headers in skb->network_header offset.
Same as we do in alloc_new_skb path in __ip6_append_data().

This ensures that later on __ip6_make_skb() will make space in
headroom for fragmentable extension headers:

/* move skb->data to ip header from ext header */
if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb));

Prevents a splat due to skb_under_panic:

skbuff: skb_under_panic: text:8143397b len:2126 put:14 \
head:880005bacf50 data:880005bacf4a tail:0x48 end:0xc0 dev:lo
[ cut here ]
kernel BUG at net/core/skbuff.c:104!
invalid opcode:  [#1] KASAN
CPU: 0 PID: 160 Comm: reproducer Not tainted 4.6.0-rc2 #65
[...]
Call Trace:
 [] skb_push+0x79/0x80
 [] eth_header+0x2b/0x100
 [] neigh_resolve_output+0x210/0x310
 [] ip6_finish_output2+0x4a7/0x7c0
 [] ip6_output+0x16a/0x280
 [] ip6_local_out+0xb1/0xf0
 [] ip6_send_skb+0x45/0xd0
 [] udp_v6_send_skb+0x246/0x5d0
 [] udpv6_sendmsg+0xa6e/0x1090
[...]

Reported-by: Ji Jianwen <j...@redhat.com>
Signed-off-by: Jakub Sitnicki <j...@redhat.com>
Acked-by: Hannes Frederic Sowa <han...@stressinduktion.org>
---

Can be reproduced by sending a UDPv6 message longer than MTU when
Destination Options are present, as shown below.

Original reproducer has been developed by Ji Jianwen.  Cut down
version included.

# ip link set dev lo mtu 1500
# ./reproducer 0 1024   # works
# ./reproducer 8 1024   # works
# ./reproducer 64 1024  # works
# ./reproducer 0 2048   # works
# ./reproducer 8 2048   # crash
# ./reproducer 64 2048  # crash


/* reproducer.c */

#include 
#include 
#include 
#include 
#include 
#include 
#include 

static void set_dstopts(int s, int len)
{
char *dstopts;
int r;

assert(len % 8 == 0);

dstopts = calloc(len, 1);
dstopts[1] = len / 8 - 1; /* Hdr Ext Len */
dstopts[2] = 1;   /* PadN Option */
dstopts[3] = len - 4; /* Opt Data Len */

r = setsockopt(s, IPPROTO_IPV6, IPV6_DSTOPTS, dstopts, len);
if (r < 0) {
perror("setsockopt");
exit(1);
}

free(dstopts);
}

static void do_send(int s, const struct addrinfo *ai, int len)
{
struct msghdr msg;
struct iovec iov[1];
char *data;
int r;

data = malloc(len);
memset(data, 'A', len);
memset(, 0, sizeof(msg));
iov[0].iov_base = data;
iov[0].iov_len = len;

msg.msg_name = ai->ai_addr;
msg.msg_namelen = ai->ai_addrlen;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
msg.msg_controllen = 0;

r = sendmsg(s, , 0);
if (r < 0) {
perror("sendmsg");
exit(1);
}

free(data);
}

int main(int argc, char *argv[])
{
struct addrinfo *ai = NULL;
int dstopts_len, data_len;
int r, s;

if (argc != 3) {
fprintf(stderr, "Usage: %s  \n", 
argv[0]);
return 1;
}

dstopts_len = atoi(argv[1]);
data_len = atoi(argv[2]);

r = getaddrinfo("::1", "12345", NULL, );
assert(r == 0);

s = socket(ai->ai_family, SOCK_DGRAM, IPPROTO_UDP);
assert(s != -1);

if (dstopts_len > 0)
set_dstopts(s, dstopts_len);
do_send(s, ai, data_len);

freeaddrinfo(ai);

return 0;
}


 net/ipv6/ip6_output.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9428345..bc972e7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1090,8 +1090,8 @@ static inline int ip6_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
-   int transhdrlen, int mtu, unsigned int flags,
-   const struct flowi6 *fl6)
+   int exthdrlen, int transhdrlen, int mtu,
+   unsigned int flags, const struct flowi6 *fl6)
 
 {
struct sk_buff *skb;
@@ -1116,7 +1116,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb_put(skb, fragheaderlen + transhdrlen);
 
/* initialize network header pointer */
-   skb_reset_network_header(skb);
+   skb_set_network_header(skb, exthdrlen);
 
/* initialize protocol header pointer */
skb->transport_header = skb->network_header + fragheaderlen;
@@ -1358,7 +1358,7