Hi!

This is not a direct follow-up to the thread "BIRD trying to reinsert
existing kernel routes, netlink issue?" (
https://www.mail-archive.com/bird-users@network.cz/msg05429.html ), but
I think that the cause of my issue may be the same.

My scenario: I have a server in a lab that has a public IP on a GRE
tunnel interface, and the tunnel goes to a router on the Internet. BIRD
receives a full BGP feed over the GRE tunnel. The tunnel is terminated
in a VRF, and BIRD exports the BGP feed to the kernel routing table
associated
with the VRF. I observed pretty much the same set of issues as Sasha
and others described in the linked thread two years ago: BIRD starts
normally, the BGP session comes up, but after 3 or 5 minutes it maxes
out one CPU, mostly in sys time (doing the netlink I/O). Then BIRD stops
responding normally over birdc - each command takes several minutes to
complete - and I also saw log messages like:

Jan  2 00:00:35 lab bird: Netlink: File exists
Jan  2 00:00:35 lab bird: Netlink: File exists
Jan  2 00:00:35 lab bird: ...
Jan  2 00:00:35 lab bird: I/O loop cycle took 73127 ms for 11 events

I realized that BIRD actually transfers huge amounts of data from the
kernel over netlink (several gigabytes of messages per minute or so,
which translates to hundreds of millions of FIB records) at the
beginning of each sync. This ultimately stalls BIRD's I/O long enough
to miss BGP keepalive deadlines, and the sessions start flapping, which
makes the situation hard to understand using conventional tools
(profiling, tcpdump of the netlink dump, etc.). FYI, I am attaching my
quick and dirty patch to BIRD that I used to collect stats from
netlink interactions to understand the problem and, finally, to add an
experimental PoC fix - see nl_request_rt_dump() in the patch.

The large table that BIRD pulled from the kernel was an FNHE table,
where Linux collects PMTU records for *all* destination IPs that are
routed to the tunnel (which does not seem to be right, and I will
discuss it on LKML shortly). These records have a (default) 600 s
expiration time, and in my scenario I happen to receive some
backscatter traffic that in most cases gets ICMP or TCP reset
responses, which could ultimately create millions of these records in a
few minutes.

The reason why this problem occurred only in Linux ~5.2+ lies in the
patch
https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd6793fcf480c6c1.1561131177.git.sbri...@redhat.com/
that changed the semantics of netlink dump requests. Now the kernel
dumps the FIB Next Hop Exceptions (FNHE) table (previously known as the
route cache) alongside the routing table unless the requester sets the
socket option NETLINK_GET_STRICT_CHK and clears the flag RTM_F_CLONED in
the dump request. BIRD does not apply these filters, so the kernel dumps
everything. But iproute2 and other programs that use netlink utilize
the filters, so no similar performance issue occurs unless I
explicitly dump the FNHE table (ip route show cache).

I believe that many different types of Linux tunnels create PMTU
records for all packets transmitted over the tunnel as well. And it
has worked like that for a long time - the code that creates the route
cache records (the route cache is now the FNHE table) was introduced
in Linux 3.10
(https://elixir.bootlin.com/linux/v3.10/source/net/ipv4/ip_tunnel.c#L591).

Regardless of what may or may not happen on the kernel side, I think
that implementing the netlink filter in BIRD to avoid the described
situation makes sense. I am almost certain that my experimental fix
breaks other things (most likely OSPF) but I would be glad to help
make it right.

What do you think?

Best regards,
Tomas
From a9868dff404e7c6bec151a2942a699b0ad8f7222 Mon Sep 17 00:00:00 2001
From: Tomas Hlavacek <tmshlvck@gmail.com>
Date: Wed, 5 Jan 2022 02:22:31 +0100
Subject: [PATCH] EXPERIMENT: Add stats print and strict dump over NL

This patch is EXPERIMENTAL, it breaks multiple things, among others OSPF
seems to be broken by the strict netlink filtering.

This serves as a PoC for analysis of problems described in
https://www.mail-archive.com/bird-users@network.cz/msg05577.html
---
 sysdep/linux/netlink.c | 176 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 163 insertions(+), 13 deletions(-)

diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index f85bcf35..ed674ecf 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -32,6 +32,7 @@
 #include <linux/if.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <sys/time.h>
 
 #ifdef HAVE_MPLS_KERNEL
 #include <linux/lwtunnel.h>
@@ -74,6 +75,8 @@
 
 const int rt_default_ecmp = 16;
 
+int rcvd=0;
+
 /*
  * Structure nl_parse_state keeps state of received route processing. Ideally,
  * we could just independently parse received Netlink messages and immediately
@@ -128,7 +131,8 @@ struct nl_sock
   uint last_size;
 };
 
-#define NL_RX_SIZE 8192
+//#define NL_RX_SIZE 8192
+#define NL_RX_SIZE 32768
 
 #define NL_OP_DELETE	0
 #define NL_OP_ADD	(NLM_F_CREATE|NLM_F_EXCL)
@@ -143,11 +147,22 @@ static struct nl_sock nl_req  = {.fd = -1};	/* Netlink socket for requests */
 static void
 nl_open_sock(struct nl_sock *nl)
 {
+  int sndbuf = 32768;
+  int rcvbuf = 1024*1024;
+  int one = 1;
+
   if (nl->fd < 0)
     {
-      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+      nl->fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
       if (nl->fd < 0)
 	die("Unable to open rtnetlink socket: %m");
+
+
+      setsockopt(nl->fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf));
+      setsockopt(nl->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
+      setsockopt(nl->fd, SOL_NETLINK, NETLINK_EXT_ACK, &one, sizeof(one));
+      setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &one, sizeof(one));
+
       nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
       nl->rx_buffer = xmalloc(NL_RX_SIZE);
       nl->last_hdr = NULL;
@@ -192,6 +207,36 @@ nl_request_dump(int af, int cmd)
   nl_send(&nl_scan, &req.nh);
 }
 
+static void
+nl_request_rt_dump(int af)
+{
+  struct {
+    struct nlmsghdr nh;
+    struct rtmsg rtm;
+    char buf[128];
+  } req = {
+    .nh.nlmsg_type = RTM_GETROUTE,
+    .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
+    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+    .nh.nlmsg_seq = ++(nl_scan.seq),
+    .nh.nlmsg_pid = 0,
+  };
+  struct rtmsg *rtm = NLMSG_DATA(&req.nh);
+  rtm->rtm_protocol = 0;
+//  rtm->rtm_family = af;
+  rtm->rtm_family = 0;
+//  rtm->rtm_flags = 0;
+
+  printf("reqdump: rtm_flags=%d nlmsg_len=%d\n", req.rtm.rtm_flags, req.nh.nlmsg_len);
+//  nl_send(&nl_scan, &req.nh);
+  for(int i = 0; i<sizeof(req); i+=4)
+     printf("%08X ", ((char *)&req)[i]); 
+  printf("\n");
+  send(nl_scan.fd, &req, sizeof(req), 0);
+  nl_scan.last_hdr = NULL;
+}
+
+
 static struct nlmsghdr *
 nl_get_reply(struct nl_sock *nl)
 {
@@ -208,6 +253,7 @@ nl_get_reply(struct nl_sock *nl)
 	    .msg_iovlen = 1,
 	  };
 	  int x = recvmsg(nl->fd, &m, 0);
+          rcvd++;
 	  if (x < 0)
 	    die("nl_get_reply: %m");
 	  if (sa.nl_pid)		/* It isn't from the kernel */
@@ -258,6 +304,14 @@ nl_error(struct nlmsghdr *h, int ignore_esrch)
   return ec;
 }
 
+static long long current_timestamp(void) {
+    struct timeval te; 
+    gettimeofday(&te, NULL); // get current time
+    long long milliseconds = te.tv_sec*1000LL + te.tv_usec/1000; // calculate milliseconds
+    // printf("milliseconds: %lld\n", milliseconds);
+    return milliseconds;
+}
+
 static struct nlmsghdr *
 nl_get_scan(void)
 {
@@ -1509,6 +1563,28 @@ nl_parse_end(struct nl_parse_state *s)
 
 #define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
 
+int ncheckin=0;
+int n4parse=0;
+int mytab=0;
+int rtnuni=0;
+int rtnblack=0;
+int rtnunreach=0;
+int rtnprohi=0;
+int rtndef=0;
+
+int utab=0;
+int spfnadr=0;
+int iff=0;
+int tos=0;
+int delrt=0;
+ 
+int cls=0;
+int punspec=0;
+int predir=0;
+int pkern=0;
+int pbird=0;
+int pboot=0;
+
 static void
 nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 {
@@ -1524,14 +1600,18 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
   u32 def_scope = RT_SCOPE_UNIVERSE;
   int krt_src;
 
-  if (!(i = nl_checkin(h, sizeof(*i))))
+  if (!(i = nl_checkin(h, sizeof(*i)))){
+    ncheckin++;
     return;
+  }
 
   switch (i->rtm_family)
     {
     case AF_INET:
-      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
+      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a))){
+        n4parse++;
 	return;
+      }
 
       if (a[RTA_DST])
 	net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
@@ -1581,44 +1661,63 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
   else
     table_id = i->rtm_table;
 
+  if (table_id == 101)
+    mytab++;
+
   /* Do we know this table? */
   p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
-  if (!p)
+  if (!p){
+    utab++;
     SKIP("unknown table %u\n", table_id);
+  }
 
-  if (a[RTA_SRC] && (p->p.net_type != NET_IP6_SADR))
+  if (a[RTA_SRC] && (p->p.net_type != NET_IP6_SADR)){
+    spfnadr++;
     SKIP("src prefix for non-SADR channel\n");
+  }
 
-  if (a[RTA_IIF])
+  if (a[RTA_IIF]){
+    iff++;
     SKIP("IIF set\n");
+  }
 
-  if (i->rtm_tos != 0)			/* We don't support TOS */
+  if (i->rtm_tos != 0){			/* We don't support TOS */
+    tos++;
     SKIP("TOS %02x\n", i->rtm_tos);
+  }
 
-  if (s->scan && !new)
+  if (s->scan && !new){
+    delrt++;
     SKIP("RTM_DELROUTE in scan\n");
+  }
 
   if (a[RTA_PRIORITY])
     priority = rta_get_u32(a[RTA_PRIORITY]);
 
   int c = net_classify(&dst);
-  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
+  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)){
+    cls++;
     SKIP("strange class/scope\n");
+  }
 
   switch (i->rtm_protocol)
     {
     case RTPROT_UNSPEC:
+      punspec++;
       SKIP("proto unspec\n");
 
     case RTPROT_REDIRECT:
+      predir++;
       krt_src = KRT_SRC_REDIRECT;
       break;
 
     case RTPROT_KERNEL:
+      pkern++;
       krt_src = KRT_SRC_KERNEL;
       return;
 
     case RTPROT_BIRD:
+      pbird++;
       if (!s->scan)
 	SKIP("echo\n");
       krt_src = KRT_SRC_BIRD;
@@ -1626,6 +1725,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 
     case RTPROT_BOOT:
     default:
+      pboot++;
       krt_src = KRT_SRC_ALIEN;
     }
 
@@ -1650,6 +1750,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
   switch (i->rtm_type)
     {
     case RTN_UNICAST:
+      rtnuni++;
       ra->dest = RTD_UNICAST;
 
       if (a[RTA_MULTIPATH])
@@ -1703,16 +1804,20 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
 
       break;
     case RTN_BLACKHOLE:
+      rtnblack++;
       ra->dest = RTD_BLACKHOLE;
       break;
     case RTN_UNREACHABLE:
+      rtnunreach++;
       ra->dest = RTD_UNREACHABLE;
       break;
     case RTN_PROHIBIT:
+      rtnprohi++;
       ra->dest = RTD_PROHIBIT;
       break;
     /* FIXME: What about RTN_THROW? */
     default:
+      rtndef++;
       SKIP("type %d\n", i->rtm_type);
       return;
     }
@@ -1864,12 +1969,57 @@ krt_do_scan(struct krt_proto *p UNUSED)	/* CONFIG_ALL_TABLES_AT_ONCE => p is NUL
   struct nl_parse_state s;
 
   nl_parse_begin(&s, 1);
-  nl_request_dump(AF_UNSPEC, RTM_GETROUTE);
-  while (h = nl_get_scan())
-    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
+  //nl_request_dump(AF_UNSPEC, RTM_GETROUTE);
+  printf("req_dump: %lld\n", current_timestamp());
+  nl_request_rt_dump(AF_INET);
+  int m=0;
+  int nr=0;
+  int dr=0;
+  while (1){
+    h = nl_get_scan();
+    m++;
+    if (!h)
+      break;
+    if (h->nlmsg_type == RTM_NEWROUTE){
+      nl_parse_route(&s, h);
+      nr++;
+    } else
+    if (h->nlmsg_type == RTM_DELROUTE){
       nl_parse_route(&s, h);
+      dr++;
+    }
     else
       log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
+  }
+  printf("scan end: %lld rcvd=%d m=%d nr=%d dr=%d ncheckin=%d n4parse=%d mytab=%d rtnuni=%d rtnblack=%d rtnunreach=%d rtnprohi=%d rtndef=%d \n", current_timestamp(), rcvd, m, nr, dr, ncheckin, n4parse, mytab, rtnuni, rtnblack, rtnunreach, rtnprohi, rtndef);
+
+  printf("scan end2: utab=%d spfnadr=%d iff=%d tos=%d delrt=%d\n", utab, spfnadr, iff, tos, delrt);
+  printf("scan end3: cls=%d punspec=%d predir=%d pkern=%d pbird=%d pboot=%d\n", cls, punspec, predir, pkern, pbird, pboot);
+
+  rcvd=0;
+  utab=0;
+  spfnadr=0;
+  iff=0;
+  tos=0;
+  delrt=0;
+ 
+  cls=0;
+  punspec=0;
+  predir=0;
+  pkern=0;
+  pbird=0;
+  pboot=0;
+
+
+  ncheckin=0;
+  n4parse=0;
+  mytab=0;
+  rtnuni=0;
+  rtnblack=0;
+  rtnunreach=0;
+  rtnprohi=0;
+  rtndef=0;
+ 
   nl_parse_end(&s);
 }
 
-- 
2.30.2

Reply via email to