[PATCH v3 bpf-next 11/11] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

2021-04-20 Thread Kuniyuki Iwashima
This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE and
removes 'static' from settimeo() in network_helpers.c.

Signed-off-by: Kuniyuki Iwashima 
---
 tools/testing/selftests/bpf/network_helpers.c |   2 +-
 tools/testing/selftests/bpf/network_helpers.h |   1 +
 .../bpf/prog_tests/migrate_reuseport.c| 483 ++
 .../bpf/progs/test_migrate_reuseport.c|  51 ++
 4 files changed, 536 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_migrate_reuseport.c

diff --git a/tools/testing/selftests/bpf/network_helpers.c 
b/tools/testing/selftests/bpf/network_helpers.c
index 12ee40284da0..2060bc122c53 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = {
.tcp.doff = 5,
 };
 
-static int settimeo(int fd, int timeout_ms)
+int settimeo(int fd, int timeout_ms)
 {
struct timeval timeout = { .tv_sec = 3 };
 
diff --git a/tools/testing/selftests/bpf/network_helpers.h 
b/tools/testing/selftests/bpf/network_helpers.h
index 7205f8afdba1..5e0d51c07b63 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -33,6 +33,7 @@ struct ipv6_packet {
 } __packed;
 extern struct ipv6_packet pkt_v6;
 
+int settimeo(int fd, int timeout_ms);
 int start_server(int family, int type, const char *addr, __u16 port,
 int timeout_ms);
 int connect_to_fd(int server_fd, int timeout_ms);
diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c 
b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
new file mode 100644
index ..726f6380390a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check if we can migrate child sockets.
+ *
+ *   1. call listen() for 5 server sockets.
+ *   2. update a map to migrate all child sockets
+ *to the last server socket (migrate_map[cookie] = 4)
+ *   3. call connect() for 25 client sockets.
+ *   4. call shutdown() for first 4 server sockets
+ *and migrate the requests in the accept queue
+ *to the last server socket.
+ *   5. call listen() for the second server socket.
+ *   6. call shutdown() for the last server
+ *and migrate the requests in the accept queue
+ *to the second server socket.
+ *   7. call listen() for the last server.
+ *   8. call shutdown() for the second server
+ *and migrate the requests in the accept queue
+ *to the last server socket.
+ *   9. call accept() for the last server socket.
+ *
+ * Author: Kuniyuki Iwashima 
+ */
+
+#include 
+#include 
+
+#include "test_progs.h"
+#include "test_migrate_reuseport.skel.h"
+#include "network_helpers.h"
+
+#define NR_SERVERS 5
+#define NR_CLIENTS (NR_SERVERS * 5)
+#define MIGRATED_TO (NR_SERVERS - 1)
+
+/* fastopenq->max_qlen and sk->sk_max_ack_backlog */
+#define QLEN (NR_CLIENTS * 5)
+
+#define MSG "Hello World"
+#define MSGLEN 12
+
+struct migrate_reuseport_test_case {
+   const char *name;
+   __s64 servers[NR_SERVERS];
+   __s64 clients[NR_CLIENTS];
+   struct sockaddr_storage addr;
+   socklen_t addrlen;
+   int family;
+   bool drop_ack;
+   bool expire_synack_timer;
+   bool fastopen;
+} test_cases[] = {
+   {
+   .name = "IPv4 - TCP_ESTABLISHED - inet_csk_listen_stop",
+   .family = AF_INET,
+   .drop_ack = false,
+   .expire_synack_timer = false,
+   .fastopen = false,
+   },
+   {
+   .name = "IPv4 - TCP_SYN_RECV - inet_csk_listen_stop",
+   .family = AF_INET,
+   .drop_ack = true,
+   .expire_synack_timer = false,
+   .fastopen = true,
+   },
+   {
+   .name = "IPv4 - TCP_NEW_SYN_RECV - inet_csk_complete_hashdance",
+   .family = AF_INET,
+   .drop_ack = true,
+   .expire_synack_timer = false,
+   .fastopen = false,
+   },
+   {
+   .name = "IPv4 - TCP_NEW_SYN_RECV - reqsk_timer_handler",
+   .family = AF_INET,
+   .drop_ack = true,
+   .expire_synack_timer = true,
+   .fastopen = false,
+   },
+   {
+   .name = "IPv6 - TCP_ESTABLISHED - inet_csk_listen_stop",
+   .family = AF_INET6,
+   .drop_ack = false,
+   .expire_synack_timer = false,
+   .fastopen = false,
+   },
+   {
+   .name = "IPv6 - TCP_SYN_RECV - inet_csk_listen_stop",
+   .family = AF_INET6,
+   .drop_ack = true,
+   .expire_

[PATCH v3 bpf-next 10/11] libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT.

2021-04-20 Thread Kuniyuki Iwashima
This commit introduces a new section (sk_reuseport/migrate) and sets an
expected_attach_type for each sk_reuseport section of BPF_PROG_TYPE_SK_REUSEPORT
programs: BPF_SK_REUSEPORT_SELECT_OR_MIGRATE for the new section and
BPF_SK_REUSEPORT_SELECT for the existing one.
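
For illustration only (not part of this patch), a minimal sketch of a program
that would be picked up by the new section definition; the program name and
the trivial pass-everything body are assumptions:

/* Hypothetical minimal program: placing it in the new section means
 * libbpf sets expected_attach_type = BPF_SK_REUSEPORT_SELECT_OR_MIGRATE
 * at load time; a plain SEC("sk_reuseport") program keeps
 * BPF_SK_REUSEPORT_SELECT.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_reuseport/migrate")
int pass_all(struct sk_reuseport_md *reuse_md)
{
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";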

Signed-off-by: Kuniyuki Iwashima 
---
 tools/lib/bpf/libbpf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7aad78dbb4b4..6ba84111e143 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8765,7 +8765,10 @@ static struct bpf_link *attach_iter(const struct 
bpf_sec_def *sec,
 
 static const struct bpf_sec_def section_defs[] = {
BPF_PROG_SEC("socket",  BPF_PROG_TYPE_SOCKET_FILTER),
-   BPF_PROG_SEC("sk_reuseport",BPF_PROG_TYPE_SK_REUSEPORT),
+   BPF_EAPROG_SEC("sk_reuseport/migrate",  BPF_PROG_TYPE_SK_REUSEPORT,
+   
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE),
+   BPF_EAPROG_SEC("sk_reuseport",  BPF_PROG_TYPE_SK_REUSEPORT,
+   BPF_SK_REUSEPORT_SELECT),
SEC_DEF("kprobe/", KPROBE,
.attach_fn = attach_kprobe),
BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE),
-- 
2.30.2



[PATCH v3 bpf-next 09/11] bpf: Support socket migration by eBPF.

2021-04-20 Thread Kuniyuki Iwashima
This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT
to check if the attached eBPF program is capable of migrating sockets. When
the eBPF program is attached, we run it for socket migration if the
expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or
net.ipv4.tcp_migrate_req is enabled.

Currently, the expected_attach_type is not enforced for the
BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the
earlier idea in the commit aac3fc320d94 ("bpf: Post-hooks for sys_bind") to
fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type().

Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to
select a new listener based on the child socket. migrating_sk varies
depending on whether it is migrating a request in the accept queue or one
still in the 3WHS:

  - accept_queue : sock (ESTABLISHED/SYN_RECV)
  - 3WHS : request_sock (NEW_SYN_RECV)

In the eBPF program, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

  - SK_PASS with selected_sk, select it as the new listener
  - SK_PASS with selected_sk NULL, fall back to the random selection
  - SK_DROP, cancel the migration.
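
As a hedged sketch of the return-value contract above (the reuseport_map and
the fixed key are assumptions for illustration, not part of this patch), a
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE program could look roughly like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 16);
	__type(key, __u32);
	__type(value, __u64);
} reuseport_map SEC(".maps");

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	__u32 key = 0;	/* illustrative target listener slot */

	/* migrating_sk == NULL: ordinary SYN selection.  Returning SK_PASS
	 * without selecting a socket falls back to the default selection.
	 */
	if (!reuse_md->migrating_sk)
		return SK_PASS;

	/* migrating_sk != NULL: reuse_md->sk is the closed listener and
	 * migrating_sk is the child (or request) socket being migrated.
	 */
	if (bpf_sk_select_reuseport(reuse_md, &reuseport_map, &key, 0) == 0)
		return SK_PASS;	/* migrate to the selected listener */

	return SK_DROP;		/* cancel the migration */
}

char _license[] SEC("license") = "GPL";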

One noteworthy point: we select a listening socket in three places, but we
do not have a struct sk_buff when closing a listener or retransmitting a
SYN+ACK. On the other hand, some helper functions do not expect skb to be
NULL (e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(),
skb_tail_pointer() in BPF_FUNC_skb_load_bytes_relative()). So we allocate an
empty skb temporarily before running the eBPF program.

Link: 
https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
Link: 
https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw...@kafai-mbp.dhcp.thefacebook.com/
Link: 
https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmo...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/linux/bpf.h|  1 +
 include/linux/filter.h |  2 ++
 include/uapi/linux/bpf.h   | 15 +++
 kernel/bpf/syscall.c   | 13 +
 net/core/filter.c  | 13 -
 net/core/sock_reuseport.c  | 34 ++
 tools/include/uapi/linux/bpf.h | 15 +++
 7 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c9b7a876b0c8..40b9f074a929 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2014,6 +2014,7 @@ struct sk_reuseport_kern {
struct sk_buff *skb;
struct sock *sk;
struct sock *selected_sk;
+   struct sock *migrating_sk;
void *data_end;
u32 hash;
u32 reuseport_id;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9a09547bc7ba..226f76c0b974 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -995,11 +995,13 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock 
*sk,
  struct bpf_prog *prog, struct sk_buff *skb,
+ struct sock *migrating_sk,
  u32 hash);
 #else
 static inline struct sock *
 bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 struct bpf_prog *prog, struct sk_buff *skb,
+struct sock *migrating_sk,
 u32 hash)
 {
return NULL;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ab8584701334..3888641c0d39 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -958,6 +958,8 @@ enum bpf_attach_type {
BPF_SK_LOOKUP,
BPF_XDP,
BPF_SK_SKB_VERDICT,
+   BPF_SK_REUSEPORT_SELECT,
+   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -5299,7 +5301,20 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   /* When reuse->migrating_sk is NULL, it is selecting a sk for the
+* new incoming connection request (e.g. selecting a listen sk for
+* the received SYN in the TCP case).  reuse->sk is one of the sk
+* in the reuseport group. The bpf prog can use reuse->sk to learn
+* the local listening ip/port without looking into the skb.
+*
+* When reuse->migrating_sk is not NULL, reuse->sk is closed and
+* reuse->migrating_sk is the socket that needs to be migrated
+* to another listening socket.  migr

[PATCH v3 bpf-next 01/11] net: Introduce net.ipv4.tcp_migrate_req.

2021-04-20 Thread Kuniyuki Iwashima
This commit adds a new sysctl option: net.ipv4.tcp_migrate_req. If this
option is enabled or an eBPF program is attached, we will be able to migrate
child sockets from a listener to another one in the same reuseport group
after close() or shutdown() syscalls.
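
As a usage sketch (equivalent to `sysctl -w net.ipv4.tcp_migrate_req=1`; the
path follows the sysctl name below, error handling kept minimal):

/* Minimal sketch: enable net.ipv4.tcp_migrate_req at runtime. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/net/ipv4/tcp_migrate_req", O_WRONLY);

	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("tcp_migrate_req");
		return 1;
	}
	close(fd);
	return 0;
}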

Signed-off-by: Kuniyuki Iwashima 
Reviewed-by: Benjamin Herrenschmidt 
---
 Documentation/networking/ip-sysctl.rst | 20 
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/sysctl_net_ipv4.c |  9 +
 3 files changed, 30 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.rst 
b/Documentation/networking/ip-sysctl.rst
index c2ecc9894fd0..8e92f9b28aad 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -732,6 +732,26 @@ tcp_syncookies - INTEGER
network connections you can set this knob to 2 to enable
unconditionally generation of syncookies.
 
+tcp_migrate_req - INTEGER
+   The incoming connection is tied to a specific listening socket when
+   the initial SYN packet is received during the three-way handshake.
+   When a listener is closed, in-flight request sockets during the
+   handshake and established sockets in the accept queue are aborted.
+
+   If the listener has SO_REUSEPORT enabled, other listeners on the
+   same port should have been able to accept such connections. This
+   option makes it possible to migrate such child sockets to another
+   listener after close() or shutdown().
+
+   Default: 0
+
+   Note that the source and destination listeners MUST have the same
+   settings at the socket API level. If different applications listen
+   on the same port, disable this option or attach the
+   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE type of eBPF program to select
+   the correct socket by bpf_sk_select_reuseport() or to cancel
+   migration by returning SK_DROP.
+
 tcp_fastopen - INTEGER
Enable TCP Fast Open (RFC7413) to send and accept data in the opening
SYN packet.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 87e1612497ea..6402d489419d 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -136,6 +136,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_syn_retries;
u8 sysctl_tcp_synack_retries;
u8 sysctl_tcp_syncookies;
+   u8 sysctl_tcp_migrate_req;
int sysctl_tcp_reordering;
u8 sysctl_tcp_retries1;
u8 sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a62934b9f15a..7bb013fcbf5f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -940,6 +940,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler   = proc_dou8vec_minmax,
},
 #endif
+   {
+   .procname   = "tcp_migrate_req",
+   .data   = &init_net.ipv4.sysctl_tcp_migrate_req,
+   .maxlen = sizeof(u8),
+   .mode   = 0644,
+   .proc_handler   = proc_dou8vec_minmax,
+   .extra1 = SYSCTL_ZERO,
+   .extra2 = SYSCTL_ONE
+   },
{
.procname   = "tcp_reordering",
.data   = &init_net.ipv4.sysctl_tcp_reordering,
-- 
2.30.2



[PATCH v3 bpf-next 08/11] bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT.

2021-04-20 Thread Kuniyuki Iwashima
We will call sock_reuseport.prog for socket migration in the next commit,
so the eBPF program has to know which listener is closing to select a new
listener.

We can currently get a unique ID for each listener in userspace by calling
bpf_map_lookup_elem() on a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.

This patch exposes the sk pointer in sk_reuseport_md so that the eBPF
program can get the same ID with BPF_FUNC_get_socket_cookie().
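
For illustration (a sketch only; migrate_map and its layout are assumptions,
not part of this patch), the new sk field lets a program key a policy map by
the closing listener's cookie:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical policy map keyed by the listener's socket cookie. */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 256);
	__type(key, __u64);	/* socket cookie */
	__type(value, __u32);	/* e.g. index of the preferred listener */
} migrate_map SEC(".maps");

SEC("sk_reuseport")
int by_cookie(struct sk_reuseport_md *reuse_md)
{
	__u64 cookie;
	__u32 *target;

	/* The cookie matches what userspace reads back from a
	 * BPF_MAP_TYPE_REUSEPORT_SOCKARRAY with bpf_map_lookup_elem().
	 */
	cookie = bpf_get_socket_cookie(reuse_md->sk);

	target = bpf_map_lookup_elem(&migrate_map, &cookie);
	if (!target)
		return SK_PASS;

	/* ... use *target with bpf_sk_select_reuseport() ... */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";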

Link: 
https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   |  1 +
 net/core/filter.c  | 10 ++
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 49371eba98ba..ab8584701334 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5299,6 +5299,7 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   __bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 #define BPF_TAG_SIZE   8
diff --git a/net/core/filter.c b/net/core/filter.c
index cae56d08a670..3d0f989f5d38 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10135,6 +10135,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
return &sk_reuseport_load_bytes_proto;
case BPF_FUNC_skb_load_bytes_relative:
return &sk_reuseport_load_bytes_relative_proto;
+   case BPF_FUNC_get_socket_cookie:
+   return &bpf_get_socket_ptr_cookie_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -10164,6 +10166,10 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
 
+   case offsetof(struct sk_reuseport_md, sk):
+   info->reg_type = ARG_PTR_TO_SOCKET;
+   return size == sizeof(__u64);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10236,6 +10242,10 @@ static u32 sk_reuseport_convert_ctx_access(enum 
bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+   case offsetof(struct sk_reuseport_md, sk):
+   SK_REUSEPORT_LOAD_FIELD(sk);
+   break;
}
 
return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 69902603012c..5fd94d632792 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5293,6 +5293,7 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   __bpf_md_ptr(struct bpf_sock *, sk);
 };
 
 #define BPF_TAG_SIZE   8
-- 
2.30.2



[PATCH v3 bpf-next 07/11] tcp: Migrate TCP_NEW_SYN_RECV requests at receiving the final ACK.

2021-04-20 Thread Kuniyuki Iwashima
This patch also changes the code to call reuseport_migrate_sock() and
reqsk_clone(), but unlike the other cases, we do not call reqsk_clone()
right after reuseport_migrate_sock().

Currently, in the receive path for TCP_NEW_SYN_RECV sockets, its listener
has three kinds of refcnt:

  (A) for listener itself
  (B) carried by request_sock
  (C) sock_hold() in tcp_v[46]_rcv()

While processing the req, (A) may disappear by close(listener). Also, (B)
can disappear by accept(listener) once we put the req into the accept
queue. So, we have to hold another refcnt (C) for the listener to prevent
use-after-free.

For socket migration, we call reuseport_migrate_sock() to select a listener
with (A) and to increment the new listener's refcnt in tcp_v[46]_rcv().
This refcnt corresponds to (C) and is cleaned up later in tcp_v[46]_rcv().
Thus we have to take another refcnt (B) for the newly cloned request_sock.

In inet_csk_complete_hashdance(), we hold the count (B), clone the req, and
try to put the new req into the accept queue. By migrating req after
winning the "own_req" race, we can avoid such a worst situation:

  CPU 1 looks up req1
  CPU 2 looks up req1, unhashes it, then CPU 1 loses the race
  CPU 3 looks up req2, unhashes it, then CPU 2 loses the race
  ...

Signed-off-by: Kuniyuki Iwashima 
---
 net/ipv4/inet_connection_sock.c | 30 +-
 net/ipv4/tcp_ipv4.c | 20 ++--
 net/ipv6/tcp_ipv6.c | 14 +++---
 3 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index dc984d1f352e..2f1e5897137b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1072,10 +1072,38 @@ struct sock *inet_csk_complete_hashdance(struct sock 
*sk, struct sock *child,
if (own_req) {
inet_csk_reqsk_queue_drop(sk, req);
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
-   if (inet_csk_reqsk_queue_add(sk, req, child))
+
+   if (sk != req->rsk_listener) {
+   /* another listening sk has been selected,
+* migrate the req to it.
+*/
+   struct request_sock *nreq;
+
+   /* hold a refcnt for the nreq->rsk_listener
+* which is assigned in reqsk_clone()
+*/
+   sock_hold(sk);
+   nreq = reqsk_clone(req, sk);
+   if (!nreq) {
+   inet_child_forget(sk, req, child);
+   goto child_put;
+   }
+
+   refcount_set(&nreq->rsk_refcnt, 1);
+   if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+   reqsk_migrate_reset(req);
+   reqsk_put(req);
+   return child;
+   }
+
+   reqsk_migrate_reset(nreq);
+   __reqsk_free(nreq);
+   } else if (inet_csk_reqsk_queue_add(sk, req, child)) {
return child;
+   }
}
/* Too bad, another child took ownership of the request, undo. */
+child_put:
bh_unlock_sock(child);
sock_put(child);
return NULL;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 312184cead57..214495d02143 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2000,13 +2000,21 @@ int tcp_v4_rcv(struct sk_buff *skb)
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
-   inet_csk_reqsk_queue_drop_and_put(sk, req);
-   goto lookup;
+   nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+   if (!nsk) {
+   inet_csk_reqsk_queue_drop_and_put(sk, req);
+   goto lookup;
+   }
+   sk = nsk;
+   /* reuseport_migrate_sock() has already held one sk_refcnt
+* before returning.
+*/
+   } else {
+   /* We own a reference on the listener, increase it again
+* as we might lose it too soon.
+*/
+   sock_hold(sk);
}
-   /* We own a reference on the listener, increase it again
-* as we might lose it too soon.
-*/
-   sock_hold(sk);
refcounted = true;
nsk = NULL;
if (!tcp_filter(sk, skb)) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5f47c0b6e3de..aea8e75d3fed 100644
--- a/net/ipv6/tcp_ipv6.c

[PATCH v3 bpf-next 06/11] tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs.

2021-04-20 Thread Kuniyuki Iwashima
As with the preceding patch, this patch changes reqsk_timer_handler() to
call reuseport_migrate_sock() and reqsk_clone() to migrate in-flight
requests at retransmitting SYN+ACKs. If we can select a new listener and
clone the request, we resume setting the SYN+ACK timer for the new req. If
we can set the timer, we call inet_ehash_insert() to unhash the old req and
put the new req into ehash.

The noteworthy point here is that by unhashing the old req, another CPU
processing it may lose the "own_req" race in tcp_v[46]_syn_recv_sock() and
drop the final ACK packet. However, the new timer will recover this
situation.

Signed-off-by: Kuniyuki Iwashima 
---
 net/core/request_sock.c |  1 +
 net/ipv4/inet_connection_sock.c | 76 +
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 82cf9fbe2668..08c37ecd923b 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -151,6 +151,7 @@ struct request_sock *reqsk_clone(struct request_sock *req, 
struct sock *sk)
memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
   req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
 
+   sk_node_init(&nreq_sk->sk_node);
nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
 #ifdef CONFIG_XPS
nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 851992405826..dc984d1f352e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,10 +695,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct 
request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static void reqsk_queue_migrated(struct request_sock_queue *queue,
+const struct request_sock *req)
+{
+   if (req->num_timeout == 0)
+   atomic_inc(&queue->young);
+   atomic_inc(&queue->qlen);
+}
+
 static void reqsk_migrate_reset(struct request_sock *req)
 {
+   req->saved_syn = NULL;
+   inet_rsk(req)->ireq_opt = NULL;
 #if IS_ENABLED(CONFIG_IPV6)
-   inet_rsk(req)->ipv6_opt = NULL;
+   inet_rsk(req)->pktopts = NULL;
 #endif
 }
 
@@ -741,16 +751,37 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
 
 static void reqsk_timer_handler(struct timer_list *t)
 {
-   struct request_sock *req = from_timer(req, t, rsk_timer);
-   struct sock *sk_listener = req->rsk_listener;
-   struct net *net = sock_net(sk_listener);
-   struct inet_connection_sock *icsk = inet_csk(sk_listener);
-   struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+   struct request_sock *req = from_timer(req, t, rsk_timer), *nreq = NULL, *oreq = req;
+   struct sock *sk_listener = req->rsk_listener, *nsk = NULL;
+   struct inet_connection_sock *icsk;
+   struct request_sock_queue *queue;
+   struct net *net;
int max_syn_ack_retries, qlen, expire = 0, resend = 0;
 
-   if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
-   goto drop;
+   if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+   nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
+   if (!nsk)
+   goto drop;
+
+   nreq = reqsk_clone(req, nsk);
+   if (!nreq)
+   goto drop;
+
+   /* The new timer for the cloned req can decrease the 2
+* by calling inet_csk_reqsk_queue_drop_and_put(), so
+* hold another count to prevent use-after-free and
+* call reqsk_put() just before return.
+*/
+   refcount_set(&nreq->rsk_refcnt, 2 + 1);
+   timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+   reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);
+
+   req = nreq;
+   sk_listener = nsk;
+   }
 
+   icsk = inet_csk(sk_listener);
+   net = sock_net(sk_listener);
max_syn_ack_retries = icsk->icsk_syn_retries ? : 
net->ipv4.sysctl_tcp_synack_retries;
/* Normally all the openreqs are young and become mature
 * (i.e. converted to established socket) for first timeout.
@@ -769,6 +800,7 @@ static void reqsk_timer_handler(struct timer_list *t)
 * embrions; and abort old ones without pity, if old
 * ones are about to clog our table.
 */
+   queue = &icsk->icsk_accept_queue;
qlen = reqsk_queue_len(queue);
if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
int young = reqsk_queue_len_young(queue) << 1;
@@ -793,10 +825,36 @@ static void reqsk_timer_handler(struct timer_list *t)
atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req

[PATCH v3 bpf-next 05/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2021-04-20 Thread Kuniyuki Iwashima
When we call close() or shutdown() for listening sockets, each child socket
in the accept queue is freed at inet_csk_listen_stop(). If we can get a
new listener by reuseport_migrate_sock() and clone the request by
reqsk_clone(), we try to add it into the new listener's accept queue by
inet_csk_reqsk_queue_add(). If it fails, we have to call __reqsk_free() to
call sock_put() for its listener and free the cloned request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
ireq_opt/pktopts in struct inet_request_sock to NULL, but ipv6_opt can be
non-NULL. So, we have to set ipv6_opt of the old request to NULL to avoid a
double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Link: 
https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmo...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/request_sock.h  |  2 ++
 net/core/request_sock.c | 37 +
 net/ipv4/inet_connection_sock.c | 31 ++-
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..c6d6cfd3c93b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -190,6 +190,8 @@ void reqsk_queue_alloc(struct request_sock_queue *queue);
 void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
   bool reset);
 
+struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk);
+
 static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
 {
return READ_ONCE(queue->rskq_accept_head) == NULL;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index f35c2e998406..82cf9fbe2668 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -130,3 +130,40 @@ void reqsk_fastopen_remove(struct sock *sk, struct 
request_sock *req,
 out:
spin_unlock_bh(&fastopenq->lock);
 }
+
+struct request_sock *reqsk_clone(struct request_sock *req, struct sock *sk)
+{
+   struct sock *req_sk, *nreq_sk;
+   struct request_sock *nreq;
+
+   nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+   if (!nreq) {
+   /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+   sock_put(sk);
+   return NULL;
+   }
+
+   req_sk = req_to_sk(req);
+   nreq_sk = req_to_sk(nreq);
+
+   memcpy(nreq_sk, req_sk,
+  offsetof(struct sock, sk_dontcopy_begin));
+   memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+  req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+   nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+   nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+   nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+   nreq->rsk_listener = sk;
+
+   /* We need not acquire fastopenq->lock
+* because the child socket is locked in inet_csk_listen_stop().
+*/
+   if (tcp_rsk(nreq)->tfo_listener)
+   rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+   return nreq;
+}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fa806e9167ec..851992405826 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -695,6 +695,13 @@ int inet_rtx_syn_ack(const struct sock *parent, struct 
request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+   inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
 /* return true if req was found in the ehash table */
 static bool reqsk_queue_unlink(struct request_sock *req)
 {
@@ -1036,14 +1043,36 @@ void inet_csk_listen_stop(struct sock *sk)
 * of the variants now. --ANK
 */
while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
-   struct sock *child = req->sk;
+   struct sock *child = req->sk, *nsk;
+   struct request_sock *nreq;
 
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
 
+   nsk = reuseport_migrate_sock(sk, child, NULL);
+   if (nsk) {
+   nreq = reqsk_clone(req, nsk);
+   if (nreq) {
+   refcount_set(&nreq->rsk_refcnt, 1);
+
+   if (inet_csk_reqsk_queue_add(nsk, nreq, child)) 
{
+   

[PATCH v3 bpf-next 04/11] tcp: Add reuseport_migrate_sock() to select a new listener.

2021-04-20 Thread Kuniyuki Iwashima
reuseport_migrate_sock() does the same check done in
reuseport_stop_listen_sock(). If the reuseport group is capable of
migration, reuseport_migrate_sock() selects a new listener by the child
socket hash and increments the listener's sk_refcnt beforehand. Thus, if we
fail in the migration, we have to decrement it later.

We will support migration by eBPF in the later commits.
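
For reference, the hash-based pick below maps the child's sk_hash onto a
starting index with reciprocal_scale(); a self-contained sketch of that
scaling (mirroring the kernel helper, shown only to make the selection loop
easier to read):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the kernel's reciprocal_scale(): maps a 32-bit hash uniformly
 * onto [0, ep_ro) without a division.  reuseport_select_sock_by_hash()
 * uses it to turn the child's sk_hash into a starting index in socks[].
 */
static uint32_t reciprocal_scale_sketch(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	/* e.g. a hash of 0x9e3779b9 over 5 listeners lands on index 3 */
	printf("%u\n", reciprocal_scale_sketch(0x9e3779b9u, 5));
	return 0;
}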

Signed-off-by: Kuniyuki Iwashima 
Signed-off-by: Martin KaFai Lau 
---
 include/net/sock_reuseport.h |  3 ++
 net/core/sock_reuseport.c| 78 +---
 2 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 1333d0cddfbc..473b0b0fa4ab 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -37,6 +37,9 @@ extern struct sock *reuseport_select_sock(struct sock *sk,
  u32 hash,
  struct sk_buff *skb,
  int hdr_len);
+struct sock *reuseport_migrate_sock(struct sock *sk,
+   struct sock *migrating_sk,
+   struct sk_buff *skb);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index d5fb0ad12e87..a2bca39ec0e3 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -44,7 +44,7 @@ static void __reuseport_add_sock(struct sock *sk,
 struct sock_reuseport *reuse)
 {
reuse->socks[reuse->num_socks] = sk;
-   /* paired with smp_rmb() in reuseport_select_sock() */
+   /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
smp_wmb();
reuse->num_socks++;
 }
@@ -435,6 +435,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport 
*reuse, u16 socks,
return reuse->socks[index];
 }
 
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+ u32 hash, u16 num_socks)
+{
+   int i, j;
+
+   i = j = reciprocal_scale(hash, num_socks);
+   while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+   i++;
+   if (i >= num_socks)
+   i = 0;
+   if (i == j)
+   return NULL;
+   }
+
+   return reuse->socks[i];
+}
+
 /**
  *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
  *  @sk: First socket in the group.
@@ -478,19 +495,8 @@ struct sock *reuseport_select_sock(struct sock *sk,
 
 select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
-   if (!sk2) {
-   int i, j;
-
-   i = j = reciprocal_scale(hash, socks);
-   while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
-   i++;
-   if (i >= socks)
-   i = 0;
-   if (i == j)
-   goto out;
-   }
-   sk2 = reuse->socks[i];
-   }
+   if (!sk2)
+   sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
}
 
 out:
@@ -499,6 +505,50 @@ struct sock *reuseport_select_sock(struct sock *sk,
 }
 EXPORT_SYMBOL(reuseport_select_sock);
 
+/**
+ *  reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ *  @sk: close()ed or shutdown()ed socket in the group.
+ *  @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ *NEW_SYN_RECV request socket during 3WHS.
+ *  @skb: skb to run through BPF filter.
+ *  Returns a socket (with sk_refcnt +1) that should accept the child socket
+ *  (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+   struct sock *migrating_sk,
+   struct sk_buff *skb)
+{
+   struct sock_reuseport *reuse;
+   struct sock *nsk = NULL;
+   u16 socks;
+   u32 hash;
+
+   rcu_read_lock();
+
+   reuse = rcu_dereference(sk->sk_reuseport_cb);
+   if (!reuse)
+   goto out;
+
+   socks = READ_ONCE(reuse->num_socks);
+   if (unlikely(!socks))
+   goto out;
+
+   /* paired with smp_wmb() in __reuseport_add_sock() */
+   smp_rmb();
+
+   hash = migrating_sk->sk_hash;
+   if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
+   nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+   if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+   nsk = NULL;
+
+out:
+   rcu_read_unlock();
+   return nsk;
+}
+EXPORT_SYMBOL(reuseport_migrate_

[PATCH v3 bpf-next 03/11] tcp: Keep TCP_CLOSE sockets in the reuseport group.

2021-04-20 Thread Kuniyuki Iwashima
When we close a listening socket, to migrate its connections to another
listener in the same reuseport group, we have to handle two kinds of child
sockets. One is that a listening socket has a reference to, and the other
is not.

The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the
accept queue of their listening socket. So we can pop them out and push
them into another listener's queue at close() or shutdown() syscalls. On
the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the
three-way handshake and not in the accept queue. Thus, we cannot access
such sockets at close() or shutdown() syscalls. Accordingly, we have to
migrate immature sockets after their listening socket has been closed.

Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At
that time, if we could select a new listener from the same reuseport group,
no connection would be aborted. However, we cannot do that because
reuseport_detach_sock() sets sk_reuseport_cb to NULL and forbids closed
sockets from accessing the reuseport group.

This patch allows TCP_CLOSE sockets to remain in the reuseport group and
access it while any child socket references them. The point is that
reuseport_detach_sock() was called twice from inet_unhash() and
sk_destruct(). This patch replaces the first reuseport_detach_sock() with
reuseport_stop_listen_sock(), which checks if the reuseport group is
capable of migration. If capable, it decrements num_socks, moves the socket
backwards in socks[] and increments num_closed_socks. When all connections
are migrated, sk_destruct() calls reuseport_detach_sock() to remove the
socket from socks[], decrement num_closed_socks, and set sk_reuseport_cb to
NULL.

By this change, closed or shutdown()ed sockets can keep sk_reuseport_cb.
Consequently, calling listen() after shutdown() can cause EADDRINUSE or
EBUSY in inet_csk_bind_conflict() or reuseport_add_sock() which expects
such sockets not to have the reuseport group. Therefore, this patch also
loosens such validation rules so that a socket can listen again if it has a
reuseport group with num_closed_socks more than 0.

When such sockets listen again, we handle them in reuseport_resurrect(). If
there is an existing reuseport group (reuseport_add_sock() path), we move
the socket from the old group to the new one and free the old one if
necessary. If there is no existing group (reuseport_alloc() path), we
allocate a new reuseport group, detach sk from the old one, and free it if
necessary, not to break the current shutdown behaviour:

  - we cannot carry over the eBPF prog of shutdowned sockets
  - we cannot attach an eBPF prog to listening sockets via shutdowned
sockets

Note that when the number of sockets gets over U16_MAX, we try to detach a
closed socket randomly to make room for the new listening socket in
reuseport_grow().

Signed-off-by: Kuniyuki Iwashima 
Signed-off-by: Martin KaFai Lau 
---
 include/net/sock_reuseport.h|   1 +
 net/core/sock_reuseport.c   | 159 +++-
 net/ipv4/inet_connection_sock.c |  12 ++-
 net/ipv4/inet_hashtables.c  |   2 +-
 4 files changed, 168 insertions(+), 6 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0e558ca7afbf..1333d0cddfbc 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -32,6 +32,7 @@ extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
  bool bind_inany);
 extern void reuseport_detach_sock(struct sock *sk);
+void reuseport_stop_listen_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
  u32 hash,
  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 079bd1aca0e7..d5fb0ad12e87 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -17,6 +17,8 @@
 DEFINE_SPINLOCK(reuseport_lock);
 
 static DEFINE_IDA(reuseport_ida);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+  struct sock_reuseport *reuse, bool bind_inany);
 
 static int reuseport_sock_index(struct sock *sk,
struct sock_reuseport *reuse,
@@ -61,6 +63,29 @@ static bool __reuseport_detach_sock(struct sock *sk,
return true;
 }
 
+static void __reuseport_add_closed_sock(struct sock *sk,
+   struct sock_reuseport *reuse)
+{
+   reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
+   /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+   WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+}
+
+static bool __reuseport_detach_closed_sock(

[PATCH v3 bpf-next 02/11] tcp: Add num_closed_socks to struct sock_reuseport.

2021-04-20 Thread Kuniyuki Iwashima
As noted in the following commit, a closed listener has to hold the
reference to the reuseport group for socket migration. This patch adds a
field (num_closed_socks) to struct sock_reuseport to manage closed sockets
within the same reuseport group. Moreover, this and the following commits
introduce some helper functions to split socks[] into two sections and keep
TCP_LISTEN and TCP_CLOSE sockets in each section. Like a double-ended
queue, we will place TCP_LISTEN sockets from the front and TCP_CLOSE
sockets from the end.

  TCP_LISTEN-->   <---TCP_CLOSE
  +---+---+  ---  +---+  ---  +---+  ---  +---+
  | 0 | 1 |  ...  | i |  ...  | j |  ...  | k |
  +---+---+  ---  +---+  ---  +---+  ---  +---+

  i = num_socks - 1
  j = max_socks - num_closed_socks
  k = max_socks - 1

This patch also extends reuseport_add_sock() and reuseport_grow() to
support num_closed_socks.

Signed-off-by: Kuniyuki Iwashima 
---
 include/net/sock_reuseport.h |  5 ++-
 net/core/sock_reuseport.c| 76 +++-
 2 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 505f1e18e9bf..0e558ca7afbf 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
 struct sock_reuseport {
struct rcu_head rcu;
 
-   u16 max_socks;  /* length of socks */
-   u16 num_socks;  /* elements in socks */
+   u16 max_socks;  /* length of socks */
+   u16 num_socks;  /* elements in socks */
+   u16 num_closed_socks;   /* closed elements in 
socks */
/* The last synq overflow event timestamp of this
 * reuse->socks[] group.
 */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b065f0a103ed..079bd1aca0e7 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -18,6 +18,49 @@ DEFINE_SPINLOCK(reuseport_lock);
 
 static DEFINE_IDA(reuseport_ida);
 
+static int reuseport_sock_index(struct sock *sk,
+   struct sock_reuseport *reuse,
+   bool closed)
+{
+   int left, right;
+
+   if (!closed) {
+   left = 0;
+   right = reuse->num_socks;
+   } else {
+   left = reuse->max_socks - reuse->num_closed_socks;
+   right = reuse->max_socks;
+   }
+
+   for (; left < right; left++)
+   if (reuse->socks[left] == sk)
+   return left;
+   return -1;
+}
+
+static void __reuseport_add_sock(struct sock *sk,
+struct sock_reuseport *reuse)
+{
+   reuse->socks[reuse->num_socks] = sk;
+   /* paired with smp_rmb() in reuseport_select_sock() */
+   smp_wmb();
+   reuse->num_socks++;
+}
+
+static bool __reuseport_detach_sock(struct sock *sk,
+   struct sock_reuseport *reuse)
+{
+   int i = reuseport_sock_index(sk, reuse, false);
+
+   if (i == -1)
+   return false;
+
+   reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+   reuse->num_socks--;
+
+   return true;
+}
+
 static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
 {
unsigned int size = sizeof(struct sock_reuseport) +
@@ -72,9 +115,8 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
}
 
reuse->reuseport_id = id;
-   reuse->socks[0] = sk;
-   reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
+   __reuseport_add_sock(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
 out:
@@ -98,6 +140,7 @@ static struct sock_reuseport *reuseport_grow(struct 
sock_reuseport *reuse)
return NULL;
 
more_reuse->num_socks = reuse->num_socks;
+   more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
@@ -105,9 +148,13 @@ static struct sock_reuseport *reuseport_grow(struct 
sock_reuseport *reuse)
 
memcpy(more_reuse->socks, reuse->socks,
   reuse->num_socks * sizeof(struct sock *));
+   memcpy(more_reuse->socks +
+  (more_reuse->max_socks - more_reuse->num_closed_socks),
+  reuse->socks + reuse->num_socks,
+  reuse->num_closed_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
 
-   for (i = 0; i < reuse->num_socks; ++i)
+   for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,

[PATCH v3 bpf-next 00/11] Socket migration for SO_REUSEPORT.

2021-04-20 Thread Kuniyuki Iwashima
The SO_REUSEPORT option allows sockets to listen on the same port and to
accept connections evenly. However, there is a defect in the current
implementation [1]. When a SYN packet is received, the connection is tied
to a listening socket. Accordingly, when the listener is closed, in-flight
requests during the three-way handshake and child sockets in the accept
queue are dropped even if other listeners on the same port could accept
such connections.

This situation can happen when various server management tools restart
server (such as nginx) processes. For instance, when we change nginx
configurations and restart it, it spins up new workers that respect the new
configuration and closes all listeners on the old workers, resulting in the
in-flight ACKs of the 3WHS being answered with RST.

The SO_REUSEPORT option is excellent to improve scalability. On the other
hand, as a trade-off, users have to know deeply how the kernel handles SYN
packets and implement connection draining by eBPF [2]:

  1. Stop routing SYN packets to the listener by eBPF.
  2. Wait for all timers to expire to complete requests
  3. Accept connections until EAGAIN, then close the listener.

  or

  1. Start counting SYN packets and accept syscalls using the eBPF map.
  2. Stop routing SYN packets.
  3. Accept connections up to the count, then close the listener.

Either way, we cannot close a listener immediately. However, ideally,
the application need not drain the not-yet-accepted sockets because the 3WHS
and tying a connection to a listener are purely kernel behaviour. The
root cause is within the kernel, so the issue should be addressed in kernel
space and should not be visible to user space. This patchset fixes it so
that users need not take care of kernel implementation and connection
draining. With this patchset, the kernel redistributes requests and
connections from a listener to the others in the same reuseport group
at/after close or shutdown syscalls.

Although some software does connection draining, there are still merits in
migration. For some security reasons, such as replacing TLS certificates,
we may want to apply new settings as soon as possible and/or we may not be
able to wait for connection draining. The sockets in the accept queue have
not started application sessions yet. So, if we do not drain such sockets,
they can be handled by the newer listeners and could have a longer
lifetime. It is difficult to drain all connections in every case, but we
can decrease such aborted connections by migration. In that sense,
migration is always better than draining. 

Moreover, auto-migration simplifies user space logic and also works well in
a case where we cannot modify and build a server program to implement the
workaround.

Note that the source and destination listeners MUST have the same settings
at the socket API level; otherwise, applications may face inconsistency and
cause errors. In such a case, we have to use the eBPF program to select a
specific listener or to cancel migration.

Special thanks to Martin KaFai Lau for bouncing ideas and exchanging code
snippets along the way.


Link:
 [1] The SO_REUSEPORT socket option
 https://lwn.net/Articles/542629/

 [2] Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain 
mode
 
https://lore.kernel.org/netdev/1458828813.10868.65.ca...@edumazet-glaptop3.roam.corp.google.com/


Changelog:
 v3:
  * Add sysctl back for reuseport_grow()
  * Add helper functions to manage socks[]
  * Separate migration related logic into functions: reuseport_resurrect(),
reuseport_stop_listen_sock(), reuseport_migrate_sock()
  * Clone request_sock to be migrated
  * Migrate request one by one
  * Pass child socket to eBPF prog

 v2:
 https://lore.kernel.org/netdev/20201207132456.65472-1-kun...@amazon.co.jp/
  * Do not save closed sockets in socks[]
  * Revert 607904c357c61adf20b8fd18af765e501d61a385
  * Extract inet_csk_reqsk_queue_migrate() into a single patch
  * Change the spin_lock order to avoid lockdep warning
  * Add static to __reuseport_select_sock
  * Use refcount_inc_not_zero() in reuseport_select_migrated_sock()
  * Set the default attach type in bpf_prog_load_check_attach()
  * Define new proto of BPF_FUNC_get_socket_cookie
  * Fix test to be compiled successfully
  * Update commit messages

 v1:
 https://lore.kernel.org/netdev/20201201144418.35045-1-kun...@amazon.co.jp/
  * Remove the sysctl option
  * Enable migration if an eBPF program is not attached
  * Add expected_attach_type to check if eBPF program can migrate sockets
  * Add a field to tell migration type to eBPF program
  * Support BPF_FUNC_get_socket_cookie to get the cookie of sk
  * Allocate an empty skb if skb is NULL
  * Pass req_to_sk(req)->sk_hash because listener's hash is zero
  * Update commit messages and coverletter

 RFC:
 https://lore.kernel.org/netdev/20201117094023.3685-1-kun...@amazon.co.jp/


Kuniyuki Iwashima (11):
  net: Introduce net.ipv4.tcp_migrate_req.
  tcp: Add num_closed_socks to str

[PATCH v5 net-next] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-28 Thread Kuniyuki Iwashima
The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). On the
other hand, the original commit had already put sk_tx_queue_clear() in
sk_prot_alloc(): the callee of sk_alloc() and sk_clone_lock(). Thus
sk_tx_queue_clear() is called twice in each path.

If we remove sk_tx_queue_clear() in sk_alloc() and sk_clone_lock(), it
currently works well because (i) sk_tx_queue_mapping is defined between
sk_dontcopy_begin and sk_dontcopy_end, and (ii) sock_copy() called after
sk_prot_alloc() in sk_clone_lock() does not overwrite sk_tx_queue_mapping.
However, if we move sk_tx_queue_mapping out of the no copy area, it
introduces a bug unintentionally.

Therefore, this patch adds a compile-time check to take care of the order
of sock_copy() and sk_tx_queue_clear() and removes sk_tx_queue_clear() from
sk_prot_alloc() so that it only does the allocation and its callers
initialize fields.
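
As a standalone illustration of the same ordering check (not kernel code; the
struct and field names are hypothetical stand-ins), the idea can be expressed
with offsetof() and a static assertion:

#include <assert.h>
#include <stddef.h>

/* Hypothetical struct standing in for struct sock: the assertion fails to
 * compile if tx_queue_mapping is moved outside the window that
 * sock_copy()-style code skips.
 */
struct example_sock {
	int begin_marker;	/* analogue of sk_dontcopy_begin */
	int tx_queue_mapping;	/* analogue of sk_tx_queue_mapping */
	int end_marker;		/* analogue of sk_dontcopy_end */
};

static_assert(offsetof(struct example_sock, tx_queue_mapping) >=
	      offsetof(struct example_sock, begin_marker) &&
	      offsetof(struct example_sock, tx_queue_mapping) <
	      offsetof(struct example_sock, end_marker),
	      "tx_queue_mapping must stay inside the no-copy window");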

CC: Tariq Toukan 
CC: Boris Pismenny 
Signed-off-by: Kuniyuki Iwashima 
Acked-by: Tariq Toukan 
---
v5:
* Move the changelog after the --- separator

v4: https://lore.kernel.org/netdev/20210128124229.78315-1-kun...@amazon.co.jp/
* Fix typo in the changelog (runtime -> compile-time)

v3: https://lore.kernel.org/netdev/20210128021905.57471-1-kun...@amazon.co.jp/
* Remove Fixes: tag
* Add BUILD_BUG_ON
* Remove sk_tx_queue_clear() from sk_prot_alloc()
  instead of sk_alloc() and sk_clone_lock()

v2: https://lore.kernel.org/netdev/20210127132215.10842-1-kun...@amazon.co.jp/
* Remove Reviewed-by: tag

v1: https://lore.kernel.org/netdev/20210127125018.7059-1-kun...@amazon.co.jp/

 net/core/sock.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..cfbd62a5e079 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1657,6 +1657,16 @@ static void sock_copy(struct sock *nsk, const struct 
sock *osk)
 #ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
 #endif
+
+   /* If we move sk_tx_queue_mapping out of the private section,
+* we must check if sk_tx_queue_clear() is called after
+* sock_copy() in sk_clone_lock().
+*/
+   BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+offsetof(struct sock, sk_dontcopy_begin) ||
+offsetof(struct sock, sk_tx_queue_mapping) >=
+offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
 
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
@@ -1690,7 +1700,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, 
gfp_t priority,
 
if (!try_module_get(prot->owner))
goto out_free_sec;
-   sk_tx_queue_clear(sk);
}
 
return sk;
-- 
2.17.2 (Apple Git-113)



Re: [PATCH v4 net-next] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-28 Thread Kuniyuki Iwashima
From:   Tariq Toukan 
Date:   Thu, 28 Jan 2021 15:09:51 +0200
> On 1/28/2021 2:42 PM, Kuniyuki Iwashima wrote:
> > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
> > it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
> > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). On the
> > other hand, the original commit had already put sk_tx_queue_clear() in
> > sk_prot_alloc(): the callee of sk_alloc() and sk_clone_lock(). Thus
> > sk_tx_queue_clear() is called twice in each path.
> > 
> > If we remove sk_tx_queue_clear() in sk_alloc() and sk_clone_lock(), it
> > currently works well because (i) sk_tx_queue_mapping is defined between
> > sk_dontcopy_begin and sk_dontcopy_end, and (ii) sock_copy() called after
> > sk_prot_alloc() in sk_clone_lock() does not overwrite sk_tx_queue_mapping.
> > However, if we move sk_tx_queue_mapping out of the no copy area, it
> > introduces a bug unintentionally.
> > 
> > Therefore, this patch adds a compile-time check to take care of the order
> > of sock_copy() and sk_tx_queue_clear() and removes sk_tx_queue_clear() from
> > sk_prot_alloc() so that it does the only allocation and its callers
> > initialize fields.
> > 
> > v4:
> > * Fix typo in the changelog (runtime -> compile-time)
> > 
> > v3: 
> > https://lore.kernel.org/netdev/20210128021905.57471-1-kun...@amazon.co.jp/
> > * Remove Fixes: tag
> > * Add BUILD_BUG_ON
> > * Remove sk_tx_queue_clear() from sk_prot_alloc()
> >instead of sk_alloc() and sk_clone_lock()
> > 
> > v2: 
> > https://lore.kernel.org/netdev/20210127132215.10842-1-kun...@amazon.co.jp/
> > * Remove Reviewed-by: tag
> > 
> > v1: 
> > https://lore.kernel.org/netdev/20210127125018.7059-1-kun...@amazon.co.jp/
> > 
> 
> Sorry for not pointing this out earlier, but shouldn't the changelog 
> come after the --- separator? Unless you want it to appear as part of 
> the commit message.
> 
> Other than that, I think now I'm fine with the patch.
> 
> Acked-by: Tariq Toukan 
> 
> Thanks,
> Tariq

Oh, I didn't know that useful behaviour, thank you!

I will respin with your Acked-by tag.


> > CC: Tariq Toukan 
> > CC: Boris Pismenny 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >   net/core/sock.c | 11 ++-
> >   1 file changed, 10 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index bbcd4b97eddd..cfbd62a5e079 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1657,6 +1657,16 @@ static void sock_copy(struct sock *nsk, const struct 
> > sock *osk)
> >   #ifdef CONFIG_SECURITY_NETWORK
> > void *sptr = nsk->sk_security;
> >   #endif
> > +
> > +   /* If we move sk_tx_queue_mapping out of the private section,
> > +* we must check if sk_tx_queue_clear() is called after
> > +* sock_copy() in sk_clone_lock().
> > +*/
> > +   BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
> > +offsetof(struct sock, sk_dontcopy_begin) ||
> > +offsetof(struct sock, sk_tx_queue_mapping) >=
> > +offsetof(struct sock, sk_dontcopy_end));
> > +
> > memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
> >   
> > memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
> > @@ -1690,7 +1700,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, 
> > gfp_t priority,
> >   
> > if (!try_module_get(prot->owner))
> > goto out_free_sec;
> > -   sk_tx_queue_clear(sk);
> > }
> >   
> > return sk;
> > 


[PATCH v4 net-next] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-28 Thread Kuniyuki Iwashima
The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). On the
other hand, the original commit had already put sk_tx_queue_clear() in
sk_prot_alloc(): the callee of sk_alloc() and sk_clone_lock(). Thus
sk_tx_queue_clear() is called twice in each path.

If we remove sk_tx_queue_clear() in sk_alloc() and sk_clone_lock(), it
currently works well because (i) sk_tx_queue_mapping is defined between
sk_dontcopy_begin and sk_dontcopy_end, and (ii) sock_copy() called after
sk_prot_alloc() in sk_clone_lock() does not overwrite sk_tx_queue_mapping.
However, if we move sk_tx_queue_mapping out of the no copy area, it
introduces a bug unintentionally.

Therefore, this patch adds a compile-time check to take care of the order
of sock_copy() and sk_tx_queue_clear() and removes sk_tx_queue_clear() from
sk_prot_alloc() so that it only does the allocation and its callers
initialize fields.

v4:
* Fix typo in the changelog (runtime -> compile-time)

v3: https://lore.kernel.org/netdev/20210128021905.57471-1-kun...@amazon.co.jp/
* Remove Fixes: tag
* Add BUILD_BUG_ON
* Remove sk_tx_queue_clear() from sk_prot_alloc()
  instead of sk_alloc() and sk_clone_lock()

v2: https://lore.kernel.org/netdev/20210127132215.10842-1-kun...@amazon.co.jp/
* Remove Reviewed-by: tag

v1: https://lore.kernel.org/netdev/20210127125018.7059-1-kun...@amazon.co.jp/

CC: Tariq Toukan 
CC: Boris Pismenny 
Signed-off-by: Kuniyuki Iwashima 
---
 net/core/sock.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..cfbd62a5e079 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1657,6 +1657,16 @@ static void sock_copy(struct sock *nsk, const struct 
sock *osk)
 #ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
 #endif
+
+   /* If we move sk_tx_queue_mapping out of the private section,
+* we must check if sk_tx_queue_clear() is called after
+* sock_copy() in sk_clone_lock().
+*/
+   BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+offsetof(struct sock, sk_dontcopy_begin) ||
+offsetof(struct sock, sk_tx_queue_mapping) >=
+offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
 
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
@@ -1690,7 +1700,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, 
gfp_t priority,
 
if (!try_module_get(prot->owner))
goto out_free_sec;
-   sk_tx_queue_clear(sk);
}
 
return sk;
-- 
2.17.2 (Apple Git-113)



Re: [PATCH v3 net-next] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-28 Thread Kuniyuki Iwashima
From:   Tariq Toukan 
Date:   Thu, 28 Jan 2021 13:07:26 +0200
> On 1/28/2021 4:19 AM, Kuniyuki Iwashima wrote:
> > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
> > it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
> > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). On the
> > other hand, the original commit had already put sk_tx_queue_clear() in
> > sk_prot_alloc(): the callee of sk_alloc() and sk_clone_lock(). Thus
> > sk_tx_queue_clear() is called twice in each path.
> > 
> > If we remove sk_tx_queue_clear() in sk_alloc() and sk_clone_lock(), it
> > currently works well because (i) sk_tx_queue_mapping is defined between
> > sk_dontcopy_begin and sk_dontcopy_end, and (ii) sock_copy() called after
> > sk_prot_alloc() in sk_clone_lock() does not overwrite sk_tx_queue_mapping.
> > However, if we move sk_tx_queue_mapping out of the no copy area, it
> > introduces a bug unintentionally.
> > 
> > Therefore, this patch adds a runtime 
> 
> compile-time

Oh, shame on me...
I'll fix it in the next spin.

Thank you,
Kuniyuki


> > check to take care of the order of
> > sock_copy() and sk_tx_queue_clear() and removes sk_tx_queue_clear() from
> > sk_prot_alloc() so that it does the only allocation and its callers
> > initialize fields.
> > 
> > v3:
> > * Remove Fixes: tag
> > * Add BUILD_BUG_ON
> > * Remove sk_tx_queue_clear() from sk_prot_alloc()
> >instead of sk_alloc() and sk_clone_lock()
> > 
> > v2: 
> > https://lore.kernel.org/netdev/20210127132215.10842-1-kun...@amazon.co.jp/
> > * Remove Reviewed-by: tag
> > 
> > v1: 
> > https://lore.kernel.org/netdev/20210127125018.7059-1-kun...@amazon.co.jp/
> > 
> > CC: Tariq Toukan 
> > CC: Boris Pismenny 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >   net/core/sock.c | 11 ++-
> >   1 file changed, 10 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index bbcd4b97eddd..cfbd62a5e079 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1657,6 +1657,16 @@ static void sock_copy(struct sock *nsk, const struct 
> > sock *osk)
> >   #ifdef CONFIG_SECURITY_NETWORK
> > void *sptr = nsk->sk_security;
> >   #endif
> > +
> > +   /* If we move sk_tx_queue_mapping out of the private section,
> > +* we must check if sk_tx_queue_clear() is called after
> > +* sock_copy() in sk_clone_lock().
> > +*/
> > +   BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
> > +offsetof(struct sock, sk_dontcopy_begin) ||
> > +offsetof(struct sock, sk_tx_queue_mapping) >=
> > +offsetof(struct sock, sk_dontcopy_end));
> > +
> > memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
> >   
> > memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
> > @@ -1690,7 +1700,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, 
> > gfp_t priority,
> >   
> > if (!try_module_get(prot->owner))
> > goto out_free_sec;
> > -   sk_tx_queue_clear(sk);
> > }
> >   
> > return sk;
> > 


[PATCH v3 net-next] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). On the
other hand, the original commit had already put sk_tx_queue_clear() in
sk_prot_alloc(): the callee of sk_alloc() and sk_clone_lock(). Thus
sk_tx_queue_clear() is called twice in each path.

If we remove sk_tx_queue_clear() in sk_alloc() and sk_clone_lock(), it
currently works well because (i) sk_tx_queue_mapping is defined between
sk_dontcopy_begin and sk_dontcopy_end, and (ii) sock_copy() called after
sk_prot_alloc() in sk_clone_lock() does not overwrite sk_tx_queue_mapping.
However, if we move sk_tx_queue_mapping out of the no copy area, it
introduces a bug unintentionally.

Therefore, this patch adds a runtime check to take care of the order of
sock_copy() and sk_tx_queue_clear() and removes sk_tx_queue_clear() from
sk_prot_alloc() so that it does the only allocation and its callers
initialize fields.

v3:
* Remove Fixes: tag
* Add BUILD_BUG_ON
* Remove sk_tx_queue_clear() from sk_prot_alloc()
  instead of sk_alloc() and sk_clone_lock()

v2: https://lore.kernel.org/netdev/20210127132215.10842-1-kun...@amazon.co.jp/
* Remove Reviewed-by: tag

v1: https://lore.kernel.org/netdev/20210127125018.7059-1-kun...@amazon.co.jp/

CC: Tariq Toukan 
CC: Boris Pismenny 
Signed-off-by: Kuniyuki Iwashima 
---
 net/core/sock.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..cfbd62a5e079 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1657,6 +1657,16 @@ static void sock_copy(struct sock *nsk, const struct 
sock *osk)
 #ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
 #endif
+
+   /* If we move sk_tx_queue_mapping out of the private section,
+* we must check if sk_tx_queue_clear() is called after
+* sock_copy() in sk_clone_lock().
+*/
+   BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+offsetof(struct sock, sk_dontcopy_begin) ||
+offsetof(struct sock, sk_tx_queue_mapping) >=
+offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
 
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
@@ -1690,7 +1700,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, 
gfp_t priority,
 
if (!try_module_get(prot->owner))
goto out_free_sec;
-   sk_tx_queue_clear(sk);
}
 
return sk;
-- 
2.17.2 (Apple Git-113)



Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Wed, 27 Jan 2021 19:07:51 +0100
> On Wed, Jan 27, 2021 at 6:56 PM Kuniyuki Iwashima  wrote:
> >
> > From:   Eric Dumazet 
> > Date:   Wed, 27 Jan 2021 18:34:35 +0100
> > > On Wed, Jan 27, 2021 at 6:32 PM Kuniyuki Iwashima  
> > > wrote:
> > > >
> > > > From:   Eric Dumazet 
> > > > Date:   Wed, 27 Jan 2021 18:05:24 +0100
> > > > > On Wed, Jan 27, 2021 at 5:52 PM Kuniyuki Iwashima 
> > > > >  wrote:
> > > > > >
> > > > > > From:   Eric Dumazet 
> > > > > > Date:   Wed, 27 Jan 2021 15:54:32 +0100
> > > > > > > On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima 
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > > > > > > > sk_set_socket()") removes sk_tx_queue_clear() from 
> > > > > > > > sk_set_socket() and adds
> > > > > > > > it instead in sk_alloc() and sk_clone_lock() to fix an issue 
> > > > > > > > introduced in
> > > > > > > > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). 
> > > > > > > > However,
> > > > > > > > the original commit had already put sk_tx_queue_clear() in 
> > > > > > > > sk_prot_alloc():
> > > > > > > > the callee of sk_alloc() and sk_clone_lock(). Thus 
> > > > > > > > sk_tx_queue_clear() is
> > > > > > > > called twice in each path currently.
> > > > > > >
> > > > > > > Are you sure ?
> > > > > > >
> > > > > > > I do not clearly see the sk_tx_queue_clear() call from the 
> > > > > > > cloning part.
> > > > > > >
> > > > > > > Please elaborate.
> > > > > >
> > > > > > If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called 
> > > > > > [1].
> > > > > > Also the callers of sk_prot_alloc() are only sk_alloc() and 
> > > > > > sk_clone_lock().
> > > > > > If they finally return not NULL pointer, sk_tx_queue_clear() is 
> > > > > > called in
> > > > > > each function [2][3].
> > > > > >
> > > > > > In the cloning part, sock_copy() is called after sk_prot_alloc(), 
> > > > > > but
> > > > > > skc_tx_queue_mapping is defined between skc_dontcopy_begin and
> > > > > > skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
> > > > > > overwrite skc_tx_queue_mapping, and thus we can initialize it in
> > > > > > sk_prot_alloc().
> > > > >
> > > > > That is a lot of assumptions.
> > > > >
> > > > > What guarantees do we have that skc_tx_queue_mapping will never be
> > > > > moved out of this section ?
> > > > > AFAIK it was there by accident, for cache locality reasons, that might
> > > > > change in the future as we add more stuff in socket.
> > > > >
> > > > > I feel this optimization is risky for future changes, for a code path
> > > > > that is spending thousands of cycles anyway.
> > > >
> > > > If someone try to move skc_tx_queue_mapping out of the section, should
> > > > they take care about where it is used ?
> >
> > I'm sorry if it might be misleading, I would like to mean someone/they is
> > the author of a patch to move skc_tx_queue_mapping.
> >
> >
> > > Certainly not. You hide some knowledge, without a comment or some runtime 
> > > check.
> >
> > It was my bad, I should have written about sock_copy() in the changelog.
> 
> I think you also want to add some compile time check.
> 
> BUILD_BUG_ON( skc_tx_queue_mapping is in the no copy area)
> 
> Because maintainers do not remember changelogs in their mind.

I understand.

Which is the proper place to add BUILD_BUG_ON(), sock_copy() or sk_clone_lock() ?


> >
> >
> > > You can not ask us (maintainers) to remember thousands of tricks.
> >
> > I'll keep this in mind.
> >
> >
> > > >
> > > > But I agree that we should not write error-prone code.
> > > >
> > > > Currently, sk_tx_queue_clear() is the only initi

Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Wed, 27 Jan 2021 18:34:35 +0100
> On Wed, Jan 27, 2021 at 6:32 PM Kuniyuki Iwashima  wrote:
> >
> > From:   Eric Dumazet 
> > Date:   Wed, 27 Jan 2021 18:05:24 +0100
> > > On Wed, Jan 27, 2021 at 5:52 PM Kuniyuki Iwashima  
> > > wrote:
> > > >
> > > > From:   Eric Dumazet 
> > > > Date:   Wed, 27 Jan 2021 15:54:32 +0100
> > > > > On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima 
> > > > >  wrote:
> > > > > >
> > > > > > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > > > > > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() 
> > > > > > and adds
> > > > > > it instead in sk_alloc() and sk_clone_lock() to fix an issue 
> > > > > > introduced in
> > > > > > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). 
> > > > > > However,
> > > > > > the original commit had already put sk_tx_queue_clear() in 
> > > > > > sk_prot_alloc():
> > > > > > the callee of sk_alloc() and sk_clone_lock(). Thus 
> > > > > > sk_tx_queue_clear() is
> > > > > > called twice in each path currently.
> > > > >
> > > > > Are you sure ?
> > > > >
> > > > > I do not clearly see the sk_tx_queue_clear() call from the cloning 
> > > > > part.
> > > > >
> > > > > Please elaborate.
> > > >
> > > > If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called [1].
> > > > Also the callers of sk_prot_alloc() are only sk_alloc() and 
> > > > sk_clone_lock().
> > > > If they finally return not NULL pointer, sk_tx_queue_clear() is called 
> > > > in
> > > > each function [2][3].
> > > >
> > > > In the cloning part, sock_copy() is called after sk_prot_alloc(), but
> > > > skc_tx_queue_mapping is defined between skc_dontcopy_begin and
> > > > skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
> > > > overwrite skc_tx_queue_mapping, and thus we can initialize it in
> > > > sk_prot_alloc().
> > >
> > > That is a lot of assumptions.
> > >
> > > What guarantees do we have that skc_tx_queue_mapping will never be
> > > moved out of this section ?
> > > AFAIK it was there by accident, for cache locality reasons, that might
> > > change in the future as we add more stuff in socket.
> > >
> > > I feel this optimization is risky for future changes, for a code path
> > > that is spending thousands of cycles anyway.
> >
> > If someone try to move skc_tx_queue_mapping out of the section, should
> > they take care about where it is used ?

I'm sorry if that was misleading; by someone/they I meant the author of a
patch that moves skc_tx_queue_mapping.


> Certainly not. You hide some knowledge, without a comment or some runtime 
> check.

It was my bad, I should have written about sock_copy() in the changelog.


> You can not ask us (maintainers) to remember thousands of tricks.

I'll keep this in mind.


> >
> > But I agree that we should not write error-prone code.
> >
> > Currently, sk_tx_queue_clear() is the only initialization code in
> > sk_prot_alloc(). So, does it make sense to remove sk_tx_queue_clear() in
> > sk_prot_alloc() so that it does only allocation and other fields are
> > initialized in each caller ?

Can I ask what you think about this ?


> > > >
> > > > [1] sk_prot_alloc
> > > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1693
> > > >
> > > > [2] sk_alloc
> > > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1762
> > > >
> > > > [3] sk_clone_lock
> > > > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1986
> > > >
> > > > [4] struct sock_common
> > > > https://github.com/torvalds/linux/blob/master/include/net/sock.h#L218-L240
> > > >
> > > >
> > > > > In any case, this seems to be a candidate for net-next, this is not
> > > > > fixing a bug,
> > > > > this would be an optimization at most, and potentially adding a bug.
> > > > >
> > > > > So if you resend this patch, you can mention the old commit in the 
> > > > > changelog,
> > > > &

Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Wed, 27 Jan 2021 18:05:24 +0100
> On Wed, Jan 27, 2021 at 5:52 PM Kuniyuki Iwashima  wrote:
> >
> > From:   Eric Dumazet 
> > Date:   Wed, 27 Jan 2021 15:54:32 +0100
> > > On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima  
> > > wrote:
> > > >
> > > > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > > > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and 
> > > > adds
> > > > it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced 
> > > > in
> > > > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
> > > > the original commit had already put sk_tx_queue_clear() in 
> > > > sk_prot_alloc():
> > > > the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() 
> > > > is
> > > > called twice in each path currently.
> > >
> > > Are you sure ?
> > >
> > > I do not clearly see the sk_tx_queue_clear() call from the cloning part.
> > >
> > > Please elaborate.
> >
> > If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called [1].
> > Also the callers of sk_prot_alloc() are only sk_alloc() and sk_clone_lock().
> > If they finally return not NULL pointer, sk_tx_queue_clear() is called in
> > each function [2][3].
> >
> > In the cloning part, sock_copy() is called after sk_prot_alloc(), but
> > skc_tx_queue_mapping is defined between skc_dontcopy_begin and
> > skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
> > overwrite skc_tx_queue_mapping, and thus we can initialize it in
> > sk_prot_alloc().
> 
> That is a lot of assumptions.
> 
> What guarantees do we have that skc_tx_queue_mapping will never be
> moved out of this section ?
> AFAIK it was there by accident, for cache locality reasons, that might
> change in the future as we add more stuff in socket.
> 
> I feel this optimization is risky for future changes, for a code path
> that is spending thousands of cycles anyway.

If someone tries to move skc_tx_queue_mapping out of the section, should
they take care of where it is used ?

But I agree that we should not write error-prone code.

Currently, sk_tx_queue_clear() is the only initialization code in
sk_prot_alloc(). So, does it make sense to remove sk_tx_queue_clear() from
sk_prot_alloc() so that it does only the allocation and the other fields are
initialized in each caller ?
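
A rough sketch of that split is below; the _sketch names are hypothetical and
the bodies are heavily simplified, it only illustrates where the single
sk_tx_queue_clear() per new socket would live after such a change.

---8<---
/* Hypothetical illustration of the proposal above, not real kernel code:
 * sk_prot_alloc() only allocates, and its callers (sk_alloc() and
 * sk_clone_lock()) keep the field initialization.
 */
static struct sock *sk_prot_alloc_sketch(struct proto *prot, gfp_t priority)
{
	/* allocation only -- no sk_tx_queue_clear() here any more */
	return kmalloc(prot->obj_size, priority);
}

static struct sock *sk_alloc_sketch(struct proto *prot, gfp_t priority)
{
	struct sock *sk = sk_prot_alloc_sketch(prot, priority);

	if (sk)
		sk_tx_queue_clear(sk);	/* callers initialize fields */
	return sk;
}
---8<---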


> >
> > [1] sk_prot_alloc
> > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1693
> >
> > [2] sk_alloc
> > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1762
> >
> > [3] sk_clone_lock
> > https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1986
> >
> > [4] struct sock_common
> > https://github.com/torvalds/linux/blob/master/include/net/sock.h#L218-L240
> >
> >
> > > In any case, this seems to be a candidate for net-next, this is not
> > > fixing a bug,
> > > this would be an optimization at most, and potentially adding a bug.
> > >
> > > So if you resend this patch, you can mention the old commit in the 
> > > changelog,
> > > but do not add a dubious Fixes: tag
> >
> > I see.
> >
> > I will remove the tag and resend this as a net-next candidate.
> >
> > Thank you,
> > Kuniyuki
> >
> >
> > > >
> > > > This patch removes the redundant calls of sk_tx_queue_clear() in 
> > > > sk_alloc()
> > > > and sk_clone_lock().
> > > >
> > > > Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in 
> > > > sk_set_socket()")
> > > > CC: Tariq Toukan 
> > > > CC: Boris Pismenny 
> > > > Signed-off-by: Kuniyuki Iwashima 
> > > > Reviewed-by: Amit Shah 
> > > > ---
> > > >  net/core/sock.c | 2 --
> > > >  1 file changed, 2 deletions(-)
> > > >
> > > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > > index bbcd4b97eddd..5c665ee14159 100644
> > > > --- a/net/core/sock.c
> > > > +++ b/net/core/sock.c
> > > > @@ -1759,7 +1759,6 @@ struct sock *sk_alloc(struct net *net, int 
> > > > family, gfp_t priority,
> > > > cgroup_sk_alloc(&sk->sk_cgrp_data);
> > > > sock_update_classid(&sk->sk_cgrp_data);
> > > > sock_update_netprioidx(&sk->sk_cgrp_data);
> > > > -   sk_tx_queue_clear(sk);
> > > > }
> > > >
> > > > return sk;
> > > > @@ -1983,7 +1982,6 @@ struct sock *sk_clone_lock(const struct sock *sk, 
> > > > const gfp_t priority)
> > > >  */
> > > > sk_refcnt_debug_inc(newsk);
> > > > sk_set_socket(newsk, NULL);
> > > > -   sk_tx_queue_clear(newsk);
> > > > RCU_INIT_POINTER(newsk->sk_wq, NULL);
> > > >
> > > > if (newsk->sk_prot->sockets_allocated)
> > > > --
> > > > 2.17.2 (Apple Git-113)
> > > >


Re: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Wed, 27 Jan 2021 15:54:32 +0100
> On Wed, Jan 27, 2021 at 1:50 PM Kuniyuki Iwashima  wrote:
> >
> > The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> > sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
> > it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
> > the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
> > the original commit had already put sk_tx_queue_clear() in sk_prot_alloc():
> > the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() is
> > called twice in each path currently.
> 
> Are you sure ?
> 
> I do not clearly see the sk_tx_queue_clear() call from the cloning part.
> 
> Please elaborate.

If sk is not NULL in sk_prot_alloc(), sk_tx_queue_clear() is called [1].
Also, the only callers of sk_prot_alloc() are sk_alloc() and sk_clone_lock().
If they finally return a non-NULL pointer, sk_tx_queue_clear() is called in
each function [2][3].

In the cloning part, sock_copy() is called after sk_prot_alloc(), but
skc_tx_queue_mapping is defined between skc_dontcopy_begin and
skc_dontcopy_end in struct sock_common [4]. So, sock_copy() does not
overwrite skc_tx_queue_mapping, and thus we can initialize it in
sk_prot_alloc().

[1] sk_prot_alloc
https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1693

[2] sk_alloc
https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1762

[3] sk_clone_lock
https://github.com/torvalds/linux/blob/master/net/core/sock.c#L1986

[4] struct sock_common
https://github.com/torvalds/linux/blob/master/include/net/sock.h#L218-L240
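
To make the point about the no-copy area concrete, here is an illustrative
sketch of sock_copy() (field names from [4]; the length of the second memcpy
is simplified to sizeof(struct sock), whereas the in-tree helper sizes it
from the protocol's object size):

---8<---
/* Sketch only, not the in-tree sock_copy(): both memcpy() calls skip the
 * window between sk_dontcopy_begin and sk_dontcopy_end, so a field placed
 * there -- such as skc_tx_queue_mapping today -- keeps whatever
 * sk_prot_alloc() wrote into it.
 */
static void sock_copy_sketch(struct sock *nsk, const struct sock *osk)
{
	/* copy everything that precedes the no-copy window */
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	/* copy everything that follows it (length simplified here) */
	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       sizeof(struct sock) - offsetof(struct sock, sk_dontcopy_end));
}
---8<---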


> In any case, this seems to be a candidate for net-next, this is not
> fixing a bug,
> this would be an optimization at most, and potentially adding a bug.
> 
> So if you resend this patch, you can mention the old commit in the changelog,
> but do not add a dubious Fixes: tag

I see.

I will remove the tag and resend this as a net-next candidate.

Thank you,
Kuniyuki


> >
> > This patch removes the redundant calls of sk_tx_queue_clear() in sk_alloc()
> > and sk_clone_lock().
> >
> > Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in 
> > sk_set_socket()")
> > CC: Tariq Toukan 
> > CC: Boris Pismenny 
> > Signed-off-by: Kuniyuki Iwashima 
> > Reviewed-by: Amit Shah 
> > ---
> >  net/core/sock.c | 2 --
> >  1 file changed, 2 deletions(-)
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index bbcd4b97eddd..5c665ee14159 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1759,7 +1759,6 @@ struct sock *sk_alloc(struct net *net, int family, 
> > gfp_t priority,
> > cgroup_sk_alloc(&sk->sk_cgrp_data);
> > sock_update_classid(&sk->sk_cgrp_data);
> > sock_update_netprioidx(&sk->sk_cgrp_data);
> > -   sk_tx_queue_clear(sk);
> > }
> >
> > return sk;
> > @@ -1983,7 +1982,6 @@ struct sock *sk_clone_lock(const struct sock *sk, 
> > const gfp_t priority)
> >  */
> > sk_refcnt_debug_inc(newsk);
> > sk_set_socket(newsk, NULL);
> > -   sk_tx_queue_clear(newsk);
> > RCU_INIT_POINTER(newsk->sk_wq, NULL);
> >
> > if (newsk->sk_prot->sockets_allocated)
> > --
> > 2.17.2 (Apple Git-113)
> >


RE: [PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
From:   Kuniyuki Iwashima 
Date:   Wed, 27 Jan 2021 21:50:18 +0900
> The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
> sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
> it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
> the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
> the original commit had already put sk_tx_queue_clear() in sk_prot_alloc():
> the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() is
> called twice in each path currently.
> 
> This patch removes the redundant calls of sk_tx_queue_clear() in sk_alloc()
> and sk_clone_lock().
> 
> Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in sk_set_socket()")
> CC: Tariq Toukan 
> CC: Boris Pismenny 
> Signed-off-by: Kuniyuki Iwashima 
> Reviewed-by: Amit Shah 

I'm sorry, I have respun the v2 patch.
So, please ignore v1.

v2: https://lore.kernel.org/netdev/20210127132215.10842-1-kun...@amazon.co.jp/

Best regards,
Kuniyuki


[PATCH v2 net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
the original commit had already put sk_tx_queue_clear() in sk_prot_alloc():
the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() is
called twice in each path currently.

This patch removes the redundant calls of sk_tx_queue_clear() in sk_alloc()
and sk_clone_lock().

Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in sk_set_socket()")
CC: Tariq Toukan 
CC: Boris Pismenny 
Signed-off-by: Kuniyuki Iwashima 
---
 net/core/sock.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..5c665ee14159 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1759,7 +1759,6 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t 
priority,
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
-   sk_tx_queue_clear(sk);
}
 
return sk;
@@ -1983,7 +1982,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
 */
sk_refcnt_debug_inc(newsk);
sk_set_socket(newsk, NULL);
-   sk_tx_queue_clear(newsk);
RCU_INIT_POINTER(newsk->sk_wq, NULL);
 
if (newsk->sk_prot->sockets_allocated)
-- 
2.17.2 (Apple Git-113)



[PATCH net] net: Remove redundant calls of sk_tx_queue_clear().

2021-01-27 Thread Kuniyuki Iwashima
The commit 41b14fb8724d ("net: Do not clear the sock TX queue in
sk_set_socket()") removes sk_tx_queue_clear() from sk_set_socket() and adds
it instead in sk_alloc() and sk_clone_lock() to fix an issue introduced in
the commit e022f0b4a03f ("net: Introduce sk_tx_queue_mapping"). However,
the original commit had already put sk_tx_queue_clear() in sk_prot_alloc():
the callee of sk_alloc() and sk_clone_lock(). Thus sk_tx_queue_clear() is
called twice in each path currently.

This patch removes the redundant calls of sk_tx_queue_clear() in sk_alloc()
and sk_clone_lock().

Fixes: 41b14fb8724d ("net: Do not clear the sock TX queue in sk_set_socket()")
CC: Tariq Toukan 
CC: Boris Pismenny 
Signed-off-by: Kuniyuki Iwashima 
Reviewed-by: Amit Shah 
---
 net/core/sock.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..5c665ee14159 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1759,7 +1759,6 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t 
priority,
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
-   sk_tx_queue_clear(sk);
}
 
return sk;
@@ -1983,7 +1982,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
 */
sk_refcnt_debug_inc(newsk);
sk_set_socket(newsk, NULL);
-   sk_tx_queue_clear(newsk);
RCU_INIT_POINTER(newsk->sk_wq, NULL);
 
if (newsk->sk_prot->sockets_allocated)
-- 
2.17.2 (Apple Git-113)



[PATCH net] tcp: Fix potential use-after-free due to double kfree().

2021-01-17 Thread Kuniyuki Iwashima
Receiving ACK with a valid SYN cookie, cookie_v4_check() allocates struct
request_sock and then can allocate inet_rsk(req)->ireq_opt. After that,
tcp_v4_syn_recv_sock() allocates struct sock and copies ireq_opt to
inet_sk(sk)->inet_opt. Normally, tcp_v4_syn_recv_sock() inserts the full
socket into ehash and sets ireq_opt to NULL. Otherwise,
tcp_v4_syn_recv_sock() has to reset inet_opt to NULL and free the full
socket.

The commit 01770a1661657 ("tcp: fix race condition when creating child
sockets from syncookies") added a new path, in which more than one core
creates full sockets for the same SYN cookie. Currently, the core which
loses the race frees the full socket without resetting inet_opt, so both
sock_put() and reqsk_put() call kfree() for the same memory:

  sock_put
sk_free
  __sk_free
sk_destruct
  __sk_destruct
sk->sk_destruct/inet_sock_destruct
  kfree(rcu_dereference_protected(inet->inet_opt, 1));

  reqsk_put
reqsk_free
  __reqsk_free
req->rsk_ops->destructor/tcp_v4_reqsk_destructor
  kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));

Calling kmalloc() between the double kfree() can lead to use-after-free, so
this patch fixes it by setting inet_opt to NULL before sock_put().

As a side note, this kind of issue does not happen for IPv6. This is
because tcp_v6_syn_recv_sock() clones both ipv6_opt and pktopts which
correspond to ireq_opt in IPv4.

Fixes: 01770a166165 ("tcp: fix race condition when creating child sockets from 
syncookies")
CC: Ricardo Dias 
Signed-off-by: Kuniyuki Iwashima 
Reviewed-by: Benjamin Herrenschmidt 
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 58207c7769d0..87eb614dab27 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1595,6 +1595,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, 
struct sk_buff *skb,
tcp_move_syn(newtp, req);
ireq->ireq_opt = NULL;
} else {
+   newinet->inet_opt = NULL;
+
if (!req_unhash && found_dup_sk) {
/* This code path should only be executed in the
 * syncookie case only
@@ -1602,8 +1604,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, 
struct sk_buff *skb,
bh_unlock_sock(newsk);
sock_put(newsk);
newsk = NULL;
-   } else {
-   newinet->inet_opt = NULL;
}
}
return newsk;
-- 
2.17.2 (Apple Git-113)



Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-16 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Mon, 14 Dec 2020 18:58:37 -0800
> On Tue, Dec 15, 2020 at 02:03:13AM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Thu, 10 Dec 2020 10:49:15 -0800
> > > On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:
> > > > From:   Martin KaFai Lau 
> > > > Date:   Wed, 9 Dec 2020 16:07:07 -0800
> > > > > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> > > > > > This patch renames reuseport_select_sock() to 
> > > > > > __reuseport_select_sock() and
> > > > > > adds two wrapper function of it to pass the migration type defined 
> > > > > > in the
> > > > > > previous commit.
> > > > > > 
> > > > > >   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
> > > > > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> > > > > > 
> > > > > > As mentioned before, we have to select a new listener for 
> > > > > > TCP_NEW_SYN_RECV
> > > > > > requests at receiving the final ACK or sending a SYN+ACK. 
> > > > > > Therefore, this
> > > > > > patch also changes the code to call 
> > > > > > reuseport_select_migrated_sock() even
> > > > > > if the listening socket is TCP_CLOSE. If we can pick out a 
> > > > > > listening socket
> > > > > > from the reuseport group, we rewrite request_sock.rsk_listener and 
> > > > > > resume
> > > > > > processing the request.
> > > > > > 
> > > > > > Reviewed-by: Benjamin Herrenschmidt 
> > > > > > Signed-off-by: Kuniyuki Iwashima 
> > > > > > ---
> > > > > >  include/net/inet_connection_sock.h | 12 +++
> > > > > >  include/net/request_sock.h | 13 
> > > > > >  include/net/sock_reuseport.h   |  8 +++
> > > > > >  net/core/sock_reuseport.c  | 34 
> > > > > > --
> > > > > >  net/ipv4/inet_connection_sock.c| 13 ++--
> > > > > >  net/ipv4/tcp_ipv4.c|  9 ++--
> > > > > >  net/ipv6/tcp_ipv6.c|  9 ++--
> > > > > >  7 files changed, 81 insertions(+), 17 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/net/inet_connection_sock.h 
> > > > > > b/include/net/inet_connection_sock.h
> > > > > > index 2ea2d743f8fc..1e0958f5eb21 100644
> > > > > > --- a/include/net/inet_connection_sock.h
> > > > > > +++ b/include/net/inet_connection_sock.h
> > > > > > @@ -272,6 +272,18 @@ static inline void 
> > > > > > inet_csk_reqsk_queue_added(struct sock *sk)
> > > > > > reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
> > > > > >  }
> > > > > >  
> > > > > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> > > > > > +struct sock *nsk,
> > > > > > +struct request_sock 
> > > > > > *req)
> > > > > > +{
> > > > > > +   reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> > > > > > +    &inet_csk(nsk)->icsk_accept_queue,
> > > > > > +req);
> > > > > > +   sock_put(sk);
> > > > > not sure if it is safe to do here.
> > > > > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
> > > > > to req->rsk_listener such that sock_hold(req->rsk_listener) is
> > > > > safe because its sk_refcnt is not zero.
> > > > 
> > > > I think it is safe to call sock_put() for the old listener here.
> > > > 
> > > > Without this patchset, at receiving the final ACK or retransmitting
> > > > SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done
> > > > by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().
> > > Note that in your example (final ACK), sock_put(req->rsk_listener) is
> > > _only_ called when reqsk_put() can get 
> > > refcount_dec_and_test(&req->rsk_refcnt)
> > > to reach zero.
> > > 
> > >

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-14 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Thu, 10 Dec 2020 11:33:40 -0800
> On Thu, Dec 10, 2020 at 02:58:10PM +0900, Kuniyuki Iwashima wrote:
> 
> [ ... ]
> 
> > > > I've implemented one-by-one migration only for the accept queue for now.
> > > > In addition to the concern about TFO queue,
> > > You meant this queue:  queue->fastopenq.rskq_rst_head?
> > 
> > Yes.
> > 
> > 
> > > Can "req" be passed?
> > > I did not look up the lock/race in details for that though.
> > 
> > I think if we rewrite freeing TFO requests part like one of accept queue
> > using reqsk_queue_remove(), we can also migrate them.
> > 
> > In this patchset, selecting a listener for accept queue, the TFO queue of
> > the same listener is also migrated to another listener in order to prevent
> > TFO spoofing attack.
> > 
> > If the request in the accept queue is migrated one by one, I am wondering
> > which should the request in TFO queue be migrated to prevent attack or
> > freed.
> > 
> > I think user need not know about keeping such requests in kernel to prevent
> > attacks, so passing them to eBPF prog is confusing. But, redistributing
> > them randomly without user's intention can make some irrelevant listeners
> > unnecessarily drop new TFO requests, so this is also bad. Moreover, freeing
> > such requests seems not so good in the point of security.
> The current behavior (during process restart) is also not carrying this
> security queue.  Not carrying them in this patch will make it
> less secure than the current behavior during process restart?

No, I thought I could make it more secure.


> Do you need it now or it is something that can be considered for later
> without changing uapi bpf.h?

No, I do not need it for any other reason, so I will simply free the
requests in the TFO queue.
Thank you.


> > > > ---8<---
> > > > diff --git a/net/ipv4/inet_connection_sock.c 
> > > > b/net/ipv4/inet_connection_sock.c
> > > > index a82fd4c912be..d0ddd3cb988b 100644
> > > > --- a/net/ipv4/inet_connection_sock.c
> > > > +++ b/net/ipv4/inet_connection_sock.c
> > > > @@ -1001,6 +1001,29 @@ struct sock *inet_csk_reqsk_queue_add(struct 
> > > > sock *sk,
> > > >  }
> > > >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> > > >  
> > > > +static bool inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock 
> > > > *nsk, struct request_sock *req)
> > > > +{
> > > > +   struct request_sock_queue *queue = 
> > > > &inet_csk(nsk)->icsk_accept_queue;
> > > > +   bool migrated = false;
> > > > +
> > > > +   spin_lock(&queue->rskq_lock);
> > > > +   if (likely(nsk->sk_state == TCP_LISTEN)) {
> > > > +   migrated = true;
> > > > +
> > > > +   req->dl_next = NULL;
> > > > +   if (queue->rskq_accept_head == NULL)
> > > > +   WRITE_ONCE(queue->rskq_accept_head, req);
> > > > +   else
> > > > +   queue->rskq_accept_tail->dl_next = req;
> > > > +   queue->rskq_accept_tail = req;
> > > > +   sk_acceptq_added(nsk);
> > > > +   inet_csk_reqsk_queue_migrated(sk, nsk, req);
> > > need to first resolve the question raised in patch 5 regarding
> > > to the update on req->rsk_listener though.
> > 
> > In the unhash path, it is also safe to call sock_put() for the old listner.
> > 
> > In inet_csk_listen_stop(), the sk_refcnt of the listener >= 1. If the
> > listener does not have immature requests, sk_refcnt is 1 and freed in
> > __tcp_close().
> > 
> >   sock_hold(sk) in __tcp_close()
> >   sock_put(sk) in inet_csk_destroy_sock()
> >   sock_put(sk) in __tcp_close()
> I don't see how it is different here than in patch 5.
> I could be missing something.
> 
> Lets contd the discussion on the other thread (patch 5) first.

The listening socket has two kinds of refcounts: one for itself (1) and one
per request (n). I think the listener still holds its own refcount at least
in inet_csk_listen_stop(), so sock_put() here never frees the listener.
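
To illustrate that accounting, here is a hedged sketch; the helper name is
mine, and the body mirrors the inet_csk_reqsk_queue_migrated() hunk quoted
elsewhere in this thread:

---8<---
/* Sketch, not the submitted patch: migrate one request from the closing
 * listener sk to the new listener nsk.  sk->sk_refcnt is assumed to be
 * 1 (the listener itself) + n (one per request), so the sock_put() below
 * only drops the reference this request held on sk and cannot reach zero
 * while inet_csk_listen_stop() still owns the listener's own reference.
 */
static inline void reqsk_migrate_listener_sketch(struct sock *sk,
						 struct sock *nsk,
						 struct request_sock *req)
{
	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
			     &inet_csk(nsk)->icsk_accept_queue, req);
	sock_put(sk);	/* drop the ref this req held on the old listener */
	sock_hold(nsk);	/* the req now pins the new listener */
	req->rsk_listener = nsk;
}
---8<---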


Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-14 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Thu, 10 Dec 2020 10:49:15 -0800
> On Thu, Dec 10, 2020 at 02:15:38PM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Wed, 9 Dec 2020 16:07:07 -0800
> > > On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> > > > This patch renames reuseport_select_sock() to __reuseport_select_sock() 
> > > > and
> > > > adds two wrapper function of it to pass the migration type defined in 
> > > > the
> > > > previous commit.
> > > > 
> > > >   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
> > > >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> > > > 
> > > > As mentioned before, we have to select a new listener for 
> > > > TCP_NEW_SYN_RECV
> > > > requests at receiving the final ACK or sending a SYN+ACK. Therefore, 
> > > > this
> > > > patch also changes the code to call reuseport_select_migrated_sock() 
> > > > even
> > > > if the listening socket is TCP_CLOSE. If we can pick out a listening 
> > > > socket
> > > > from the reuseport group, we rewrite request_sock.rsk_listener and 
> > > > resume
> > > > processing the request.
> > > > 
> > > > Reviewed-by: Benjamin Herrenschmidt 
> > > > Signed-off-by: Kuniyuki Iwashima 
> > > > ---
> > > >  include/net/inet_connection_sock.h | 12 +++
> > > >  include/net/request_sock.h | 13 
> > > >  include/net/sock_reuseport.h   |  8 +++
> > > >  net/core/sock_reuseport.c  | 34 --
> > > >  net/ipv4/inet_connection_sock.c| 13 ++--
> > > >  net/ipv4/tcp_ipv4.c|  9 ++--
> > > >  net/ipv6/tcp_ipv6.c|  9 ++--
> > > >  7 files changed, 81 insertions(+), 17 deletions(-)
> > > > 
> > > > diff --git a/include/net/inet_connection_sock.h 
> > > > b/include/net/inet_connection_sock.h
> > > > index 2ea2d743f8fc..1e0958f5eb21 100644
> > > > --- a/include/net/inet_connection_sock.h
> > > > +++ b/include/net/inet_connection_sock.h
> > > > @@ -272,6 +272,18 @@ static inline void 
> > > > inet_csk_reqsk_queue_added(struct sock *sk)
> > > > reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
> > > >  }
> > > >  
> > > > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> > > > +struct sock *nsk,
> > > > +struct request_sock 
> > > > *req)
> > > > +{
> > > > +   reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> > > > +    &inet_csk(nsk)->icsk_accept_queue,
> > > > +req);
> > > > +   sock_put(sk);
> > > not sure if it is safe to do here.
> > > IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
> > > to req->rsk_listener such that sock_hold(req->rsk_listener) is
> > > safe because its sk_refcnt is not zero.
> > 
> > I think it is safe to call sock_put() for the old listener here.
> > 
> > Without this patchset, at receiving the final ACK or retransmitting
> > SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done
> > by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put().
> Note that in your example (final ACK), sock_put(req->rsk_listener) is
> _only_ called when reqsk_put() can get refcount_dec_and_test(&req->rsk_refcnt)
> to reach zero.
> 
> Here in this patch, it sock_put(req->rsk_listener) without req->rsk_refcnt
> reaching zero.
> 
> Let says there are two cores holding two refcnt to req (one cnt for each core)
> by looking up the req from ehash.  One of the core do this migrate and
> sock_put(req->rsk_listener).  Another core does sock_hold(req->rsk_listener).
> 
>   Core1   Core2
>   sock_put(req->rsk_listener)
> 
>   sock_hold(req->rsk_listener)

I'm sorry for the late reply.

I missed the situation where different cores get into the NEW_SYN_RECV
path, but it does exist.
https://lore.kernel.org/netdev/1517977874.3715.153.ca...@gmail.com/#t
https://lore.kernel.org/netdev/1518531252.3715.178.ca...@gmail.com/


If close() is called for the listener and the request has the last re

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-09 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Wed, 9 Dec 2020 17:53:19 -0800
> On Thu, Dec 10, 2020 at 01:57:19AM +0900, Kuniyuki Iwashima wrote:
> [ ... ]
> 
> > > > > I think it is a bit complex to pass the new listener from
> > > > > reuseport_detach_sock() to inet_csk_listen_stop().
> > > > > 
> > > > > __tcp_close/tcp_disconnect/tcp_abort
> > > > >  |-tcp_set_state
> > > > >  |  |-unhash
> > > > >  | |-reuseport_detach_sock (return nsk)
> > > > >  |-inet_csk_listen_stop
> > > > Picking the new listener does not have to be done in
> > > > reuseport_detach_sock().
> > > > 
> > > > IIUC, it is done there only because it prefers to pick
> > > > the last sk from socks[] when bpf prog is not attached.
> > > > This seems to get into the way of exploring other potential
> > > > implementation options.
> > > 
> > > Yes.
> > > This is just idea, but we can reserve the last index of socks[] to hold 
> > > the
> > > last 'moved' socket in reuseport_detach_sock() and use it in
> > > inet_csk_listen_stop().
> > > 
> > > 
> > > > Merging the discussion on the last socks[] pick from another thread:
> > > > >
> > > > > I think most applications start new listeners before closing 
> > > > > listeners, in
> > > > > this case, selecting the moved socket as the new listener works well.
> > > > >
> > > > >
> > > > > > That said, if it is still desired to do a random pick by kernel when
> > > > > > there is no bpf prog, it probably makes sense to guard it in a 
> > > > > > sysctl as
> > > > > > suggested in another reply.  To keep it simple, I would also keep 
> > > > > > this
> > > > > > kernel-pick consistent instead of request socket is doing something
> > > > > > different from the unhash path.
> > > > >
> > > > > Then, is this way better to keep kernel-pick consistent?
> > > > >
> > > > >   1. call reuseport_select_migrated_sock() without sk_hash from any 
> > > > > path
> > > > >   2. generate a random number in reuseport_select_migrated_sock()
> > > > >   3. pass it to __reuseport_select_sock() only for select-by-hash
> > > > >   (4. pass 0 as sk_hash to bpf_run_sk_reuseport not to use it)
> > > > >   5. do migration per queue in inet_csk_listen_stop() or per request 
> > > > > in
> > > > >  receive path.
> > > > >
> > > > > I understand it is beautiful to keep consistensy, but also think
> > > > > the kernel-pick with heuristic performs better than random-pick.
> > > > I think discussing the best kernel pick without explicit user input
> > > > is going to be a dead end. There is always a case that
> > > > makes this heuristic (or guess) fail.  e.g. what if multiple
> > > > sk(s) being closed are always the last one in the socks[]?
> > > > all their child sk(s) will then be piled up at one listen sk
> > > > because the last socks[] is always picked?
> > > 
> > > There can be such a case, but it means the newly listened sockets are
> > > closed earlier than old ones.
> > > 
> > > 
> > > > Lets assume the last socks[] is indeed the best for all cases.  Then why
> > > > the in-progress req don't pick it this way?  I feel the implementation
> > > > is doing what is convenient at that point.  And that is fine, I think
> > > 
> > > In this patchset, I originally assumed four things:
> > > 
> > >   migration should be done
> > > (i)   from old to new
> > > (ii)  to redistribute requests evenly as possible
> > > (iii) to keep the order of requests in the queue
> > >   (resulting in splicing queues)
> > > (iv)  in O(1) for scalability
> > >   (resulting in fix-up rsk_listener approach)
> > > 
> > > I selected the last socket in unhash path to satisfy above four because 
> > > the
> > > last socket changes at every close() syscall if application closes from
> > > older socket.
> > > 
> > > But in receiving ACK or retransmitting SYN+ACK, we cannot get the last
> > > 'moved' socket. Even if we reserve the last 'moved' socket in the last
> > > index by the 

Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-09 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Wed, 9 Dec 2020 16:07:07 -0800
> On Tue, Dec 01, 2020 at 11:44:12PM +0900, Kuniyuki Iwashima wrote:
> > This patch renames reuseport_select_sock() to __reuseport_select_sock() and
> > adds two wrapper function of it to pass the migration type defined in the
> > previous commit.
> > 
> >   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
> >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> > 
> > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
> > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this
> > patch also changes the code to call reuseport_select_migrated_sock() even
> > if the listening socket is TCP_CLOSE. If we can pick out a listening socket
> > from the reuseport group, we rewrite request_sock.rsk_listener and resume
> > processing the request.
> > 
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/net/inet_connection_sock.h | 12 +++
> >  include/net/request_sock.h | 13 
> >  include/net/sock_reuseport.h   |  8 +++
> >  net/core/sock_reuseport.c  | 34 --
> >  net/ipv4/inet_connection_sock.c| 13 ++--
> >  net/ipv4/tcp_ipv4.c|  9 ++--
> >  net/ipv6/tcp_ipv6.c|  9 ++--
> >  7 files changed, 81 insertions(+), 17 deletions(-)
> > 
> > diff --git a/include/net/inet_connection_sock.h 
> > b/include/net/inet_connection_sock.h
> > index 2ea2d743f8fc..1e0958f5eb21 100644
> > --- a/include/net/inet_connection_sock.h
> > +++ b/include/net/inet_connection_sock.h
> > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct 
> > sock *sk)
> > reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
> >  }
> >  
> > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> > +struct sock *nsk,
> > +struct request_sock *req)
> > +{
> > +   reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> > +    &inet_csk(nsk)->icsk_accept_queue,
> > +req);
> > +   sock_put(sk);
> not sure if it is safe to do here.
> IIUC, when the req->rsk_refcnt is held, it also holds a refcnt
> to req->rsk_listener such that sock_hold(req->rsk_listener) is
> safe because its sk_refcnt is not zero.

I think it is safe to call sock_put() for the old listener here.

Without this patchset, at receiving the final ACK or retransmitting
SYN+ACK, if sk_state == TCP_CLOSE, sock_put(req->rsk_listener) is done
by calling reqsk_put() twice in inet_csk_reqsk_queue_drop_and_put(). And
then, we do `goto lookup;` and overwrite the sk.

In the v2 patchset, refcount_inc_not_zero() is done for the new listener in
reuseport_select_migrated_sock(), so we have to call sock_put() for the old
listener instead to free it properly.

---8<---
+struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
+   struct sk_buff *skb)
+{
+   struct sock *nsk;
+
+   nsk = __reuseport_select_sock(sk, hash, skb, 0, 
BPF_SK_REUSEPORT_MIGRATE_REQUEST);
+   if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
+   return nsk;
+
+   return NULL;
+}
+EXPORT_SYMBOL(reuseport_select_migrated_sock);
---8<---
https://lore.kernel.org/netdev/20201207132456.65472-8-kun...@amazon.co.jp/


> > +   sock_hold(nsk);
> > +   req->rsk_listener = nsk;
> > +}
> > +
> 
> [ ... ]
> 
> > diff --git a/net/ipv4/inet_connection_sock.c 
> > b/net/ipv4/inet_connection_sock.c
> > index 361efe55b1ad..e71653c6eae2 100644
> > --- a/net/ipv4/inet_connection_sock.c
> > +++ b/net/ipv4/inet_connection_sock.c
> > @@ -743,8 +743,17 @@ static void reqsk_timer_handler(struct timer_list *t)
> > struct request_sock_queue *queue = &icsk->icsk_accept_queue;
> > int max_syn_ack_retries, qlen, expire = 0, resend = 0;
> >  
> > -   if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
> > -   goto drop;
> > +   if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
> > +   sk_listener = reuseport_select_migrated_sock(sk_listener,
> > +
> > req_to_sk(req)->sk_hash, NULL);
> > +   if (!sk_listener) {
> > +   sk_listener = req->rsk_listener;
> > +   goto drop;
> > +   }
> > +   inet_csk_reqsk_queue_migrated(req->

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-09 Thread Kuniyuki Iwashima
From:   Kuniyuki Iwashima 
Date:   Wed, 9 Dec 2020 17:05:09 +0900
> From:   Martin KaFai Lau 
> Date:   Tue, 8 Dec 2020 19:09:03 -0800
> > On Tue, Dec 08, 2020 at 05:17:48PM +0900, Kuniyuki Iwashima wrote:
> > > From:   Martin KaFai Lau 
> > > Date:   Mon, 7 Dec 2020 23:34:41 -0800
> > > > On Tue, Dec 08, 2020 at 03:31:34PM +0900, Kuniyuki Iwashima wrote:
> > > > > From:   Martin KaFai Lau 
> > > > > Date:   Mon, 7 Dec 2020 12:33:15 -0800
> > > > > > On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> > > > > > > From:   Eric Dumazet 
> > > > > > > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > > > > > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > > > > > > This patch lets reuseport_detach_sock() return a pointer of 
> > > > > > > > > struct sock,
> > > > > > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > > > > > inet_csk_reqsk_queue_migrate() migrates 
> > > > > > > > > TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > > > > > sockets from the closing listener to the selected one.
> > > > > > > > > 
> > > > > > > > > Listening sockets hold incoming connections as a linked list 
> > > > > > > > > of struct
> > > > > > > > > request_sock in the accept queue, and each request has 
> > > > > > > > > reference to a full
> > > > > > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), 
> > > > > > > > > we only unlink
> > > > > > > > > the requests from the closing listener's queue and relink 
> > > > > > > > > them to the head
> > > > > > > > > of the new listener's queue. We do not process each request 
> > > > > > > > > and its
> > > > > > > > > reference to the listener, so the migration completes in O(1) 
> > > > > > > > > time
> > > > > > > > > complexity. However, in the case of TCP_SYN_RECV sockets, we 
> > > > > > > > > take special
> > > > > > > > > care in the next commit.
> > > > > > > > > 
> > > > > > > > > By default, the kernel selects a new listener randomly. In 
> > > > > > > > > order to pick
> > > > > > > > > out a different socket every time, we select the last element 
> > > > > > > > > of socks[] as
> > > > > > > > > the new listener. This behaviour is based on how the kernel 
> > > > > > > > > moves sockets
> > > > > > > > > in socks[]. (See also [1])
> > > > > > > > > 
> > > > > > > > > Basically, in order to redistribute sockets evenly, we have 
> > > > > > > > > to use an eBPF
> > > > > > > > > program called in the later commit, but as the side effect of 
> > > > > > > > > such default
> > > > > > > > > selection, the kernel can redistribute old requests evenly to 
> > > > > > > > > new listeners
> > > > > > > > > for a specific case where the application replaces listeners 
> > > > > > > > > by
> > > > > > > > > generations.
> > > > > > > > > 
> > > > > > > > > For example, we call listen() for four sockets (A, B, C, D), 
> > > > > > > > > and close the
> > > > > > > > > first two by turns. The sockets move in socks[] like below.
> > > > > > > > > 
> > > > > > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > > > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > > > > > >   socks[2] : C   |  socks[2] : C --'
> > > > > > > > >   socks[3] : D --'
> > > > > > > > > 
> > > > > > > > > Then, if C and D have newer settings than A and B, and each 
> > > > > > > > > socket has a
> > > > > > > > > request (a, b, c, d) i

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-09 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Tue, 8 Dec 2020 19:09:03 -0800
> On Tue, Dec 08, 2020 at 05:17:48PM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Mon, 7 Dec 2020 23:34:41 -0800
> > > On Tue, Dec 08, 2020 at 03:31:34PM +0900, Kuniyuki Iwashima wrote:
> > > > From:   Martin KaFai Lau 
> > > > Date:   Mon, 7 Dec 2020 12:33:15 -0800
> > > > > On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> > > > > > From:   Eric Dumazet 
> > > > > > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > > > > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > > > > > This patch lets reuseport_detach_sock() return a pointer of 
> > > > > > > > struct sock,
> > > > > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > > > > inet_csk_reqsk_queue_migrate() migrates 
> > > > > > > > TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > > > > sockets from the closing listener to the selected one.
> > > > > > > > 
> > > > > > > > Listening sockets hold incoming connections as a linked list of 
> > > > > > > > struct
> > > > > > > > request_sock in the accept queue, and each request has 
> > > > > > > > reference to a full
> > > > > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we 
> > > > > > > > only unlink
> > > > > > > > the requests from the closing listener's queue and relink them 
> > > > > > > > to the head
> > > > > > > > of the new listener's queue. We do not process each request and 
> > > > > > > > its
> > > > > > > > reference to the listener, so the migration completes in O(1) 
> > > > > > > > time
> > > > > > > > complexity. However, in the case of TCP_SYN_RECV sockets, we 
> > > > > > > > take special
> > > > > > > > care in the next commit.
> > > > > > > > 
> > > > > > > > By default, the kernel selects a new listener randomly. In 
> > > > > > > > order to pick
> > > > > > > > out a different socket every time, we select the last element 
> > > > > > > > of socks[] as
> > > > > > > > the new listener. This behaviour is based on how the kernel 
> > > > > > > > moves sockets
> > > > > > > > in socks[]. (See also [1])
> > > > > > > > 
> > > > > > > > Basically, in order to redistribute sockets evenly, we have to 
> > > > > > > > use an eBPF
> > > > > > > > program called in the later commit, but as the side effect of 
> > > > > > > > such default
> > > > > > > > selection, the kernel can redistribute old requests evenly to 
> > > > > > > > new listeners
> > > > > > > > for a specific case where the application replaces listeners by
> > > > > > > > generations.
> > > > > > > > 
> > > > > > > > For example, we call listen() for four sockets (A, B, C, D), 
> > > > > > > > and close the
> > > > > > > > first two by turns. The sockets move in socks[] like below.
> > > > > > > > 
> > > > > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > > > > >   socks[2] : C   |  socks[2] : C --'
> > > > > > > >   socks[3] : D --'
> > > > > > > > 
> > > > > > > > Then, if C and D have newer settings than A and B, and each 
> > > > > > > > socket has a
> > > > > > > > request (a, b, c, d) in their accept queue, we can redistribute 
> > > > > > > > old
> > > > > > > > requests evenly to new listeners.
> > > > > > > > 
> > > > > > > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] 
> > > > > > > > : D (a + d)
> > > > > > > >   socks[1] : B (b

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-08 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Tue, 8 Dec 2020 00:13:28 -0800
> On Tue, Dec 08, 2020 at 03:27:14PM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Mon, 7 Dec 2020 12:14:38 -0800
> > > On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote:
> > > > From:   Martin KaFai Lau 
> > > > Date:   Fri, 4 Dec 2020 17:42:41 -0800
> > > > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
> > > > > [ ... ]
> > > > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > > > > > index fd133516ac0e..60d7c1f28809 100644
> > > > > > --- a/net/core/sock_reuseport.c
> > > > > > +++ b/net/core/sock_reuseport.c
> > > > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct 
> > > > > > sock *sk2, bool bind_inany)
> > > > > >  }
> > > > > >  EXPORT_SYMBOL(reuseport_add_sock);
> > > > > >  
> > > > > > -void reuseport_detach_sock(struct sock *sk)
> > > > > > +struct sock *reuseport_detach_sock(struct sock *sk)
> > > > > >  {
> > > > > > struct sock_reuseport *reuse;
> > > > > > +   struct bpf_prog *prog;
> > > > > > +   struct sock *nsk = NULL;
> > > > > > int i;
> > > > > >  
> > > > > > spin_lock_bh(&reuseport_lock);
> > > > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
> > > > > >  
> > > > > > reuse->num_socks--;
> > > > > > reuse->socks[i] = reuse->socks[reuse->num_socks];
> > > > > > +   prog = rcu_dereference(reuse->prog);
> > > > > Is it under rcu_read_lock() here?
> > > > 
> > > > reuseport_lock is locked in this function, and we do not modify the 
> > > > prog,
> > > > but is rcu_dereference_protected() preferable?
> > > > 
> > > > ---8<---
> > > > prog = rcu_dereference_protected(reuse->prog,
> > > >  lockdep_is_held(&reuseport_lock));
> > > > ---8<---
> > > It is not only reuse->prog.  Other things also require rcu_read_lock(),
> > > e.g. please take a look at __htab_map_lookup_elem().
> > > 
> > > The TCP_LISTEN sk (selected by bpf to be the target of the migration)
> > > is also protected by rcu.
> > 
> > Thank you, I will use rcu_read_lock() and rcu_dereference() in v3 patchset.
> > 
> > 
> > > I am surprised there is no WARNING in the test.
> > > Do you have the needed DEBUG_LOCK* config enabled?
> > 
> > Yes, DEBUG_LOCK* was 'y', but rcu_dereference() without rcu_read_lock()
> > does not show warnings...
> I would at least expect the "WARN_ON_ONCE(!rcu_read_lock_held() ...)"
> from __htab_map_lookup_elem() should fire in your test
> example in the last patch.
> 
> It is better to check the config before sending v3.

It seems ok, but I will check it again.

---8<---
[ec2-user@ip-10-0-0-124 bpf-next]$ cat .config | grep DEBUG_LOCK
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
---8<---


> > > > > > diff --git a/net/ipv4/inet_connection_sock.c 
> > > > > > b/net/ipv4/inet_connection_sock.c
> > > > > > index 1451aa9712b0..b27241ea96bd 100644
> > > > > > --- a/net/ipv4/inet_connection_sock.c
> > > > > > +++ b/net/ipv4/inet_connection_sock.c
> > > > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct 
> > > > > > sock *sk,
> > > > > >  }
> > > > > >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> > > > > >  
> > > > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
> > > > > > +{
> > > > > > +   struct request_sock_queue *old_accept_queue, *new_accept_queue;
> > > > > > +
> > > > > > +   old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
> > > > > > +   new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> > > > > > +
> > > > > > +   spin_lock(&old_accept_queue->rskq_lock);
> > > > > > +   spin_lock(&new_accept_queue->rskq_lock);
> > > > > I am also not very thrilled on this double spin_lock.
> > > > > Can this be done in (or like) inet_csk_listen_stop() instead?

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-08 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Mon, 7 Dec 2020 23:34:41 -0800
> On Tue, Dec 08, 2020 at 03:31:34PM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Mon, 7 Dec 2020 12:33:15 -0800
> > > On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> > > > From:   Eric Dumazet 
> > > > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > > > This patch lets reuseport_detach_sock() return a pointer of struct 
> > > > > > sock,
> > > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > > sockets from the closing listener to the selected one.
> > > > > > 
> > > > > > Listening sockets hold incoming connections as a linked list of 
> > > > > > struct
> > > > > > request_sock in the accept queue, and each request has reference to 
> > > > > > a full
> > > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we only 
> > > > > > unlink
> > > > > > the requests from the closing listener's queue and relink them to 
> > > > > > the head
> > > > > > of the new listener's queue. We do not process each request and its
> > > > > > reference to the listener, so the migration completes in O(1) time
> > > > > > complexity. However, in the case of TCP_SYN_RECV sockets, we take 
> > > > > > special
> > > > > > care in the next commit.
> > > > > > 
> > > > > > By default, the kernel selects a new listener randomly. In order to 
> > > > > > pick
> > > > > > out a different socket every time, we select the last element of 
> > > > > > socks[] as
> > > > > > the new listener. This behaviour is based on how the kernel moves 
> > > > > > sockets
> > > > > > in socks[]. (See also [1])
> > > > > > 
> > > > > > Basically, in order to redistribute sockets evenly, we have to use 
> > > > > > an eBPF
> > > > > > program called in the later commit, but as the side effect of such 
> > > > > > default
> > > > > > selection, the kernel can redistribute old requests evenly to new 
> > > > > > listeners
> > > > > > for a specific case where the application replaces listeners by
> > > > > > generations.
> > > > > > 
> > > > > > For example, we call listen() for four sockets (A, B, C, D), and 
> > > > > > close the
> > > > > > first two by turns. The sockets move in socks[] like below.
> > > > > > 
> > > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > > >   socks[2] : C   |  socks[2] : C --'
> > > > > >   socks[3] : D --'
> > > > > > 
> > > > > > Then, if C and D have newer settings than A and B, and each socket 
> > > > > > has a
> > > > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > > > requests evenly to new listeners.
> > > > > > 
> > > > > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
> > > > > >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
> > > > > >   socks[2] : C (c)   |  socks[2] : C (c) --'
> > > > > >   socks[3] : D (d) --'
> > > > > > 
> > > > > > Here, (A, D) or (B, C) can have different application settings, but 
> > > > > > they
> > > > > > MUST have the same settings at the socket API level; otherwise, 
> > > > > > unexpected
> > > > > > error may happen. For instance, if only the new listeners have
> > > > > > TCP_SAVE_SYN, old requests do not have SYN data, so the application 
> > > > > > will
> > > > > > face inconsistency and cause an error.
> > > > > > 
> > > > > > Therefore, if there are different kinds of sockets, we must attach an eBPF
> > > > > > program described in later commits.

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Mon, 7 Dec 2020 22:54:18 -0800
> On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
> 
> > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
> >  
> > reuse->num_socks--;
> > reuse->socks[i] = reuse->socks[reuse->num_socks];
> > +   prog = rcu_dereference(reuse->prog);
> >  
> > if (sk->sk_protocol == IPPROTO_TCP) {
> > +   if (reuse->num_socks && !prog)
> > +   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
> I asked in the earlier thread if the primary use case is to only
> use the bpf prog to pick.  That thread did not come to
> a solid answer but did conclude that the sysctl should not
> control the behavior of the BPF_SK_REUSEPORT_SELECT_OR_MIGRATE prog.
> 
> From this change here, it seems it is still desired to only depend
> on the kernel to random pick even when no bpf prog is attached.

I wrote it this way only to split the patches into tcp and bpf parts.
So, in the 10th patch, the eBPF prog is run if the type is
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
https://lore.kernel.org/netdev/20201201144418.35045-11-kun...@amazon.co.jp/

But that causes a breakage, so I will move the
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE validation into the 10th patch so that the
type is only available from the 10th patch onwards.

---8<---
case BPF_PROG_TYPE_SK_REUSEPORT:
switch (expected_attach_type) {
case BPF_SK_REUSEPORT_SELECT:
case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: <- move to 10th.
return 0;
default:
return -EINVAL;
}
---8<---


> If that is the case, a sysctl to guard here for not changing
> the current behavior makes sense.
> It should still only control the non-bpf-pick behavior:
> when the sysctl is on, the kernel will still do a random pick
> when there is no bpf prog attached to the reuseport group.
> Thoughts?

If different applications listen on the same port without an eBPF prog, I
think a sysctl is necessary. But honestly, I am not sure such a case really
exists and whether the sysctl is necessary.

If a patchset with the sysctl is more acceptable, I will add it back in the
next spin.


> > +
> > reuse->num_closed_socks++;
> > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
> > } else {
> > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
> > call_rcu(&reuse->rcu, reuseport_free_rcu);
> >  out:
> > spin_unlock_bh(&reuseport_lock);
> > +
> > +   return nsk;
> >  }
> >  EXPORT_SYMBOL(reuseport_detach_sock);


Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Mon, 7 Dec 2020 12:33:15 -0800
> On Thu, Dec 03, 2020 at 11:14:24PM +0900, Kuniyuki Iwashima wrote:
> > From:   Eric Dumazet 
> > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > > > which is used only by inet_unhash(). If it is not NULL,
> > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > sockets from the closing listener to the selected one.
> > > > 
> > > > Listening sockets hold incoming connections as a linked list of struct
> > > > request_sock in the accept queue, and each request has reference to a 
> > > > full
> > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we only 
> > > > unlink
> > > > the requests from the closing listener's queue and relink them to the 
> > > > head
> > > > of the new listener's queue. We do not process each request and its
> > > > reference to the listener, so the migration completes in O(1) time
> > > > complexity. However, in the case of TCP_SYN_RECV sockets, we take 
> > > > special
> > > > care in the next commit.
> > > > 
> > > > By default, the kernel selects a new listener randomly. In order to pick
> > > > out a different socket every time, we select the last element of 
> > > > socks[] as
> > > > the new listener. This behaviour is based on how the kernel moves 
> > > > sockets
> > > > in socks[]. (See also [1])
> > > > 
> > > > Basically, in order to redistribute sockets evenly, we have to use an 
> > > > eBPF
> > > > program called in the later commit, but as the side effect of such 
> > > > default
> > > > selection, the kernel can redistribute old requests evenly to new 
> > > > listeners
> > > > for a specific case where the application replaces listeners by
> > > > generations.
> > > > 
> > > > For example, we call listen() for four sockets (A, B, C, D), and close 
> > > > the
> > > > first two by turns. The sockets move in socks[] like below.
> > > > 
> > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > >   socks[2] : C   |  socks[2] : C --'
> > > >   socks[3] : D --'
> > > > 
> > > > Then, if C and D have newer settings than A and B, and each socket has a
> > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > requests evenly to new listeners.
> > > > 
> > > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
> > > >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
> > > >   socks[2] : C (c)   |  socks[2] : C (c) --'
> > > >   socks[3] : D (d) --'
> > > > 
> > > > Here, (A, D) or (B, C) can have different application settings, but they
> > > > MUST have the same settings at the socket API level; otherwise, 
> > > > unexpected
> > > > error may happen. For instance, if only the new listeners have
> > > > TCP_SAVE_SYN, old requests do not have SYN data, so the application will
> > > > face inconsistency and cause an error.
> > > > 
> > > > Therefore, if there are different kinds of sockets, we must attach an 
> > > > eBPF
> > > > program described in later commits.
> > > > 
> > > > Link: 
> > > > https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
> > > > Reviewed-by: Benjamin Herrenschmidt 
> > > > Signed-off-by: Kuniyuki Iwashima 
> > > > ---
> > > >  include/net/inet_connection_sock.h |  1 +
> > > >  include/net/sock_reuseport.h   |  2 +-
> > > >  net/core/sock_reuseport.c  | 10 +-
> > > >  net/ipv4/inet_connection_sock.c| 30 ++
> > > >  net/ipv4/inet_hashtables.c |  9 +++--
> > > >  5 files changed, 48 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/include/net/inet_connection_sock.h 
> > > > b/include/net/inet_connection_sock.h
> > > > index 

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Mon, 7 Dec 2020 12:14:38 -0800
> On Sun, Dec 06, 2020 at 01:03:07AM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Fri, 4 Dec 2020 17:42:41 -0800
> > > On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
> > > [ ... ]
> > > > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > > > index fd133516ac0e..60d7c1f28809 100644
> > > > --- a/net/core/sock_reuseport.c
> > > > +++ b/net/core/sock_reuseport.c
> > > > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct 
> > > > sock *sk2, bool bind_inany)
> > > >  }
> > > >  EXPORT_SYMBOL(reuseport_add_sock);
> > > >  
> > > > -void reuseport_detach_sock(struct sock *sk)
> > > > +struct sock *reuseport_detach_sock(struct sock *sk)
> > > >  {
> > > > struct sock_reuseport *reuse;
> > > > +   struct bpf_prog *prog;
> > > > +   struct sock *nsk = NULL;
> > > > int i;
> > > >  
> > > > spin_lock_bh(&reuseport_lock);
> > > > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
> > > >  
> > > > reuse->num_socks--;
> > > > reuse->socks[i] = reuse->socks[reuse->num_socks];
> > > > +   prog = rcu_dereference(reuse->prog);
> > > Is it under rcu_read_lock() here?
> > 
> > reuseport_lock is locked in this function, and we do not modify the prog,
> > but is rcu_dereference_protected() preferable?
> > 
> > ---8<---
> > prog = rcu_dereference_protected(reuse->prog,
> >  lockdep_is_held(&reuseport_lock));
> > ---8<---
> It is not only reuse->prog.  Other things also require rcu_read_lock(),
> e.g. please take a look at __htab_map_lookup_elem().
> 
> The TCP_LISTEN sk (selected by bpf to be the target of the migration)
> is also protected by rcu.

Thank you, I will use rcu_read_lock() and rcu_dereference() in v3 patchset.
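
For reference, a rough sketch of the intended v3 change (not the final code):

---8<---
rcu_read_lock();

prog = rcu_dereference(reuse->prog);
/* ... select nsk from reuse->socks[] ... */

rcu_read_unlock();
---8<---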


> I am surprised there is no WARNING in the test.
> Do you have the needed DEBUG_LOCK* config enabled?

Yes, DEBUG_LOCK* was 'y', but rcu_dereference() without rcu_read_lock()
does not show warnings...


> > > > if (sk->sk_protocol == IPPROTO_TCP) {
> > > > +   if (reuse->num_socks && !prog)
> > > > +   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
> > > > +
> > > > reuse->num_closed_socks++;
> > > > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
> > > > } else {
> > > > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
> > > > call_rcu(&reuse->rcu, reuseport_free_rcu);
> > > >  out:
> > > > spin_unlock_bh(&reuseport_lock);
> > > > +
> > > > +   return nsk;
> > > >  }
> > > >  EXPORT_SYMBOL(reuseport_detach_sock);
> > > >  
> > > > diff --git a/net/ipv4/inet_connection_sock.c 
> > > > b/net/ipv4/inet_connection_sock.c
> > > > index 1451aa9712b0..b27241ea96bd 100644
> > > > --- a/net/ipv4/inet_connection_sock.c
> > > > +++ b/net/ipv4/inet_connection_sock.c
> > > > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock 
> > > > *sk,
> > > >  }
> > > >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> > > >  
> > > > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
> > > > +{
> > > > +   struct request_sock_queue *old_accept_queue, *new_accept_queue;
> > > > +
> > > > +   old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
> > > > +   new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> > > > +
> > > > +   spin_lock(&old_accept_queue->rskq_lock);
> > > > +   spin_lock(&new_accept_queue->rskq_lock);
> > > I am also not very thrilled on this double spin_lock.
> > > Can this be done in (or like) inet_csk_listen_stop() instead?
> > 
> > It will be possible to migrate sockets in inet_csk_listen_stop(), but I
> > think it is better to do it just after reuseport_detach_sock() because we
> > can select a different listener (almost) every time at a lower cost by
> > selecting the mo

[PATCH v2 bpf-next 13/13] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

2020-12-07 Thread Kuniyuki Iwashima
This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 .../bpf/prog_tests/select_reuseport_migrate.c | 173 ++
 .../bpf/progs/test_select_reuseport_migrate.c |  53 ++
 2 files changed, 226 insertions(+)
 create mode 100644 
tools/testing/selftests/bpf/prog_tests/select_reuseport_migrate.c
 create mode 100644 
tools/testing/selftests/bpf/progs/test_select_reuseport_migrate.c

diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport_migrate.c 
b/tools/testing/selftests/bpf/prog_tests/select_reuseport_migrate.c
new file mode 100644
index ..814b1e3a4c56
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport_migrate.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check if we can migrate child sockets.
+ *
+ *   1. call listen() for 5 server sockets.
+ *   2. update a map to migrate all child sockets
+ *to the last server socket (migrate_map[cookie] = 4)
+ *   3. call connect() for 25 client sockets.
+ *   4. call close() for first 4 server sockets.
+ *   5. call accept() for the last server socket.
+ *
+ * Author: Kuniyuki Iwashima 
+ */
+
+#include 
+#include 
+
+#include "test_progs.h"
+#include "test_select_reuseport_migrate.skel.h"
+
+#define ADDRESS "127.0.0.1"
+#define PORT 80
+#define NUM_SERVERS 5
+#define NUM_CLIENTS (NUM_SERVERS * 5)
+
+
+static int test_listen(struct test_select_reuseport_migrate *skel, int server_fds[])
+{
+   int i, err, optval = 1, migrated_to = NUM_SERVERS - 1;
+   int prog_fd, reuseport_map_fd, migrate_map_fd;
+   struct sockaddr_in addr;
+   socklen_t addr_len;
+   __u64 value;
+
+   prog_fd = bpf_program__fd(skel->progs.prog_select_reuseport_migrate);
+   reuseport_map_fd = bpf_map__fd(skel->maps.reuseport_map);
+   migrate_map_fd = bpf_map__fd(skel->maps.migrate_map);
+
+   addr_len = sizeof(addr);
+   addr.sin_family = AF_INET;
+   addr.sin_port = htons(PORT);
+   inet_pton(AF_INET, ADDRESS, &addr.sin_addr.s_addr);
+
+   for (i = 0; i < NUM_SERVERS; i++) {
+   server_fds[i] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+   if (CHECK_FAIL(server_fds[i] == -1))
+   return -1;
+
+   err = setsockopt(server_fds[i], SOL_SOCKET, SO_REUSEPORT,
+    &optval, sizeof(optval));
+   if (CHECK_FAIL(err == -1))
+   return -1;
+
+   if (i == 0) {
+   err = setsockopt(server_fds[i], SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
+    &prog_fd, sizeof(prog_fd));
+   if (CHECK_FAIL(err == -1))
+   return -1;
+   }
+
+   err = bind(server_fds[i], (struct sockaddr *)&addr, addr_len);
+   if (CHECK_FAIL(err == -1))
+   return -1;
+
+   err = listen(server_fds[i], 32);
+   if (CHECK_FAIL(err == -1))
+   return -1;
+
+   err = bpf_map_update_elem(reuseport_map_fd, &i, &server_fds[i], BPF_NOEXIST);
+   if (CHECK_FAIL(err == -1))
+   return -1;
+
+   err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value);
+   if (CHECK_FAIL(err == -1))
+   return -1;
+
+   err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, BPF_NOEXIST);
+   if (CHECK_FAIL(err == -1))
+   return -1;
+   }
+
+   return 0;
+}
+
+static int test_connect(int client_fds[])
+{
+   struct sockaddr_in addr;
+   socklen_t addr_len;
+   int i, err;
+
+   addr_len = sizeof(addr);
+   addr.sin_family = AF_INET;
+   addr.sin_port = htons(PORT);
+   inet_pton(AF_INET, ADDRESS, &addr.sin_addr.s_addr);
+
+   for (i = 0; i < NUM_CLIENTS; i++) {
+   client_fds[i] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+   if (CHECK_FAIL(client_fds[i] == -1))
+   return -1;
+
+   err = connect(client_fds[i], (struct sockaddr *)&addr, addr_len);
+   if (CHECK_FAIL(err == -1))
+   return -1;
+   }
+
+   return 0;
+}
+
+static void test_close(int server_fds[], int num)
+{
+   int i;
+
+   for (i = 0; i < num; i++)
+   if (server_fds[i] > 0)
+   close(server_fds[i]);
+}
+
+static int test_accept(int server_fd)
+{
+   struct sockaddr_in addr;
+   socklen_t addr_len;
+   int cnt, client_fd;
+
+   fcntl(server_fd, F_SETFL, O_NONBLOCK);
+   addr_len = sizeof(addr);
+
+   for (cnt = 0; cnt < NUM_CLIENTS; cnt++) {
+   client_fd = accept(server_fd, (struct sockaddr *)&addr, &addr_len);
+   if (CHECK_FAIL(client_fd == -1))
+   return -1;
+   }
+
+   retur

[PATCH v2 bpf-next 12/13] bpf: Call bpf_run_sk_reuseport() for socket migration.

2020-12-07 Thread Kuniyuki Iwashima
This patch supports socket migration by eBPF. If the attached type is
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

There are two noteworthy points. The first is that we select a listening
socket in reuseport_detach_sock() and __reuseport_select_sock(), but we do
not have a struct skb when closing a listener or retransmitting a SYN+ACK.
However, some helper functions do not expect skb to be NULL (e.g.
skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer() in
BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb
temporarily before running the eBPF program. The second is that we do not
have a struct request_sock in the unhash path, and the sk_hash of the
listener is always zero. So we pass zero as hash to bpf_run_sk_reuseport().

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 net/core/filter.c  | 19 +++
 net/core/sock_reuseport.c  | 21 +++--
 net/ipv4/inet_hashtables.c |  2 +-
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 9f7018e3f545..53fa3bcbf00f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9890,10 +9890,29 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport 
*reuse, struct sock *sk,
 {
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
+   bool allocated = false;
+
+   if (migration) {
+   /* cancel migration for possibly incapable eBPF program */
+   if (prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)
+   return ERR_PTR(-ENOTSUPP);
+
+   if (!skb) {
+   allocated = true;
+   skb = alloc_skb(0, GFP_ATOMIC);
+   if (!skb)
+   return ERR_PTR(-ENOMEM);
+   }
+   } else if (!skb) {
+   return NULL; /* fall back to select by hash */
+   }
 
bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash, migration);
action = BPF_PROG_RUN(prog, &reuse_kern);
 
+   if (allocated)
+   kfree_skb(skb);
+
if (action == SK_PASS)
return reuse_kern.selected_sk;
else
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b877c8e552d2..2358e8896199 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -221,8 +221,15 @@ struct sock *reuseport_detach_sock(struct sock *sk)
 
 lockdep_is_held(&reuseport_lock));
 
if (sk->sk_protocol == IPPROTO_TCP) {
-   if (reuse->num_socks && !prog)
-   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+   if (reuse->num_socks) {
+   if (prog)
+   nsk = bpf_run_sk_reuseport(reuse, sk, prog, NULL, 0,
+  BPF_SK_REUSEPORT_MIGRATE_QUEUE);
+
+   if (!nsk)
+   nsk = i == reuse->num_socks ?
+   reuse->socks[i - 1] : reuse->socks[i];
+   }
 
reuse->num_closed_socks++;
} else {
@@ -306,15 +313,9 @@ static struct sock *__reuseport_select_sock(struct sock 
*sk, u32 hash,
if (!prog)
goto select_by_hash;
 
-   if (migration)
-   goto out;
-
-   if (!skb)
-   goto select_by_hash;
-
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash, migration);
-   else
+   else if (!skb)
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
 
 select_by_hash:
@@ -352,7 +353,7 @@ struct sock *reuseport_select_migrated_sock(struct sock 
*sk, u32 hash,
struct sock *nsk;
 
nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);
-   if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
+   if (!IS_ERR_OR_NULL(nsk) && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
return nsk;
 
return NULL;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 545538a6bfac..59f58740c20d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -699,7 +699,7 @@ void inet_unhash(struct sock *sk)
 
if (rcu_access_pointer(sk->sk_reuseport_cb)) {
   

[PATCH v2 bpf-next 11/13] bpf: Support BPF_FUNC_get_socket_cookie() for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-07 Thread Kuniyuki Iwashima
We will call sock_reuseport.prog for socket migration in the next commit,
so the eBPF program has to know which listener is closing in order to
select the new listener.

Currently, we can get a unique ID for each listener in userspace by
calling bpf_map_lookup_elem() for the BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.

This patch makes the sk pointer available in sk_reuseport_md so that we can
get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program.
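
For illustration, a sketch of how an eBPF program could use the new field
(the migrate_map name and the SK_DROP policy are illustrative, not part of
this patch):

---8<---
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 256);
	__type(key, __u64);	/* socket cookie of a listener */
	__type(value, int);	/* index of the target listener */
} migrate_map SEC(".maps");

SEC("sk_reuseport/migrate")
int _select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	/* unique ID of the closing (or selecting) listener */
	__u64 cookie = bpf_get_socket_cookie(reuse_md->sk);
	int *target = bpf_map_lookup_elem(&migrate_map, &cookie);

	if (!target)
		return SK_DROP;

	/* *target can then be used as the key for
	 * bpf_sk_select_reuseport() on a REUSEPORT_SOCKARRAY.
	 */
	return SK_PASS;
}
---8<---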

Link: 
https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   |  8 
 net/core/filter.c  | 22 ++
 tools/include/uapi/linux/bpf.h |  8 
 3 files changed, 38 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cf518e83df5c..a688a7a4fe85 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1655,6 +1655,13 @@ union bpf_attr {
  * A 8-byte long non-decreasing number on success, or 0 if the
  * socket field is missing inside *skb*.
  *
+ * u64 bpf_get_socket_cookie(struct bpf_sock *sk)
+ * Description
+ * Equivalent to bpf_get_socket_cookie() helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock** context.
+ * Return
+ * A 8-byte long non-decreasing number.
+ *
  * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
  * Description
  * Equivalent to bpf_get_socket_cookie() helper that accepts
@@ -4463,6 +4470,7 @@ struct sk_reuseport_md {
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
__u8 migration; /* Migration type */
+   __bpf_md_ptr(struct bpf_sock *, sk); /* Current listening socket */
 };
 
 #define BPF_TAG_SIZE   8
diff --git a/net/core/filter.c b/net/core/filter.c
index 7bdf62f24044..9f7018e3f545 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4631,6 +4631,18 @@ static const struct bpf_func_proto 
bpf_get_socket_cookie_sock_proto = {
.arg1_type  = ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_socket_pointer_cookie, struct sock *, sk)
+{
+   return __sock_gen_cookie(sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_pointer_cookie_proto = {
+   .func   = bpf_get_socket_pointer_cookie,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_SOCKET,
+};
+
 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
 {
return __sock_gen_cookie(ctx->sk);
@@ -9989,6 +10001,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
return &sk_reuseport_load_bytes_proto;
case BPF_FUNC_skb_load_bytes_relative:
return &sk_reuseport_load_bytes_relative_proto;
+   case BPF_FUNC_get_socket_cookie:
+   return &bpf_get_socket_pointer_cookie_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -10022,6 +10036,10 @@ sk_reuseport_is_valid_access(int off, int size,
return prog->expected_attach_type == 
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE &&
size == sizeof(__u8);
 
+   case offsetof(struct sk_reuseport_md, sk):
+   info->reg_type = PTR_TO_SOCKET;
+   return size == sizeof(__u64);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10098,6 +10116,10 @@ static u32 sk_reuseport_convert_ctx_access(enum 
bpf_access_type type,
case offsetof(struct sk_reuseport_md, migration):
SK_REUSEPORT_LOAD_FIELD(migration);
break;
+
+   case offsetof(struct sk_reuseport_md, sk):
+   SK_REUSEPORT_LOAD_FIELD(sk);
+   break;
}
 
return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cf518e83df5c..a688a7a4fe85 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1655,6 +1655,13 @@ union bpf_attr {
  * A 8-byte long non-decreasing number on success, or 0 if the
  * socket field is missing inside *skb*.
  *
+ * u64 bpf_get_socket_cookie(struct bpf_sock *sk)
+ * Description
+ * Equivalent to bpf_get_socket_cookie() helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock** context.
+ * Return
+ * A 8-byte long non-decreasing number.
+ *
  * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
  * Description
  * Equivalent to bpf_get_socket_cookie() helper that accepts
@@ -4463,6 +4470,7 @@ struct sk_reuseport_md {
__u32 bind_inany;   /* Is sock bound to an INANY addre

[PATCH v2 bpf-next 10/13] bpf: Add migration to sk_reuseport_(kern|md).

2020-12-07 Thread Kuniyuki Iwashima
This patch adds a u8 migration field to sk_reuseport_kern and sk_reuseport_md
to signal the eBPF program whether the kernel calls it for selecting a listener
for a SYN or for migrating sockets in the accept queue or an immature socket
during the 3WHS.

Note that this field is accessible only if the attached type is
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
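
As a sketch (illustrative only), a program attached with
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE can branch on the new field like this:

---8<---
SEC("sk_reuseport/migrate")
int _check_migration(struct sk_reuseport_md *reuse_md)
{
	if (reuse_md->migration == BPF_SK_REUSEPORT_MIGRATE_NO)
		return SK_PASS;	/* selecting a listener for a SYN, as before */

	/* BPF_SK_REUSEPORT_MIGRATE_QUEUE or BPF_SK_REUSEPORT_MIGRATE_REQUEST:
	 * pick a new listener with bpf_sk_select_reuseport(), or return
	 * SK_DROP to cancel the migration.
	 */
	return SK_DROP;
}
---8<---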

Link: 
https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/linux/bpf.h|  1 +
 include/linux/filter.h |  4 ++--
 include/uapi/linux/bpf.h   |  1 +
 net/core/filter.c  | 15 ---
 net/core/sock_reuseport.c  |  2 +-
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d05e75ed8c1b..cdeb27f4ad63 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1914,6 +1914,7 @@ struct sk_reuseport_kern {
u32 hash;
u32 reuseport_id;
bool bind_inany;
+   u8 migration;
 };
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
  struct bpf_insn_access_aux *info);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1b62397bd124..15d5bf13a905 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -967,12 +967,12 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock 
*sk,
  struct bpf_prog *prog, struct sk_buff *skb,
- u32 hash);
+ u32 hash, u8 migration);
 #else
 static inline struct sock *
 bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 struct bpf_prog *prog, struct sk_buff *skb,
-u32 hash)
+u32 hash, u8 migration)
 {
return NULL;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c7f6848c0226..cf518e83df5c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4462,6 +4462,7 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   __u8 migration; /* Migration type */
 };
 
 #define BPF_TAG_SIZE   8
diff --git a/net/core/filter.c b/net/core/filter.c
index 77001a35768f..7bdf62f24044 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9860,7 +9860,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter 
__user *ubuf,
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
-   u32 hash)
+   u32 hash, u8 migration)
 {
reuse_kern->skb = skb;
reuse_kern->sk = sk;
@@ -9869,16 +9869,17 @@ static void bpf_init_reuseport_kern(struct 
sk_reuseport_kern *reuse_kern,
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
reuse_kern->bind_inany = reuse->bind_inany;
+   reuse_kern->migration = migration;
 }
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock 
*sk,
  struct bpf_prog *prog, struct sk_buff *skb,
- u32 hash)
+ u32 hash, u8 migration)
 {
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
 
-   bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+   bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash, migration);
action = BPF_PROG_RUN(prog, &reuse_kern);
 
if (action == SK_PASS)
@@ -10017,6 +10018,10 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
 
+   case bpf_ctx_range(struct sk_reuseport_md, migration):
+   return prog->expected_attach_type == 
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE &&
+   size == sizeof(__u8);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10089,6 +10094,10 @@ static u32 sk_reuseport_convert_ctx_access(enum 
bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+   case offsetof(struct sk_reuseport_md, migration):
+   SK_REUSEPORT_LOAD_FIELD(migration);
+   break;
}
 
return 

[PATCH v2 bpf-next 09/13] libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-07 Thread Kuniyuki Iwashima
This commit introduces a new section (sk_reuseport/migrate) and sets an
expected_attach_type for each of the two sections of a
BPF_PROG_TYPE_SK_REUSEPORT program.
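
So, as a sketch, the two section names map to the attach types like this:

---8<---
SEC("sk_reuseport")		/* BPF_SK_REUSEPORT_SELECT */
int _select(struct sk_reuseport_md *reuse_md)
{
	return SK_PASS;
}

SEC("sk_reuseport/migrate")	/* BPF_SK_REUSEPORT_SELECT_OR_MIGRATE */
int _select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	return SK_PASS;
}
---8<---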

Signed-off-by: Kuniyuki Iwashima 
---
 tools/lib/bpf/libbpf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9be88a90a4aa..ba64c891a5e7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8471,7 +8471,10 @@ static struct bpf_link *attach_iter(const struct 
bpf_sec_def *sec,
 
 static const struct bpf_sec_def section_defs[] = {
BPF_PROG_SEC("socket",  BPF_PROG_TYPE_SOCKET_FILTER),
-   BPF_PROG_SEC("sk_reuseport",BPF_PROG_TYPE_SK_REUSEPORT),
+   BPF_EAPROG_SEC("sk_reuseport/migrate",  BPF_PROG_TYPE_SK_REUSEPORT,
+   
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE),
+   BPF_EAPROG_SEC("sk_reuseport",  BPF_PROG_TYPE_SK_REUSEPORT,
+   BPF_SK_REUSEPORT_SELECT),
SEC_DEF("kprobe/", KPROBE,
.attach_fn = attach_kprobe),
BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE),
-- 
2.17.2 (Apple Git-113)



[PATCH v2 bpf-next 08/13] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-07 Thread Kuniyuki Iwashima
This commit adds new bpf_attach_types for BPF_PROG_TYPE_SK_REUSEPORT to
check whether the attached eBPF program is capable of migrating sockets.

When the eBPF program is attached, the kernel runs it for socket migration
only if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
The kernel will change the behaviour depending on the returned value:

  - SK_PASS with selected_sk, select it as a new listener
  - SK_PASS with selected_sk NULL, fall back to the random selection
  - SK_DROP, cancel the migration
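
For example, a minimal SELECT_OR_MIGRATE program honouring these return
values could look like the following sketch (the map is illustrative, not
part of this patch):

---8<---
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 256);
	__type(key, __u32);
	__type(value, __u64);
} reuseport_map SEC(".maps");

SEC("sk_reuseport/migrate")
int _select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	__u32 index = 0;
	long err;

	err = bpf_sk_select_reuseport(reuse_md, &reuseport_map, &index, 0);
	if (!err)
		return SK_PASS;		/* SK_PASS with selected_sk: use it */

	if (reuse_md->migration)
		return SK_DROP;		/* cancel the migration */

	return SK_PASS;			/* selected_sk NULL: random selection */
}

char _license[] SEC("license") = "GPL";
---8<---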

Link: 
https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   |  2 ++
 kernel/bpf/syscall.c   | 13 +
 tools/include/uapi/linux/bpf.h |  2 ++
 3 files changed, 17 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7a48e0055500..c7f6848c0226 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -241,6 +241,8 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
+   BPF_SK_REUSEPORT_SELECT,
+   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0cd3cc2af9c1..0737673c727c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1920,6 +1920,11 @@ static void bpf_prog_load_fixup_attach_type(union 
bpf_attr *attr)
attr->expected_attach_type =
BPF_CGROUP_INET_SOCK_CREATE;
break;
+   case BPF_PROG_TYPE_SK_REUSEPORT:
+   if (!attr->expected_attach_type)
+   attr->expected_attach_type =
+   BPF_SK_REUSEPORT_SELECT;
+   break;
}
 }
 
@@ -2003,6 +2008,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
+   case BPF_PROG_TYPE_SK_REUSEPORT:
+   switch (expected_attach_type) {
+   case BPF_SK_REUSEPORT_SELECT:
+   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+   return 0;
+   default:
+   return -EINVAL;
+   }
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7a48e0055500..c7f6848c0226 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -241,6 +241,8 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
+   BPF_SK_REUSEPORT_SELECT,
+   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
__MAX_BPF_ATTACH_TYPE
 };
 
-- 
2.17.2 (Apple Git-113)



[PATCH v2 bpf-next 07/13] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-07 Thread Kuniyuki Iwashima
This patch renames reuseport_select_sock() to __reuseport_select_sock() and
adds two wrapper functions of it to pass the migration type defined in the
previous commit.

  reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
  reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
requests at receiving the final ACK or sending a SYN+ACK. Therefore, this
patch also changes the code to call reuseport_select_migrated_sock() even
if the listening socket is TCP_CLOSE. If we can pick out a listening socket
from the reuseport group, we rewrite request_sock.rsk_listener and resume
processing the request.
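
Conceptually, based on the description above, the two wrappers boil down to
something like this sketch:

---8<---
struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
				   struct sk_buff *skb, int hdr_len)
{
	return __reuseport_select_sock(sk, hash, skb, hdr_len,
				       BPF_SK_REUSEPORT_MIGRATE_NO);
}

struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
					    struct sk_buff *skb)
{
	struct sock *nsk;

	nsk = __reuseport_select_sock(sk, hash, skb, 0,
				      BPF_SK_REUSEPORT_MIGRATE_REQUEST);
	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
		return nsk;	/* the caller rewrites req->rsk_listener to nsk */

	return NULL;
}
---8<---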

Link: https://lore.kernel.org/bpf/202012020136.bf0z4guu-...@intel.com/
Reported-by: kernel test robot 
Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/inet_connection_sock.h | 11 
 include/net/request_sock.h | 13 ++
 include/net/sock_reuseport.h   |  8 +++---
 net/core/sock_reuseport.c  | 40 --
 net/ipv4/inet_connection_sock.c| 13 --
 net/ipv4/tcp_ipv4.c|  9 +--
 net/ipv6/tcp_ipv6.c|  9 +--
 7 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 2ea2d743f8fc..d8c3be31e987 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -272,6 +272,17 @@ static inline void inet_csk_reqsk_queue_added(struct sock 
*sk)
reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
 }
 
+static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
+struct sock *nsk,
+struct request_sock *req)
+{
+   reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
+    &inet_csk(nsk)->icsk_accept_queue,
+    req);
+   sock_put(sk);
+   req->rsk_listener = nsk;
+}
+
 static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 {
return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..d18ba0b857cc 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -226,6 +226,19 @@ static inline void reqsk_queue_added(struct 
request_sock_queue *queue)
atomic_inc(&queue->qlen);
 }
 
+static inline void reqsk_queue_migrated(struct request_sock_queue *old_accept_queue,
+   struct request_sock_queue *new_accept_queue,
+   const struct request_sock *req)
+{
+   atomic_dec(&old_accept_queue->qlen);
+   atomic_inc(&new_accept_queue->qlen);
+
+   if (req->num_timeout == 0) {
+   atomic_dec(&old_accept_queue->young);
+   atomic_inc(&new_accept_queue->young);
+   }
+}
+
 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
 {
return atomic_read(&queue->qlen);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 09a1b1539d4c..a48259a974be 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -32,10 +32,10 @@ extern int reuseport_alloc(struct sock *sk, bool 
bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
  bool bind_inany);
 extern struct sock *reuseport_detach_sock(struct sock *sk);
-extern struct sock *reuseport_select_sock(struct sock *sk,
- u32 hash,
- struct sk_buff *skb,
- int hdr_len);
+extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
+ struct sk_buff *skb, int hdr_len);
+extern struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
+  struct sk_buff *skb);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 2de42f8103ea..1011c3756c92 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -170,7 +170,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
}
 
reuse->socks[reuse->num_socks] = sk;
-   /* paired with smp_rmb() in reuseport_select_sock() */
+   /* paired with smp_rmb() in __reuseport_select_sock() */
smp_wmb();
reuse->num_socks++;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
@@ -277,12 +277,13 @@ static struct sock *run_bpf_filter(struct sock_reuseport 
*reuse, u16 socks,
  *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
  *the skb d

[PATCH v2 bpf-next 06/13] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-07 Thread Kuniyuki Iwashima
This patch lets reuseport_detach_sock() return a pointer of struct sock,
which is used only by inet_unhash(). If it is not NULL,
inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
sockets from the closing listener to the selected one.

By default, the kernel selects a new listener randomly. In order to pick
out a different socket every time, we select the last element of socks[] as
the new listener. This behaviour is based on how the kernel moves sockets
in socks[]. (See also [1])

Basically, in order to redistribute sockets evenly, we have to use an eBPF
program called in the later commit, but as the side effect of such default
selection, the kernel can redistribute old requests evenly to new listeners
for a specific case where the application replaces listeners by
generations.

For example, we call listen() for four sockets (A, B, C, D), and close()
the first two by turns. The sockets move in socks[] like below.

  socks[0] : A <-.  socks[0] : D  socks[0] : D
  socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
  socks[2] : C   |  socks[2] : C --'
  socks[3] : D --'

Then, if C and D have newer settings than A and B, and each socket has a
request (a, b, c, d) in their accept queue, we can redistribute old
requests evenly to new listeners.

  socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
  socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
  socks[2] : C (c)   |  socks[2] : C (c) --'
  socks[3] : D (d) --'

Here, (A, D) or (B, C) can have different application settings, but they
MUST have the same settings at the socket API level; otherwise, unexpected
error may happen. For instance, if only the new listeners have
TCP_SAVE_SYN, old requests do not hold SYN data, so the application will
face inconsistency and cause an error.

Therefore, if there are different kinds of sockets, we must attach an eBPF
program described in later commits.
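
To make the TCP_SAVE_SYN example concrete, a hypothetical sketch of the
user-visible inconsistency (fd names are illustrative):

---8<---
int one = 1;

/* only the new generation of listeners enables TCP_SAVE_SYN */
setsockopt(new_listener, IPPROTO_TCP, TCP_SAVE_SYN, &one, sizeof(one));

/* a child migrated from an old listener has no saved SYN, so this
 * getsockopt() reports no data and an application that expects the
 * SYN headers sees an inconsistency.
 */
int child = accept(new_listener, NULL, NULL);
char syn[512];
socklen_t len = sizeof(syn);

getsockopt(child, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len);
---8<---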

Link: 
https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/sock_reuseport.h |  2 +-
 net/core/sock_reuseport.c| 16 +---
 net/ipv4/inet_hashtables.c   |  9 +++--
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0e558ca7afbf..09a1b1539d4c 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -31,7 +31,7 @@ struct sock_reuseport {
 extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
  bool bind_inany);
-extern void reuseport_detach_sock(struct sock *sk);
+extern struct sock *reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
  u32 hash,
  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index c26f4256ff41..2de42f8103ea 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -184,9 +184,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
 }
 EXPORT_SYMBOL(reuseport_add_sock);
 
-void reuseport_detach_sock(struct sock *sk)
+struct sock *reuseport_detach_sock(struct sock *sk)
 {
struct sock_reuseport *reuse;
+   struct bpf_prog *prog;
+   struct sock *nsk = NULL;
int i;
 
spin_lock_bh(&reuseport_lock);
@@ -215,17 +217,25 @@ void reuseport_detach_sock(struct sock *sk)
 
reuse->num_socks--;
reuse->socks[i] = reuse->socks[reuse->num_socks];
+   prog = rcu_dereference_protected(reuse->prog,
+    lockdep_is_held(&reuseport_lock));
+
+   if (sk->sk_protocol == IPPROTO_TCP) {
+   if (reuse->num_socks && !prog)
+   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
 
-   if (sk->sk_protocol == IPPROTO_TCP)
reuse->num_closed_socks++;
-   else
+   } else {
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+   }
}
 
if (reuse->num_socks + reuse->num_closed_socks == 0)
call_rcu(&reuse->rcu, reuseport_free_rcu);
 
spin_unlock_bh(&reuseport_lock);
+
+   return nsk;
 }
 EXPORT_SYMBOL(reuseport_detach_sock);
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 45fb450b4522..545538a6bfac 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -681,6 +681,7 @@ void inet_unhash(struct sock *sk)
 {
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb = 

[PATCH v2 bpf-next 05/13] tcp: Set the new listener to migrated TFO requests.

2020-12-07 Thread Kuniyuki Iwashima
A TFO request socket is only freed after BOTH 3WHS has completed (or
aborted) and the child socket has been accepted (or its listener has been
closed). Hence, depending on the order, there can be two kinds of request
sockets in the accept queue.

  3WHS -> accept : TCP_ESTABLISHED
  accept -> 3WHS : TCP_SYN_RECV

Unlike TCP_ESTABLISHED socket, accept() does not free the request socket
for TCP_SYN_RECV socket. It is freed later at reqsk_fastopen_remove().
Also, it accesses request_sock.rsk_listener. So, in order to complete TFO
socket migration, we have to set the current listener to it at accept()
before reqsk_fastopen_remove().

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 net/ipv4/inet_connection_sock.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5da38a756e4c..143590858c2e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -500,6 +500,16 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, 
int *err, bool kern)
tcp_rsk(req)->tfo_listener) {
spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
+   if (req->rsk_listener != sk) {
+   /* TFO request was migrated to another listener so
+* the new listener must be used in reqsk_fastopen_remove()
+* to hold requests which cause RST.
+*/
+   sock_put(req->rsk_listener);
+   sock_hold(sk);
+   req->rsk_listener = sk;
+   }
+
/* We are still waiting for the final ACK from 3WHS
 * so can't free req now. Instead, we set req->sk to
 * NULL to signify that the child socket is taken
@@ -954,7 +964,6 @@ static void inet_child_forget(struct sock *sk, struct 
request_sock *req,
 
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
-   BUG_ON(sk != req->rsk_listener);
 
/* Paranoid, to prevent race condition if
 * an inbound pkt destined for child is
-- 
2.17.2 (Apple Git-113)



[PATCH v2 bpf-next 04/13] tcp: Introduce inet_csk_reqsk_queue_migrate().

2020-12-07 Thread Kuniyuki Iwashima
This patch defines a new function to migrate ESTABLISHED/SYN_RECV sockets.

Listening sockets hold incoming connections as a linked list of struct
request_sock in the accept queue, and each request has reference to its
full socket and listener. In inet_csk_reqsk_queue_migrate(), we only unlink
the requests from the closing listener's queue and relink them to the head
of the new listener's queue. We do not process each request and its
reference to the listener, so the migration completes in O(1) time
complexity.

Moreover, if TFO requests caused RST before 3WHS has completed, they are
held in the listener's TFO queue to prevent DDoS attack. Thus, we also
migrate the requests in the TFO queue in the same way.

After 3WHS has completed, there are three access patterns to incoming
sockets:

  (1) access to the full socket instead of request_sock
  (2) access to request_sock from accept queue
  (3) access to request_sock from TFO queue

In the first case, the full socket does not have a reference to its request
socket and listener, so we do not need the correct listener set in the
request socket. In the second case, we always have the correct listener and
currently do not use req->rsk_listener. However, in the third case of
TCP_SYN_RECV sockets, we take special care in the next commit.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/inet_connection_sock.h |  1 +
 net/ipv4/inet_connection_sock.c| 68 ++
 2 files changed, 69 insertions(+)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct 
sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
  struct request_sock *req,
  struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1451aa9712b0..5da38a756e4c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -992,6 +992,74 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
+{
+   struct request_sock_queue *old_accept_queue, *new_accept_queue;
+   struct fastopen_queue *old_fastopenq, *new_fastopenq;
+   spinlock_t *l1, *l2, *l3, *l4;
+
+   old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
+   new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
+   old_fastopenq = &old_accept_queue->fastopenq;
+   new_fastopenq = &new_accept_queue->fastopenq;
+
+   l1 = &old_accept_queue->rskq_lock;
+   l2 = &new_accept_queue->rskq_lock;
+   l3 = &old_fastopenq->lock;
+   l4 = &new_fastopenq->lock;
+
+   /* sk is never selected as the new listener from reuse->socks[],
+* so inversion deadlock does not happen here,
+* but change the order to avoid the warning of lockdep.
+*/
+   if (sk < nsk) {
+   swap(l1, l2);
+   swap(l3, l4);
+   }
+
+   spin_lock(l1);
+   spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+
+   if (old_accept_queue->rskq_accept_head) {
+   if (new_accept_queue->rskq_accept_head)
+   old_accept_queue->rskq_accept_tail->dl_next =
+   new_accept_queue->rskq_accept_head;
+   else
+   new_accept_queue->rskq_accept_tail = old_accept_queue->rskq_accept_tail;
+
+   new_accept_queue->rskq_accept_head = old_accept_queue->rskq_accept_head;
+   old_accept_queue->rskq_accept_head = NULL;
+   old_accept_queue->rskq_accept_tail = NULL;
+
+   WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + sk->sk_ack_backlog);
+   WRITE_ONCE(sk->sk_ack_backlog, 0);
+   }
+
+   spin_unlock(l2);
+   spin_unlock(l1);
+
+   spin_lock_bh(l3);
+   spin_lock_bh_nested(l4, SINGLE_DEPTH_NESTING);
+
+   new_fastopenq->qlen += old_fastopenq->qlen;
+   old_fastopenq->qlen = 0;
+
+   if (old_fastopenq->rskq_rst_head) {
+   if (new_fastopenq->rskq_rst_head)
+   old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+   else
+   old_fastopenq->rskq_rst_tail = new_fastopenq->rskq_rst_tail;
+
+   new_

[PATCH v2 bpf-next 03/13] Revert "locking/spinlocks: Remove the unused spin_lock_bh_nested() API"

2020-12-07 Thread Kuniyuki Iwashima
This reverts commit 607904c357c61adf20b8fd18af765e501d61a385 to use
spin_lock_bh_nested() in the next commit.

Link: 
https://lore.kernel.org/netdev/9d290a57-49e1-04cd-2487-262b0d7c5...@gmail.com/
Signed-off-by: Kuniyuki Iwashima 
CC: Waiman Long 
---
 include/linux/spinlock.h | 8 
 include/linux/spinlock_api_smp.h | 2 ++
 include/linux/spinlock_api_up.h  | 1 +
 kernel/locking/spinlock.c| 8 
 4 files changed, 19 insertions(+)

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 79897841a2cc..c020b375a071 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -227,6 +227,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) 
__releases(lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define raw_spin_lock_nested(lock, subclass) \
_raw_spin_lock_nested(lock, subclass)
+# define raw_spin_lock_bh_nested(lock, subclass) \
+   _raw_spin_lock_bh_nested(lock, subclass)
 
 # define raw_spin_lock_nest_lock(lock, nest_lock)  \
 do {   \
@@ -242,6 +244,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) 
__releases(lock)
 # define raw_spin_lock_nested(lock, subclass)  \
_raw_spin_lock(((void)(subclass), (lock)))
 # define raw_spin_lock_nest_lock(lock, nest_lock)  _raw_spin_lock(lock)
+# define raw_spin_lock_bh_nested(lock, subclass)   _raw_spin_lock_bh(lock)
 #endif
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
@@ -369,6 +372,11 @@ do {   
\
raw_spin_lock_nested(spinlock_check(lock), subclass);   \
 } while (0)
 
+#define spin_lock_bh_nested(lock, subclass)\
+do {   \
+   raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\
+} while (0)
+
 #define spin_lock_nest_lock(lock, nest_lock)   \
 do {   \
raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);   \
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 19a9be9d97ee..d565fb6304f2 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr);
 void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)   
__acquires(lock);
 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)

__acquires(lock);
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+   
__acquires(lock);
 void __lockfunc
 _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)

__acquires(lock);
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index d0d188861ad6..d3afef9d8dbe 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -57,6 +57,7 @@
 
 #define _raw_spin_lock(lock)   __LOCK(lock)
 #define _raw_spin_lock_nested(lock, subclass)  __LOCK(lock)
+#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
 #define _raw_read_lock(lock)   __LOCK(lock)
 #define _raw_write_lock(lock)  __LOCK(lock)
 #define _raw_spin_lock_bh(lock)__LOCK_BH(lock)
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 0ff08380f531..48e99ed1bdd8 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t 
*lock, int subclass)
 }
 EXPORT_SYMBOL(_raw_spin_lock_nested);
 
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+   __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+   spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+   LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
 unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
   int subclass)
 {
-- 
2.17.2 (Apple Git-113)



[PATCH v2 bpf-next 02/13] bpf: Define migration types for SO_REUSEPORT.

2020-12-07 Thread Kuniyuki Iwashima
As noted in the preceding commit, there are two migration types. In
addition to that, the kernel will run the same eBPF program to select a
listener for SYN packets.

This patch defines three types to signal the kernel and the eBPF program
whether it is receiving a new request or migrating ESTABLISHED/SYN_RECV
sockets in the accept queue or a NEW_SYN_RECV socket during the 3WHS.

Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   | 14 ++
 tools/include/uapi/linux/bpf.h | 14 ++
 2 files changed, 28 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1233f14f659f..7a48e0055500 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4423,6 +4423,20 @@ struct sk_msg_md {
__bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
 };
 
+/* Migration type for SO_REUSEPORT enabled TCP sockets.
+ *
+ * BPF_SK_REUSEPORT_MIGRATE_NO  : Select a listener for SYN packets.
+ * BPF_SK_REUSEPORT_MIGRATE_QUEUE   : Migrate ESTABLISHED and SYN_RECV sockets in
+ *the accept queue at close() or shutdown().
+ * BPF_SK_REUSEPORT_MIGRATE_REQUEST : Migrate NEW_SYN_RECV socket at receiving the
+ *final ACK of 3WHS or retransmitting SYN+ACKs.
+ */
+enum {
+   BPF_SK_REUSEPORT_MIGRATE_NO,
+   BPF_SK_REUSEPORT_MIGRATE_QUEUE,
+   BPF_SK_REUSEPORT_MIGRATE_REQUEST,
+};
+
 struct sk_reuseport_md {
/*
 * Start of directly accessible data. It begins from
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1233f14f659f..7a48e0055500 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4423,6 +4423,20 @@ struct sk_msg_md {
__bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
 };
 
+/* Migration type for SO_REUSEPORT enabled TCP sockets.
+ *
+ * BPF_SK_REUSEPORT_MIGRATE_NO  : Select a listener for SYN packets.
+ * BPF_SK_REUSEPORT_MIGRATE_QUEUE   : Migrate ESTABLISHED and SYN_RECV sockets in
+ *                                    the accept queue at close() or shutdown().
+ * BPF_SK_REUSEPORT_MIGRATE_REQUEST : Migrate NEW_SYN_RECV socket at receiving the
+ *                                    final ACK of 3WHS or retransmitting SYN+ACKs.
+ */
+enum {
+   BPF_SK_REUSEPORT_MIGRATE_NO,
+   BPF_SK_REUSEPORT_MIGRATE_QUEUE,
+   BPF_SK_REUSEPORT_MIGRATE_REQUEST,
+};
+
 struct sk_reuseport_md {
/*
 * Start of directly accessible data. It begins from
-- 
2.17.2 (Apple Git-113)
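
For illustration, a minimal SK_REUSEPORT program might branch on these
constants roughly as below. This is a sketch, not part of the patch: the
"migration" context field and the section name come from later patches in
the series and may differ in the final version.

---8<---
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_reuseport/migrate")
int select_or_migrate(struct sk_reuseport_md *reuse_md)
{
	switch (reuse_md->migration) {
	case BPF_SK_REUSEPORT_MIGRATE_NO:
		/* Ordinary SYN: let the kernel pick a listener by hash. */
		return SK_PASS;
	case BPF_SK_REUSEPORT_MIGRATE_QUEUE:
	case BPF_SK_REUSEPORT_MIGRATE_REQUEST:
		/* Migration: select a listener with bpf_sk_select_reuseport()
		 * here, or return SK_DROP to cancel the migration.
		 */
		return SK_PASS;
	default:
		return SK_PASS;
	}
}

char _license[] SEC("license") = "GPL";
---8<---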



[PATCH v2 bpf-next 01/13] tcp: Allow TCP_CLOSE sockets to hold the reuseport group.

2020-12-07 Thread Kuniyuki Iwashima
This patch is a preparation patch to migrate incoming connections in the
later commits and adds a field (num_closed_socks) to struct sock_reuseport
so that TCP_CLOSE sockets can still access the reuseport group.

When we close a listening socket, to migrate its connections to another
listener in the same reuseport group, we have to handle two kinds of child
sockets: those the listening socket holds a reference to, and those it does
not.

The former are the TCP_ESTABLISHED/TCP_SYN_RECV sockets, which sit in the
accept queue of their listening socket, so we can pop them out and push
them into another listener's queue at close() or shutdown(). The latter,
the TCP_NEW_SYN_RECV sockets, are still in the middle of the three-way
handshake and not in the accept queue, so we cannot reach them at close()
or shutdown(). Accordingly, we have to migrate these immature sockets after
their listening socket has been closed.

Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
sockets are freed when the final ACK is received or a SYN+ACK is
retransmitted. At that time, if we could select a new listener from the
same reuseport group, no connection would be aborted. However, this is
impossible because reuseport_detach_sock() sets sk_reuseport_cb to NULL and
forbids closed sockets from accessing the reuseport group.

This patch allows TCP_CLOSE sockets to hold sk_reuseport_cb while any child
sockets still reference them. The point is that reuseport_detach_sock() is
called twice, from inet_unhash() and sk_destruct(). The first call
decrements num_socks and increments num_closed_socks. Later, when all
migrated connections have been accepted, the second call decrements
num_closed_socks and sets sk_reuseport_cb to NULL.

By this change, closed sockets can keep sk_reuseport_cb until all child
requests have been freed or accepted. Consequently, calling listen() after
shutdown() can cause EADDRINUSE or EBUSY in reuseport_add_sock() or
inet_csk_bind_conflict(), which expect that such sockets should not have a
reuseport group. Therefore, this patch also loosens those validation rules
so that the socket can listen again if it has the same reuseport group as
the other listening sockets.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/sock_reuseport.h|  5 +++--
 net/core/sock_reuseport.c   | 39 +++--
 net/ipv4/inet_connection_sock.c |  7 --
 3 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 505f1e18e9bf..0e558ca7afbf 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
 struct sock_reuseport {
struct rcu_head rcu;
 
-   u16 max_socks;  /* length of socks */
-   u16 num_socks;  /* elements in socks */
+   u16 max_socks;  /* length of socks */
+   u16 num_socks;  /* elements in socks */
+	u16			num_closed_socks;	/* closed elements in socks */
/* The last synq overflow event timestamp of this
 * reuse->socks[] group.
 */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index bbdd3c7b6cb5..c26f4256ff41 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -98,14 +98,15 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
return NULL;
 
more_reuse->num_socks = reuse->num_socks;
+   more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
more_reuse->has_conns = reuse->has_conns;
+   more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
 
memcpy(more_reuse->socks, reuse->socks,
   reuse->num_socks * sizeof(struct sock *));
-   more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
 
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
@@ -152,8 +153,10 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
 	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
 					  lockdep_is_held(&reuseport_lock));
 	old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
-					  lockdep_is_held(&reuseport_lock));
-	if (old_reuse && old_reuse->num_socks != 1) {
+					      lockdep_is_held(&reuseport_lock));
+   if (old_reuse == reuse) {
+   reuse->num_closed_socks--;
+   } else if (old_reuse && old_reuse->num_socks !
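
To make the listen()-after-shutdown() scenario in the commit message above
concrete, the calling pattern the relaxed validation is meant to keep working
looks roughly like this (an illustrative userspace sketch, not part of the
patch; error handling omitted):

---8<---
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int restart_listener(unsigned short port)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
		.sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
	};
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, 128);

	/* Temporarily stop accepting; children migrate to other listeners
	 * in the same reuseport group.
	 */
	shutdown(fd, SHUT_RDWR);

	/* Listen again on the same socket.  With closed sockets now keeping
	 * sk_reuseport_cb, this second listen() could fail with EADDRINUSE
	 * or EBUSY unless the checks in reuseport_add_sock() and
	 * inet_csk_bind_conflict() are relaxed, which this patch does.
	 */
	return listen(fd, 128);
}
---8<---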

[PATCH v2 bpf-next 00/13] Socket migration for SO_REUSEPORT.

2020-12-07 Thread Kuniyuki Iwashima
The SO_REUSEPORT option allows sockets to listen on the same port and to
accept connections evenly. However, there is a defect in the current
implementation[1]. When a SYN packet is received, the connection is tied to
a listening socket. Accordingly, when the listener is closed, in-flight
requests during the three-way handshake and child sockets in the accept
queue are dropped even if other listeners on the same port could accept
such connections.

This situation can happen when various server management tools restart
server processes (such as nginx). For instance, when we change the nginx
configuration and restart it, it spins up new workers that respect the new
configuration and closes all listeners on the old workers, so the in-flight
ACKs of the 3WHS are answered with RST.

The SO_REUSEPORT option is excellent for improving scalability. On the
other hand, as a trade-off, users have to know in detail how the kernel
handles SYN packets and implement connection draining with eBPF[2]:

  1. Stop routing SYN packets to the listener by eBPF.
  2. Wait for all timers to expire so that in-flight requests complete.
  3. Accept connections until EAGAIN, then close the listener
     (a sketch of this step appears below).
  
or

  1. Start counting SYN packets and accept syscalls using eBPF map.
  2. Stop routing SYN packets.
  3. Accept connections up to the count, then close the listener.

Either way, we cannot close a listener immediately. Ideally, however, the
application should not have to drain the not-yet-accepted sockets, because
the 3WHS and tying a connection to a listener are purely kernel behaviour.
The root cause is within the kernel, so the issue should be addressed in
kernel space and should not be visible to user space. This patchset fixes
it so that users need not care about the kernel implementation or
connection draining. With this patchset, the kernel redistributes requests
and connections from a listener to the others in the same reuseport group at/after
close() or shutdown() syscalls.
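
For reference, the "accept connections until EAGAIN, then close the listener"
step from the first workaround above might look roughly like this in
userspace (a sketch assuming the listener has already been made non-blocking;
not part of the patchset):

---8<---
#include <errno.h>
#include <sys/socket.h>
#include <unistd.h>

/* Drain whatever is already in the accept queue, then close the listener.
 * Connections are closed here only to keep the sketch self-contained; a
 * real server would hand them to workers instead.
 */
static void drain_and_close(int lfd)
{
	for (;;) {
		int cfd = accept(lfd, NULL, NULL);

		if (cfd < 0) {
			if (errno == EINTR)
				continue;
			break;	/* EAGAIN/EWOULDBLOCK (empty) or fatal error */
		}
		close(cfd);
	}
	close(lfd);
}
---8<---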

Although some software does connection draining, there are still merits in
migration. For some security reasons such as replacing TLS certificates, we
may want to apply new settings as soon as possible and/or we may not be
able to wait for connection draining. The sockets in the accept queue have
not started application sessions yet. So, if we do not drain such sockets,
they can be handled by the newer listeners and could have a longer
lifetime. It is difficult to drain all connections in every case, but we
can decrease such aborted connections by migration. In that sense,
migration is always better than draining. 

Moreover, auto-migration simplifies userspace logic and also works well in
a case where we cannot modify and build a server program to implement the
workaround.

Note that the source and destination listeners MUST have the same settings
at the socket API level; otherwise, applications may face inconsistency and
cause errors. In such a case, we have to use eBPF program to select a
specific listener or to cancel migration.


Link:
 [1] The SO_REUSEPORT socket option
 https://lwn.net/Articles/542629/

 [2] Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain 
mode
 
https://lore.kernel.org/netdev/1458828813.10868.65.ca...@edumazet-glaptop3.roam.corp.google.com/


Changelog:
 v2:
  * Do not save closed sockets in socks[]
  * Revert 607904c357c61adf20b8fd18af765e501d61a385
  * Extract inet_csk_reqsk_queue_migrate() into a single patch
  * Change the spin_lock order to avoid lockdep warning
  * Add static to __reuseport_select_sock
  * Use refcount_inc_not_zero() in reuseport_select_migrated_sock()
  * Set the default attach type in bpf_prog_load_check_attach()
  * Define new proto of BPF_FUNC_get_socket_cookie
  * Fix test to be compiled successfully
  * Update commit messages

 v1:
 https://lore.kernel.org/netdev/20201201144418.35045-1-kun...@amazon.co.jp/
  * Remove the sysctl option
  * Enable migration if an eBPF program is not attached
  * Add expected_attach_type to check if eBPF program can migrate sockets
  * Add a field to tell migration type to eBPF program
  * Support BPF_FUNC_get_socket_cookie to get the cookie of sk
  * Allocate an empty skb if skb is NULL
  * Pass req_to_sk(req)->sk_hash because listener's hash is zero
  * Update commit messages and coverletter

 RFC:
 https://lore.kernel.org/netdev/20201117094023.3685-1-kun...@amazon.co.jp/


Kuniyuki Iwashima (13):
  tcp: Allow TCP_CLOSE sockets to hold the reuseport group.
  bpf: Define migration types for SO_REUSEPORT.
  Revert "locking/spinlocks: Remove the unused spin_lock_bh_nested()
API"
  tcp: Introduce inet_csk_reqsk_queue_migrate().
  tcp: Set the new listener to migrated TFO requests.
  tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.
  tcp: Migrate TCP_NEW_SYN_RECV requests.
  bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.
  libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT.
  bpf: Add migration to sk_reuseport_(kern|md).
 

Re: [PATCH v1 bpf-next 11/11] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

2020-12-05 Thread Kuniyuki Iwashima
I'm sending this mail just for logging because I failed to send mails only 
to LKML, netdev, and bpf yesterday.


From:   Martin KaFai Lau 
Date:   Fri, 4 Dec 2020 17:50:00 -0800
> On Tue, Dec 01, 2020 at 11:44:18PM +0900, Kuniyuki Iwashima wrote:
> > This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> > 
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  .../bpf/prog_tests/migrate_reuseport.c| 164 ++
> >  .../bpf/progs/test_migrate_reuseport_kern.c   |  54 ++
> >  2 files changed, 218 insertions(+)
> >  create mode 100644 
> > tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
> >  create mode 100644 
> > tools/testing/selftests/bpf/progs/test_migrate_reuseport_kern.c
> > 
> > diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c 
> > b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
> > new file mode 100644
> > index ..87c72d9ccadd
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
> > @@ -0,0 +1,164 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Check if we can migrate child sockets.
> > + *
> > + *   1. call listen() for 5 server sockets.
> > + *   2. update a map to migrate all child socket
> > + *to the last server socket (migrate_map[cookie] = 4)
> > + *   3. call connect() for 25 client sockets.
> > + *   4. call close() for first 4 server sockets.
> > + *   5. call accept() for the last server socket.
> > + *
> > + * Author: Kuniyuki Iwashima 
> > + */
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#define NUM_SOCKS 5
> > +#define LOCALHOST "127.0.0.1"
> > +#define err_exit(condition, message) \
> > +   do {  \
> > +   if (condition) {  \
> > +   perror("ERROR: " message " ");\
> > +   exit(1);  \
> > +   } \
> > +   } while (0)
> > +
> > +__u64 server_fds[NUM_SOCKS];
> > +int prog_fd, reuseport_map_fd, migrate_map_fd;
> > +
> > +
> > +void setup_bpf(void)
> > +{
> > +   struct bpf_object *obj;
> > +   struct bpf_program *prog;
> > +   struct bpf_map *reuseport_map, *migrate_map;
> > +   int err;
> > +
> > +   obj = bpf_object__open("test_migrate_reuseport_kern.o");
> > +   err_exit(libbpf_get_error(obj), "opening BPF object file failed");
> > +
> > +   err = bpf_object__load(obj);
> > +   err_exit(err, "loading BPF object failed");
> > +
> > +   prog = bpf_program__next(NULL, obj);
> > +   err_exit(!prog, "loading BPF program failed");
> > +
> > +   reuseport_map = bpf_object__find_map_by_name(obj, "reuseport_map");
> > +   err_exit(!reuseport_map, "loading BPF reuseport_map failed");
> > +
> > +   migrate_map = bpf_object__find_map_by_name(obj, "migrate_map");
> > +   err_exit(!migrate_map, "loading BPF migrate_map failed");
> > +
> > +   prog_fd = bpf_program__fd(prog);
> > +   reuseport_map_fd = bpf_map__fd(reuseport_map);
> > +   migrate_map_fd = bpf_map__fd(migrate_map);
> > +}
> > +
> > +void test_listen(void)
> > +{
> > +   struct sockaddr_in addr;
> > +   socklen_t addr_len = sizeof(addr);
> > +   int i, err, optval = 1, migrated_to = NUM_SOCKS - 1;
> > +   __u64 value;
> > +
> > +	addr.sin_family = AF_INET;
> > +	addr.sin_port = htons(80);
> > +	inet_pton(AF_INET, LOCALHOST, &addr.sin_addr.s_addr);
> > +
> > +	for (i = 0; i < NUM_SOCKS; i++) {
> > +		server_fds[i] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> > +		err_exit(server_fds[i] == -1, "socket() for listener sockets failed");
> > +
> > +		err = setsockopt(server_fds[i], SOL_SOCKET, SO_REUSEPORT,
> > +				 &optval, sizeof(optval));
> > +		err_exit(err == -1, "setsockopt() for SO_REUSEPORT failed");
> > +
> > +   if (i == 0) {
> > +   err = setsockopt(server_fds[i], SOL_SOCKET, 
> > SO_ATTACH_REUSEPORT_EBPF,
> > +   

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-05 Thread Kuniyuki Iwashima
I'm sending this mail just for logging because I failed to send mails only 
to LKML, netdev, and bpf yesterday.


From:   Martin KaFai Lau 
Date:   Fri, 4 Dec 2020 17:42:41 -0800
> On Tue, Dec 01, 2020 at 11:44:10PM +0900, Kuniyuki Iwashima wrote:
> [ ... ]
> > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > index fd133516ac0e..60d7c1f28809 100644
> > --- a/net/core/sock_reuseport.c
> > +++ b/net/core/sock_reuseport.c
> > @@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
> >  }
> >  EXPORT_SYMBOL(reuseport_add_sock);
> >  
> > -void reuseport_detach_sock(struct sock *sk)
> > +struct sock *reuseport_detach_sock(struct sock *sk)
> >  {
> > struct sock_reuseport *reuse;
> > +   struct bpf_prog *prog;
> > +   struct sock *nsk = NULL;
> > int i;
> >  
> > spin_lock_bh(&reuseport_lock);
> > @@ -242,8 +244,12 @@ void reuseport_detach_sock(struct sock *sk)
> >  
> > reuse->num_socks--;
> > reuse->socks[i] = reuse->socks[reuse->num_socks];
> > +   prog = rcu_dereference(reuse->prog);
> Is it under rcu_read_lock() here?

reuseport_lock is locked in this function, and we do not modify the prog,
but is rcu_dereference_protected() preferable?

---8<---
prog = rcu_dereference_protected(reuse->prog,
 lockdep_is_held(_lock));
---8<---


> > if (sk->sk_protocol == IPPROTO_TCP) {
> > +   if (reuse->num_socks && !prog)
> > +   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
> > +
> > reuse->num_closed_socks++;
> > reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
> > } else {
> > @@ -264,6 +270,8 @@ void reuseport_detach_sock(struct sock *sk)
> > call_rcu(&reuse->rcu, reuseport_free_rcu);
> >  out:
> > spin_unlock_bh(&reuseport_lock);
> > +
> > +   return nsk;
> >  }
> >  EXPORT_SYMBOL(reuseport_detach_sock);
> >  
> > diff --git a/net/ipv4/inet_connection_sock.c 
> > b/net/ipv4/inet_connection_sock.c
> > index 1451aa9712b0..b27241ea96bd 100644
> > --- a/net/ipv4/inet_connection_sock.c
> > +++ b/net/ipv4/inet_connection_sock.c
> > @@ -992,6 +992,36 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
> >  }
> >  EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
> >  
> > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
> > +{
> > +   struct request_sock_queue *old_accept_queue, *new_accept_queue;
> > +
> > +   old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
> > +   new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
> > +
> > +   spin_lock(&old_accept_queue->rskq_lock);
> > +   spin_lock(&new_accept_queue->rskq_lock);
> I am also not very thrilled on this double spin_lock.
> Can this be done in (or like) inet_csk_listen_stop() instead?

It will be possible to migrate sockets in inet_csk_listen_stop(), but I
think it is better to do it just after reuseport_detach_sock() because we
can select a different listener (almost) every time at a lower cost, by
selecting the moved socket and passing it to inet_csk_reqsk_queue_migrate()
easily.

sk_hash of the listener is 0, so we would have to generate a random number
in inet_csk_listen_stop().


Re: [PATCH v1 bpf-next 01/11] tcp: Keep TCP_CLOSE sockets in the reuseport group.

2020-12-05 Thread Kuniyuki Iwashima
I'm sending this mail just for logging because I failed to send mails only 
to LKML, netdev, and bpf yesterday.


From:   Martin KaFai Lau 
Date:   Fri, 4 Dec 2020 17:31:03 -0800
> On Tue, Dec 01, 2020 at 11:44:08PM +0900, Kuniyuki Iwashima wrote:
> > This patch is a preparation patch to migrate incoming connections in the
> > later commits and adds a field (num_closed_socks) to the struct
> > sock_reuseport to keep TCP_CLOSE sockets in the reuseport group.
> > 
> > When we close a listening socket, to migrate its connections to another
> > listener in the same reuseport group, we have to handle two kinds of child
> > sockets. One is that a listening socket has a reference to, and the other
> > is not.
> > 
> > The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the
> > accept queue of their listening socket. So, we can pop them out and push
> > them into another listener's queue at close() or shutdown() syscalls. On
> > the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the
> > three-way handshake and not in the accept queue. Thus, we cannot access
> > such sockets at close() or shutdown() syscalls. Accordingly, we have to
> > migrate immature sockets after their listening socket has been closed.
> > 
> > Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
> > sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At
> > that time, if we could select a new listener from the same reuseport group,
> > no connection would be aborted. However, it is impossible because
> > reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to
> > the reuseport group from closed sockets.
> > 
> > This patch allows TCP_CLOSE sockets to remain in the reuseport group and to
> > have access to it while any child socket references to them. The point is
> > that reuseport_detach_sock() is called twice from inet_unhash() and
> > sk_destruct(). At first, it moves the socket backwards in socks[] and
> > increments num_closed_socks. Later, when all migrated connections are
> > accepted, it removes the socket from socks[], decrements num_closed_socks,
> > and sets NULL to sk_reuseport_cb.
> > 
> > By this change, closed sockets can keep sk_reuseport_cb until all child
> > requests have been freed or accepted. Consequently calling listen() after
> > shutdown() can cause EADDRINUSE or EBUSY in reuseport_add_sock() or
> > inet_csk_bind_conflict() which expect that such sockets should not have the
> > reuseport group. Therefore, this patch also loosens such validation rules
> > so that the socket can listen again if it has the same reuseport group with
> > other listening sockets.
> > 
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/net/sock_reuseport.h|  5 ++-
> >  net/core/sock_reuseport.c   | 79 +++--
> >  net/ipv4/inet_connection_sock.c |  7 ++-
> >  3 files changed, 74 insertions(+), 17 deletions(-)
> > 
> > diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
> > index 505f1e18e9bf..0e558ca7afbf 100644
> > --- a/include/net/sock_reuseport.h
> > +++ b/include/net/sock_reuseport.h
> > @@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
> >  struct sock_reuseport {
> > struct rcu_head rcu;
> >  
> > -   u16 max_socks;  /* length of socks */
> > -   u16 num_socks;  /* elements in socks */
> > +   u16 max_socks;  /* length of socks */
> > +   u16 num_socks;  /* elements in socks */
> > +   u16 num_closed_socks;   /* closed elements in socks */
> > /* The last synq overflow event timestamp of this
> >  * reuse->socks[] group.
> >  */
> > diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
> > index bbdd3c7b6cb5..fd133516ac0e 100644
> > --- a/net/core/sock_reuseport.c
> > +++ b/net/core/sock_reuseport.c
> > @@ -98,16 +98,21 @@ static struct sock_reuseport *reuseport_grow(struct 
> > sock_reuseport *reuse)
> > return NULL;
> >  
> > more_reuse->num_socks = reuse->num_socks;
> > +   more_reuse->num_closed_socks = reuse->num_closed_socks;
> > more_reuse->prog = reuse->prog;
> > more_reuse->reuseport_id = reuse->reuseport_id;
> > more_reuse->bind_inany = reuse->bind_inany;
> > more_reuse->has_conns = reuse->has_conns;
> > +   mo

Re: [PATCH v1 bpf-next 09/11] bpf: Support bpf_get_socket_cookie_sock() for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-05 Thread Kuniyuki Iwashima
I'm sending this mail just for logging because I failed to send mails only 
to LKML, netdev, and bpf yesterday.


From:   Martin KaFai Lau 
Date:   Fri, 4 Dec 2020 11:58:07 -0800
> On Tue, Dec 01, 2020 at 11:44:16PM +0900, Kuniyuki Iwashima wrote:
> > We will call sock_reuseport.prog for socket migration in the next commit,
> > so the eBPF program has to know which listener is closing in order to
> > select the new listener.
> > 
> > Currently, we can get a unique ID for each listener in the userspace by
> > calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
> > 
> > This patch makes the sk pointer available in sk_reuseport_md so that we can
> > get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program.
> > 
> > Link: 
> > https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f...@kafai-mbp.dhcp.thefacebook.com/
> > Suggested-by: Martin KaFai Lau 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/uapi/linux/bpf.h   |  8 
> >  net/core/filter.c  | 12 +++-
> >  tools/include/uapi/linux/bpf.h |  8 
> >  3 files changed, 27 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index efe342bf3dbc..3e9b8bd42b4e 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -1650,6 +1650,13 @@ union bpf_attr {
> >   * A 8-byte long non-decreasing number on success, or 0 if 
> > the
> >   * socket field is missing inside *skb*.
> >   *
> > + * u64 bpf_get_socket_cookie(struct bpf_sock *sk)
> > + * Description
> > + * Equivalent to bpf_get_socket_cookie() helper that 
> > accepts
> > + * *skb*, but gets socket from **struct bpf_sock** context.
> > + * Return
> > + * A 8-byte long non-decreasing number.
> > + *
> >   * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
> >   * Description
> >   * Equivalent to bpf_get_socket_cookie() helper that 
> > accepts
> > @@ -4420,6 +4427,7 @@ struct sk_reuseport_md {
> > __u32 bind_inany;   /* Is sock bound to an INANY address? */
> > __u32 hash; /* A hash of the packet 4 tuples */
> > __u8 migration; /* Migration type */
> > +   __bpf_md_ptr(struct bpf_sock *, sk); /* current listening socket */
> >  };
> >  
> >  #define BPF_TAG_SIZE   8
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 0a0634787bb4..1059d31847ef 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -4628,7 +4628,7 @@ static const struct bpf_func_proto 
> > bpf_get_socket_cookie_sock_proto = {
> > .func   = bpf_get_socket_cookie_sock,
> > .gpl_only   = false,
> > .ret_type   = RET_INTEGER,
> > -   .arg1_type  = ARG_PTR_TO_CTX,
> > +   .arg1_type  = ARG_PTR_TO_SOCKET,
> This will break existing bpf prog (BPF_PROG_TYPE_CGROUP_SOCK)
> using this proto.  A new proto is needed and there is
> an on-going patch doing this [0].
> 
> [0]: https://lore.kernel.org/bpf/20201203213330.1657666-1-rev...@google.com/

Thank you for notifying me of this patch!
I will define another proto, but may drop this part if the above patch has
already been merged by then.


Re: [PATCH v1 bpf-next 06/11] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-05 Thread Kuniyuki Iwashima
I'm sending this mail just for logging because I failed to send mails only
to LKML, netdev, and bpf yesterday.


From:   Martin KaFai Lau 
Date:   Thu, 3 Dec 2020 21:56:53 -0800
> On Thu, Dec 03, 2020 at 11:16:08PM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Wed, 2 Dec 2020 20:24:02 -0800
> > > On Wed, Dec 02, 2020 at 11:19:02AM -0800, Martin KaFai Lau wrote:
> > > > On Tue, Dec 01, 2020 at 06:04:50PM -0800, Andrii Nakryiko wrote:
> > > > > On Tue, Dec 1, 2020 at 6:49 AM Kuniyuki Iwashima 
> > > > >  wrote:
> > > > > >
> > > > > > This commit adds new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT 
> > > > > > to
> > > > > > check if the attached eBPF program is capable of migrating sockets.
> > > > > >
> > > > > > When the eBPF program is attached, the kernel runs it for socket 
> > > > > > migration
> > > > > > only if the expected_attach_type is 
> > > > > > BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> > > > > > The kernel will change the behaviour depending on the returned 
> > > > > > value:
> > > > > >
> > > > > >   - SK_PASS with selected_sk, select it as a new listener
> > > > > >   - SK_PASS with selected_sk NULL, fall back to the random selection
> > > > > >   - SK_DROP, cancel the migration
> > > > > >
> > > > > > Link: 
> > > > > > https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
> > > > > > Suggested-by: Martin KaFai Lau 
> > > > > > Signed-off-by: Kuniyuki Iwashima 
> > > > > > ---
> > > > > >  include/uapi/linux/bpf.h   | 2 ++
> > > > > >  kernel/bpf/syscall.c   | 8 
> > > > > >  tools/include/uapi/linux/bpf.h | 2 ++
> > > > > >  3 files changed, 12 insertions(+)
> > > > > >
> > > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > > > index 85278deff439..cfc207ae7782 100644
> > > > > > --- a/include/uapi/linux/bpf.h
> > > > > > +++ b/include/uapi/linux/bpf.h
> > > > > > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > > > > > BPF_XDP_CPUMAP,
> > > > > > BPF_SK_LOOKUP,
> > > > > > BPF_XDP,
> > > > > > +   BPF_SK_REUSEPORT_SELECT,
> > > > > > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > > > > > __MAX_BPF_ATTACH_TYPE
> > > > > >  };
> > > > > >
> > > > > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > > > > index f3fe9f53f93c..a0796a8de5ea 100644
> > > > > > --- a/kernel/bpf/syscall.c
> > > > > > +++ b/kernel/bpf/syscall.c
> > > > > > @@ -2036,6 +2036,14 @@ bpf_prog_load_check_attach(enum 
> > > > > > bpf_prog_type prog_type,
> > > > > > if (expected_attach_type == BPF_SK_LOOKUP)
> > > > > > return 0;
> > > > > > return -EINVAL;
> > > > > > +   case BPF_PROG_TYPE_SK_REUSEPORT:
> > > > > > +   switch (expected_attach_type) {
> > > > > > +   case BPF_SK_REUSEPORT_SELECT:
> > > > > > +   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
> > > > > > +   return 0;
> > > > > > +   default:
> > > > > > +   return -EINVAL;
> > > > > > +   }
> > > > > 
> > > > > this is a kernel regression, previously expected_attach_type wasn't
> > > > > enforced, so user-space could have provided any number without an
> > > > > error.
> > > > I also think this change alone will break things like when the usual
> > > > attr->expected_attach_type == 0 case.  At least changes is needed in
> > > > bpf_prog_load_fixup_attach_type() which is also handling a
> > > > similar situation for BPF_PROG_TYPE_CGROUP_SOCK.
> > > > 
> > > > I now think there is no need to expose new bpf_attach_type to the UAPI.
> > > > Since the prog->expected_attach_type is not used, it can be cleared at 
> > > > load 

Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-03 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Thu, 3 Dec 2020 15:31:53 +0100
> On Thu, Dec 3, 2020 at 3:14 PM Kuniyuki Iwashima  wrote:
> >
> > From:   Eric Dumazet 
> > Date:   Tue, 1 Dec 2020 16:25:51 +0100
> > > On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > > > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > > > which is used only by inet_unhash(). If it is not NULL,
> > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > sockets from the closing listener to the selected one.
> > > >
> > > > Listening sockets hold incoming connections as a linked list of struct
> > > > request_sock in the accept queue, and each request has reference to a 
> > > > full
> > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we only 
> > > > unlink
> > > > the requests from the closing listener's queue and relink them to the 
> > > > head
> > > > of the new listener's queue. We do not process each request and its
> > > > reference to the listener, so the migration completes in O(1) time
> > > > complexity. However, in the case of TCP_SYN_RECV sockets, we take 
> > > > special
> > > > care in the next commit.
> > > >
> > > > By default, the kernel selects a new listener randomly. In order to pick
> > > > out a different socket every time, we select the last element of 
> > > > socks[] as
> > > > the new listener. This behaviour is based on how the kernel moves 
> > > > sockets
> > > > in socks[]. (See also [1])
> > > >
> > > > Basically, in order to redistribute sockets evenly, we have to use an 
> > > > eBPF
> > > > program called in the later commit, but as the side effect of such 
> > > > default
> > > > selection, the kernel can redistribute old requests evenly to new 
> > > > listeners
> > > > for a specific case where the application replaces listeners by
> > > > generations.
> > > >
> > > > For example, we call listen() for four sockets (A, B, C, D), and close 
> > > > the
> > > > first two by turns. The sockets move in socks[] like below.
> > > >
> > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > >   socks[2] : C   |  socks[2] : C --'
> > > >   socks[3] : D --'
> > > >
> > > > Then, if C and D have newer settings than A and B, and each socket has a
> > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > requests evenly to new listeners.
> > > >
> > > >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + 
> > > > d)
> > > >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + 
> > > > c)
> > > >   socks[2] : C (c)   |  socks[2] : C (c) --'
> > > >   socks[3] : D (d) --'
> > > >
> > > > Here, (A, D) or (B, C) can have different application settings, but they
> > > > MUST have the same settings at the socket API level; otherwise, 
> > > > unexpected
> > > > error may happen. For instance, if only the new listeners have
> > > > TCP_SAVE_SYN, old requests do not have SYN data, so the application will
> > > > face inconsistency and cause an error.
> > > >
> > > > Therefore, if there are different kinds of sockets, we must attach an 
> > > > eBPF
> > > > program described in later commits.
> > > >
> > > > Link: 
> > > > https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
> > > > Reviewed-by: Benjamin Herrenschmidt 
> > > > Signed-off-by: Kuniyuki Iwashima 
> > > > ---
> > > >  include/net/inet_connection_sock.h |  1 +
> > > >  include/net/sock_reuseport.h   |  2 +-
> > > >  net/core/sock_reuseport.c  | 10 +-
> > > >  net/ipv4/inet_connection_sock.c| 30 ++
> > > >  net/ipv4/inet_hashtables.c |  9 +++--
> > > >  5 files changed, 48 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/include/net/inet_connection_sock.h 
> > > > b/include/net/inet_connection_sock.h
> > > > index 7338b3865a2a.

Re: [PATCH v1 bpf-next 06/11] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-03 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Wed, 2 Dec 2020 20:24:02 -0800
> On Wed, Dec 02, 2020 at 11:19:02AM -0800, Martin KaFai Lau wrote:
> > On Tue, Dec 01, 2020 at 06:04:50PM -0800, Andrii Nakryiko wrote:
> > > On Tue, Dec 1, 2020 at 6:49 AM Kuniyuki Iwashima  
> > > wrote:
> > > >
> > > > This commit adds new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT to
> > > > check if the attached eBPF program is capable of migrating sockets.
> > > >
> > > > When the eBPF program is attached, the kernel runs it for socket 
> > > > migration
> > > > only if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
> > > > The kernel will change the behaviour depending on the returned value:
> > > >
> > > >   - SK_PASS with selected_sk, select it as a new listener
> > > >   - SK_PASS with selected_sk NULL, fall back to the random selection
> > > >   - SK_DROP, cancel the migration
> > > >
> > > > Link: 
> > > > https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
> > > > Suggested-by: Martin KaFai Lau 
> > > > Signed-off-by: Kuniyuki Iwashima 
> > > > ---
> > > >  include/uapi/linux/bpf.h   | 2 ++
> > > >  kernel/bpf/syscall.c   | 8 
> > > >  tools/include/uapi/linux/bpf.h | 2 ++
> > > >  3 files changed, 12 insertions(+)
> > > >
> > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > index 85278deff439..cfc207ae7782 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > > > BPF_XDP_CPUMAP,
> > > > BPF_SK_LOOKUP,
> > > > BPF_XDP,
> > > > +   BPF_SK_REUSEPORT_SELECT,
> > > > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > > > __MAX_BPF_ATTACH_TYPE
> > > >  };
> > > >
> > > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > > index f3fe9f53f93c..a0796a8de5ea 100644
> > > > --- a/kernel/bpf/syscall.c
> > > > +++ b/kernel/bpf/syscall.c
> > > > @@ -2036,6 +2036,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type 
> > > > prog_type,
> > > > if (expected_attach_type == BPF_SK_LOOKUP)
> > > > return 0;
> > > > return -EINVAL;
> > > > +   case BPF_PROG_TYPE_SK_REUSEPORT:
> > > > +   switch (expected_attach_type) {
> > > > +   case BPF_SK_REUSEPORT_SELECT:
> > > > +   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
> > > > +   return 0;
> > > > +   default:
> > > > +   return -EINVAL;
> > > > +   }
> > > 
> > > this is a kernel regression, previously expected_attach_type wasn't
> > > enforced, so user-space could have provided any number without an
> > > error.
> > I also think this change alone will break things like when the usual
> > attr->expected_attach_type == 0 case.  At least changes is needed in
> > bpf_prog_load_fixup_attach_type() which is also handling a
> > similar situation for BPF_PROG_TYPE_CGROUP_SOCK.
> > 
> > I now think there is no need to expose new bpf_attach_type to the UAPI.
> > Since the prog->expected_attach_type is not used, it can be cleared at load 
> > time
> > and then only set to BPF_SK_REUSEPORT_SELECT_OR_MIGRATE (probably defined
> > internally at filter.[c|h]) in the is_valid_access() when "migration"
> > is accessed.  When "migration" is accessed, the bpf prog can handle
> > migration (and the original not-migration) case.
> Scrap this internal only BPF_SK_REUSEPORT_SELECT_OR_MIGRATE idea.
> I think there will be cases that bpf prog wants to do both
> without accessing any field from sk_reuseport_md.
> 
> Lets go back to the discussion on using a similar
> idea as BPF_PROG_TYPE_CGROUP_SOCK in bpf_prog_load_fixup_attach_type().
> I am not aware there is loader setting a random number
> in expected_attach_type, so the chance of breaking
> is very low.  There was a similar discussion earlier [0].
> 
> [0]: https://lore.kernel.org/netdev/20200126045443.f47dzxdglazzchfm@ast-mbp/

Thank you for the idea and reference.

I will remove the change in bpf_prog_load_check_attach() and set the
default value (BPF_SK_REUSEPORT_SELECT) in bpf_prog_load_fixup_attach_type()
for backward compatibility if expected_attach_type is 0.
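
The backward-compatible default mentioned above could mirror what
bpf_prog_load_fixup_attach_type() already does for BPF_PROG_TYPE_CGROUP_SOCK.
A rough sketch, showing only the new case and not the actual change:

---8<---
static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
{
	switch (attr->prog_type) {
	case BPF_PROG_TYPE_SK_REUSEPORT:
		/* Old userspace loads with expected_attach_type == 0;
		 * treat that as the plain "select" behaviour.
		 */
		if (!attr->expected_attach_type)
			attr->expected_attach_type =
				BPF_SK_REUSEPORT_SELECT;
		break;
	default:
		break;
	}
}
---8<---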


> > > > case BPF_PROG_TYPE_EXT:
> > > > if (expected_attach_type)
> > > > return -EINVAL;
> > > > diff --git a/tools/include/uapi/linux/bpf.h 
> > > > b/tools/include/uapi/linux/bpf.h
> > > > index 85278deff439..cfc207ae7782 100644
> > > > --- a/tools/include/uapi/linux/bpf.h
> > > > +++ b/tools/include/uapi/linux/bpf.h
> > > > @@ -241,6 +241,8 @@ enum bpf_attach_type {
> > > > BPF_XDP_CPUMAP,
> > > > BPF_SK_LOOKUP,
> > > > BPF_XDP,
> > > > +   BPF_SK_REUSEPORT_SELECT,
> > > > +   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> > > > __MAX_BPF_ATTACH_TYPE
> > > >  };
> > > >
> > > > --
> > > > 2.17.2 (Apple Git-113)


Re: [PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-03 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Tue, 1 Dec 2020 16:25:51 +0100
> On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > which is used only by inet_unhash(). If it is not NULL,
> > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > sockets from the closing listener to the selected one.
> > 
> > Listening sockets hold incoming connections as a linked list of struct
> > request_sock in the accept queue, and each request has reference to a full
> > socket and its listener. In inet_csk_reqsk_queue_migrate(), we only unlink
> > the requests from the closing listener's queue and relink them to the head
> > of the new listener's queue. We do not process each request and its
> > reference to the listener, so the migration completes in O(1) time
> > complexity. However, in the case of TCP_SYN_RECV sockets, we take special
> > care in the next commit.
> > 
> > By default, the kernel selects a new listener randomly. In order to pick
> > out a different socket every time, we select the last element of socks[] as
> > the new listener. This behaviour is based on how the kernel moves sockets
> > in socks[]. (See also [1])
> > 
> > Basically, in order to redistribute sockets evenly, we have to use an eBPF
> > program called in the later commit, but as the side effect of such default
> > selection, the kernel can redistribute old requests evenly to new listeners
> > for a specific case where the application replaces listeners by
> > generations.
> > 
> > For example, we call listen() for four sockets (A, B, C, D), and close the
> > first two by turns. The sockets move in socks[] like below.
> > 
> >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> >   socks[2] : C   |  socks[2] : C --'
> >   socks[3] : D --'
> > 
> > Then, if C and D have newer settings than A and B, and each socket has a
> > request (a, b, c, d) in their accept queue, we can redistribute old
> > requests evenly to new listeners.
> > 
> >   socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
> >   socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
> >   socks[2] : C (c)   |  socks[2] : C (c) --'
> >   socks[3] : D (d) --'
> > 
> > Here, (A, D) or (B, C) can have different application settings, but they
> > MUST have the same settings at the socket API level; otherwise, unexpected
> > error may happen. For instance, if only the new listeners have
> > TCP_SAVE_SYN, old requests do not have SYN data, so the application will
> > face inconsistency and cause an error.
> > 
> > Therefore, if there are different kinds of sockets, we must attach an eBPF
> > program described in later commits.
> > 
> > Link: 
> > https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/net/inet_connection_sock.h |  1 +
> >  include/net/sock_reuseport.h   |  2 +-
> >  net/core/sock_reuseport.c  | 10 +-
> >  net/ipv4/inet_connection_sock.c| 30 ++
> >  net/ipv4/inet_hashtables.c |  9 +++--
> >  5 files changed, 48 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/net/inet_connection_sock.h 
> > b/include/net/inet_connection_sock.h
> > index 7338b3865a2a..2ea2d743f8fc 100644
> > --- a/include/net/inet_connection_sock.h
> > +++ b/include/net/inet_connection_sock.h
> > @@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const 
> > struct sock *sk,
> >  struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
> >   struct request_sock *req,
> >   struct sock *child);
> > +void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
> >  void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock 
> > *req,
> >unsigned long timeout);
> >  struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock 
> > *child,
> > diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
> > index 0e558ca7afbf..09a1b1539d4c 100644
> > --- a/include/net/sock_reuseport.h
> > +++ b/include/net/sock_reuseport.h
> > @@ -31,7 +31,7 @@ struct sock_reuseport {
> >  extern int reuseport_alloc(struct soc

Re: [PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-03 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Tue, 1 Dec 2020 16:13:39 +0100
> On 12/1/20 3:44 PM, Kuniyuki Iwashima wrote:
> > This patch renames reuseport_select_sock() to __reuseport_select_sock() and
> > adds two wrapper function of it to pass the migration type defined in the
> > previous commit.
> > 
> >   reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
> >   reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST
> > 
> > As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
> > requests at receiving the final ACK or sending a SYN+ACK. Therefore, this
> > patch also changes the code to call reuseport_select_migrated_sock() even
> > if the listening socket is TCP_CLOSE. If we can pick out a listening socket
> > from the reuseport group, we rewrite request_sock.rsk_listener and resume
> > processing the request.
> > 
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/net/inet_connection_sock.h | 12 +++
> >  include/net/request_sock.h | 13 
> >  include/net/sock_reuseport.h   |  8 +++
> >  net/core/sock_reuseport.c  | 34 --
> >  net/ipv4/inet_connection_sock.c| 13 ++--
> >  net/ipv4/tcp_ipv4.c|  9 ++--
> >  net/ipv6/tcp_ipv6.c|  9 ++--
> >  7 files changed, 81 insertions(+), 17 deletions(-)
> > 
> > diff --git a/include/net/inet_connection_sock.h 
> > b/include/net/inet_connection_sock.h
> > index 2ea2d743f8fc..1e0958f5eb21 100644
> > --- a/include/net/inet_connection_sock.h
> > +++ b/include/net/inet_connection_sock.h
> > @@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock *sk)
> > reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
> >  }
> >  
> > +static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
> > +struct sock *nsk,
> > +struct request_sock *req)
> > +{
> > +   reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
> > +&inet_csk(nsk)->icsk_accept_queue,
> > +req);
> > +   sock_put(sk);
> > +   sock_hold(nsk);
> 
> This looks racy to me. nsk refcount might be zero at this point.
> 
> If you think it can _not_ be zero, please add a big comment here,
> because this would mean something has been done before reaching this function,
> and this sock_hold() would be not needed in the first place.
> 
> There is a good reason reqsk_alloc() is using refcount_inc_not_zero().

Exactly, I will fix this in the next spin like below.
Thank you.

---8<---
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 1e0958f5eb21..d8c3be31e987 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -280,7 +280,6 @@ static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
 			     &inet_csk(nsk)->icsk_accept_queue,
 			     req);
 	sock_put(sk);
-	sock_hold(nsk);
 	req->rsk_listener = nsk;
 }
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 6b475897b496..4d07bddcf678 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -386,7 +386,14 @@ EXPORT_SYMBOL(reuseport_select_sock);
 struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
					     struct sk_buff *skb)
 {
-	return __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);
+	struct sock *nsk;
+
+	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);
+	if (IS_ERR_OR_NULL(nsk) ||
+	    unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
+		return NULL;
+
+	return nsk;
 }
 EXPORT_SYMBOL(reuseport_select_migrated_sock);
 
---8<---


> > +   req->rsk_listener = nsk;
> > +}
> > +
> 
> Honestly, this patch series looks quite complex, and finding a bug in the
> very first function I am looking at is not really a good sign...

I also think this issue is quite complex, but it might be easier to fix now
than when it was discussed in 2015 [1], thanks to some of your refactoring.

[1]: https://lore.kernel.org/netdev/1443313848-751-1-git-send-email-tolga.cey...@gmail.com/


[PATCH v1 bpf-next 10/11] bpf: Call bpf_run_sk_reuseport() for socket migration.

2020-12-01 Thread Kuniyuki Iwashima
This patch supports socket migration by eBPF. If the attach type is
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, we can select a new listener with
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

There are two noteworthy points. The first is that we select a listening
socket in reuseport_detach_sock() and __reuseport_select_sock(), but we do
not have a struct sk_buff when closing a listener or retransmitting a
SYN+ACK. However, some helper functions do not expect skb to be NULL (e.g.
skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer() in
BPF_FUNC_skb_load_bytes_relative()). So, we temporarily allocate an empty
skb before running the eBPF program. The second is that we do not have a
struct request_sock in the unhash path, and the sk_hash of the listener is
always zero. Thus, we pass zero as the hash to bpf_run_sk_reuseport().

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 net/core/filter.c  | 19 +++
 net/core/sock_reuseport.c  | 19 ++-
 net/ipv4/inet_hashtables.c |  2 +-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 1059d31847ef..2f2fb77cdb72 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9871,10 +9871,29 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 {
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
+   bool allocated = false;
+
+   if (migration) {
+   /* cancel migration for possibly incapable eBPF program */
+   if (prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)
+   return ERR_PTR(-ENOTSUPP);
+
+   if (!skb) {
+   allocated = true;
+   skb = alloc_skb(0, GFP_ATOMIC);
+   if (!skb)
+   return ERR_PTR(-ENOMEM);
+   }
+   } else if (!skb) {
+   return NULL; /* fall back to select by hash */
+   }
 
	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash, migration);
	action = BPF_PROG_RUN(prog, &reuse_kern);
 
+   if (allocated)
+   kfree_skb(skb);
+
if (action == SK_PASS)
return reuse_kern.selected_sk;
else
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 96d65b4c6974..6b475897b496 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -247,8 +247,15 @@ struct sock *reuseport_detach_sock(struct sock *sk)
 	prog = rcu_dereference(reuse->prog);
 
 	if (sk->sk_protocol == IPPROTO_TCP) {
-		if (reuse->num_socks && !prog)
-			nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+		if (reuse->num_socks) {
+			if (prog)
+				nsk = bpf_run_sk_reuseport(reuse, sk, prog, NULL, 0,
+							   BPF_SK_REUSEPORT_MIGRATE_QUEUE);
+
+			if (!nsk)
+				nsk = i == reuse->num_socks ?
+					reuse->socks[i - 1] : reuse->socks[i];
+		}
 
 		reuse->num_closed_socks++;
 		reuse->socks[reuse->max_socks - reuse->num_closed_socks] = sk;
@@ -342,15 +349,9 @@ struct sock *__reuseport_select_sock(struct sock *sk, u32 hash,
if (!prog)
goto select_by_hash;
 
-   if (migration)
-   goto out;
-
-   if (!skb)
-   goto select_by_hash;
-
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash, migration);
-   else
+   else if (!skb)
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
 
 select_by_hash:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 545538a6bfac..59f58740c20d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -699,7 +699,7 @@ void inet_unhash(struct sock *sk)
 
if (rcu_access_pointer(sk->sk_reuseport_cb)) {
nsk = reuseport_detach_sock(sk);
-   if (nsk)
+   if (!IS_ERR_OR_NULL(nsk))
inet_csk_reqsk_queue_migrate(sk, nsk);
}
 
-- 
2.17.2 (Apple Git-113)



[PATCH v1 bpf-next 09/11] bpf: Support bpf_get_socket_cookie_sock() for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-01 Thread Kuniyuki Iwashima
We will call sock_reuseport.prog for socket migration in the next commit,
so the eBPF program has to know which listener is closing in order to
select the new listener.

Currently, we can get a unique ID for each listener in userspace by calling
bpf_map_lookup_elem() on the BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.

This patch makes the sk pointer available in sk_reuseport_md so that we can
get the ID by BPF_FUNC_get_socket_cookie() in the eBPF program.

Link: 
https://lore.kernel.org/netdev/20201119001154.kapwihc2plp4f...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   |  8 
 net/core/filter.c  | 12 +++-
 tools/include/uapi/linux/bpf.h |  8 
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index efe342bf3dbc..3e9b8bd42b4e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1650,6 +1650,13 @@ union bpf_attr {
  * A 8-byte long non-decreasing number on success, or 0 if the
  * socket field is missing inside *skb*.
  *
+ * u64 bpf_get_socket_cookie(struct bpf_sock *sk)
+ * Description
+ * Equivalent to bpf_get_socket_cookie() helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock** context.
+ * Return
+ * A 8-byte long non-decreasing number.
+ *
  * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
  * Description
  * Equivalent to bpf_get_socket_cookie() helper that accepts
@@ -4420,6 +4427,7 @@ struct sk_reuseport_md {
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
__u8 migration; /* Migration type */
+   __bpf_md_ptr(struct bpf_sock *, sk); /* current listening socket */
 };
 
 #define BPF_TAG_SIZE   8
diff --git a/net/core/filter.c b/net/core/filter.c
index 0a0634787bb4..1059d31847ef 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4628,7 +4628,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
.func   = bpf_get_socket_cookie_sock,
.gpl_only   = false,
.ret_type   = RET_INTEGER,
-   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg1_type  = ARG_PTR_TO_SOCKET,
 };
 
 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
@@ -9982,6 +9982,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
		return &sk_reuseport_load_bytes_proto;
	case BPF_FUNC_skb_load_bytes_relative:
		return &sk_reuseport_load_bytes_relative_proto;
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_cookie_sock_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -10015,6 +10017,10 @@ sk_reuseport_is_valid_access(int off, int size,
		return prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE &&
		       size == sizeof(__u8);
 
+   case offsetof(struct sk_reuseport_md, sk):
+   info->reg_type = PTR_TO_SOCKET;
+   return size == sizeof(__u64);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10091,6 +10097,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_reuseport_md, migration):
SK_REUSEPORT_LOAD_FIELD(migration);
break;
+
+   case offsetof(struct sk_reuseport_md, sk):
+   SK_REUSEPORT_LOAD_FIELD(sk);
+   break;
}
 
return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index efe342bf3dbc..3e9b8bd42b4e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1650,6 +1650,13 @@ union bpf_attr {
  * A 8-byte long non-decreasing number on success, or 0 if the
  * socket field is missing inside *skb*.
  *
+ * u64 bpf_get_socket_cookie(struct bpf_sock *sk)
+ * Description
+ * Equivalent to bpf_get_socket_cookie() helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock** context.
+ * Return
+ * A 8-byte long non-decreasing number.
+ *
  * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
  * Description
  * Equivalent to bpf_get_socket_cookie() helper that accepts
@@ -4420,6 +4427,7 @@ struct sk_reuseport_md {
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
__u8 migration; /* Migration type */
+   __bpf_md_ptr(struct bpf_sock *, sk); /* current listening socket */
 };
 
 #define BPF_TAG_SIZE   8
-- 
2.17.2 (Apple Git-113)
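
As an illustration of how the new sk field can be combined with the cookie
helper, a SK_REUSEPORT program could key a map by the closing listener's
cookie, along the lines of the migrate_map used by the selftest later in the
series. A rough sketch only; the map layout and section name are illustrative:

---8<---
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 256);
	__type(key, int);
	__type(value, __u64);
} reuseport_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 256);
	__type(key, __u64);	/* cookie of the closing listener */
	__type(value, int);	/* index of the new listener */
} migrate_map SEC(".maps");

SEC("sk_reuseport/migrate")
int prog_migrate_reuseport(struct sk_reuseport_md *reuse_md)
{
	__u64 cookie = bpf_get_socket_cookie(reuse_md->sk);
	int *target = bpf_map_lookup_elem(&migrate_map, &cookie);

	if (!target)
		return SK_DROP;	/* no mapping: cancel the migration */

	bpf_sk_select_reuseport(reuse_md, &reuseport_map, target, 0);
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";
---8<---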



[PATCH v1 bpf-next 11/11] bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

2020-12-01 Thread Kuniyuki Iwashima
This patch adds a test for BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 .../bpf/prog_tests/migrate_reuseport.c| 164 ++
 .../bpf/progs/test_migrate_reuseport_kern.c   |  54 ++
 2 files changed, 218 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
 create mode 100644 
tools/testing/selftests/bpf/progs/test_migrate_reuseport_kern.c

diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c 
b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
new file mode 100644
index ..87c72d9ccadd
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check if we can migrate child sockets.
+ *
+ *   1. call listen() for 5 server sockets.
+ *   2. update a map to migrate all child socket
+ *to the last server socket (migrate_map[cookie] = 4)
+ *   3. call connect() for 25 client sockets.
+ *   4. call close() for first 4 server sockets.
+ *   5. call accept() for the last server socket.
+ *
+ * Author: Kuniyuki Iwashima 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NUM_SOCKS 5
+#define LOCALHOST "127.0.0.1"
+#define err_exit(condition, message) \
+   do {  \
+   if (condition) {  \
+   perror("ERROR: " message " ");\
+   exit(1);  \
+   } \
+   } while (0)
+
+__u64 server_fds[NUM_SOCKS];
+int prog_fd, reuseport_map_fd, migrate_map_fd;
+
+
+void setup_bpf(void)
+{
+   struct bpf_object *obj;
+   struct bpf_program *prog;
+   struct bpf_map *reuseport_map, *migrate_map;
+   int err;
+
+   obj = bpf_object__open("test_migrate_reuseport_kern.o");
+   err_exit(libbpf_get_error(obj), "opening BPF object file failed");
+
+   err = bpf_object__load(obj);
+   err_exit(err, "loading BPF object failed");
+
+   prog = bpf_program__next(NULL, obj);
+   err_exit(!prog, "loading BPF program failed");
+
+   reuseport_map = bpf_object__find_map_by_name(obj, "reuseport_map");
+   err_exit(!reuseport_map, "loading BPF reuseport_map failed");
+
+   migrate_map = bpf_object__find_map_by_name(obj, "migrate_map");
+   err_exit(!migrate_map, "loading BPF migrate_map failed");
+
+   prog_fd = bpf_program__fd(prog);
+   reuseport_map_fd = bpf_map__fd(reuseport_map);
+   migrate_map_fd = bpf_map__fd(migrate_map);
+}
+
+void test_listen(void)
+{
+   struct sockaddr_in addr;
+   socklen_t addr_len = sizeof(addr);
+   int i, err, optval = 1, migrated_to = NUM_SOCKS - 1;
+   __u64 value;
+
+   addr.sin_family = AF_INET;
+   addr.sin_port = htons(80);
+	inet_pton(AF_INET, LOCALHOST, &addr.sin_addr.s_addr);
+
+	for (i = 0; i < NUM_SOCKS; i++) {
+		server_fds[i] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+		err_exit(server_fds[i] == -1, "socket() for listener sockets failed");
+
+		err = setsockopt(server_fds[i], SOL_SOCKET, SO_REUSEPORT,
+				 &optval, sizeof(optval));
+		err_exit(err == -1, "setsockopt() for SO_REUSEPORT failed");
+
+		if (i == 0) {
+			err = setsockopt(server_fds[i], SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
+					 &prog_fd, sizeof(prog_fd));
+			err_exit(err == -1, "setsockopt() for SO_ATTACH_REUSEPORT_EBPF failed");
+		}
+
+		err = bind(server_fds[i], (struct sockaddr *)&addr, addr_len);
+		err_exit(err == -1, "bind() failed");
+
+		err = listen(server_fds[i], 32);
+		err_exit(err == -1, "listen() failed");
+
+		err = bpf_map_update_elem(reuseport_map_fd, &i, &server_fds[i], BPF_NOEXIST);
+		err_exit(err == -1, "updating BPF reuseport_map failed");
+
+		err = bpf_map_lookup_elem(reuseport_map_fd, &i, &value);
+		err_exit(err == -1, "looking up BPF reuseport_map failed");
+
+		printf("fd[%d] (cookie: %llu) -> fd[%d]\n", i, value, migrated_to);
+		err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, BPF_NOEXIST);
+		err_exit(err == -1, "updating BPF migrate_map failed");
+	}
+}
+
+void test_connect(void)
+{
+   struct sockaddr_in addr;
+   socklen_t addr_len = sizeof(addr);
+   int i, err, client_fd;
+
+   addr.sin_family = AF_INET

[PATCH v1 bpf-next 08/11] bpf: Add migration to sk_reuseport_(kern|md).

2020-12-01 Thread Kuniyuki Iwashima
This patch adds a u8 migration field to sk_reuseport_kern and sk_reuseport_md
to signal to the eBPF program whether the kernel is calling it to select a
listener for a SYN packet, or to migrate sockets in the accept queue or an
immature socket during the 3WHS.

Note that this field is accessible only if the attached type is
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
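
For illustration, a program attached with BPF_SK_REUSEPORT_SELECT_OR_MIGRATE
might read the field roughly as below. This is only a sketch: the section
name assumes the libbpf patch later in this series, and the program body is
illustrative, not the selftest added at the end of the series.

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("sk_reuseport/migrate")
  int check_migration(struct sk_reuseport_md *reuse_md)
  {
          switch (reuse_md->migration) {
          case BPF_SK_REUSEPORT_MIGRATE_NO:
                  /* Called for an incoming SYN: normal listener selection. */
                  break;
          case BPF_SK_REUSEPORT_MIGRATE_QUEUE:
                  /* Migrating ESTABLISHED/SYN_RECV children at close()/shutdown(). */
                  break;
          case BPF_SK_REUSEPORT_MIGRATE_REQUEST:
                  /* Migrating a NEW_SYN_RECV request during the 3WHS. */
                  break;
          }

          return SK_PASS;
  }

  char _license[] SEC("license") = "GPL";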

Link: 
https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/linux/bpf.h|  1 +
 include/linux/filter.h |  4 ++--
 include/uapi/linux/bpf.h   |  1 +
 net/core/filter.c  | 15 ---
 net/core/sock_reuseport.c  |  2 +-
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 581b2a2e78eb..244f823f1f84 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1897,6 +1897,7 @@ struct sk_reuseport_kern {
u32 hash;
u32 reuseport_id;
bool bind_inany;
+   u8 migration;
 };
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
  struct bpf_insn_access_aux *info);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1b62397bd124..15d5bf13a905 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -967,12 +967,12 @@ void bpf_warn_invalid_xdp_action(u32 act);
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
  struct bpf_prog *prog, struct sk_buff *skb,
- u32 hash);
+ u32 hash, u8 migration);
 #else
 static inline struct sock *
 bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 struct bpf_prog *prog, struct sk_buff *skb,
-u32 hash)
+u32 hash, u8 migration)
 {
return NULL;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cfc207ae7782..efe342bf3dbc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4419,6 +4419,7 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   __u8 migration; /* Migration type */
 };
 
 #define BPF_TAG_SIZE   8
diff --git a/net/core/filter.c b/net/core/filter.c
index 2ca5eecebacf..0a0634787bb4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9853,7 +9853,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
-   u32 hash)
+   u32 hash, u8 migration)
 {
reuse_kern->skb = skb;
reuse_kern->sk = sk;
@@ -9862,16 +9862,17 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
reuse_kern->bind_inany = reuse->bind_inany;
+   reuse_kern->migration = migration;
 }
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
  struct bpf_prog *prog, struct sk_buff *skb,
- u32 hash)
+ u32 hash, u8 migration)
 {
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
 
-	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash, migration);
 	action = BPF_PROG_RUN(prog, &reuse_kern);
 
if (action == SK_PASS)
@@ -10010,6 +10011,10 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
 
+   case bpf_ctx_range(struct sk_reuseport_md, migration):
+		return prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE &&
+		       size == sizeof(__u8);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10082,6 +10087,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+   case offsetof(struct sk_reuseport_md, migration):
+   SK_REUSEPORT_LOAD_FIELD(migration);
+   break;
}
 
	return insn - insn_buf;

[PATCH v1 bpf-next 07/11] libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-01 Thread Kuniyuki Iwashima
This commit introduces a new section (sk_reuseport/migrate) and sets the
expected_attach_type for each of the two sections of the
BPF_PROG_TYPE_SK_REUSEPORT program type.
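
From the program author's side, the section prefix alone is assumed to pick
the attach type. A minimal userspace sketch of that mapping follows; the
object file name and the presence of a "sk_reuseport/migrate" program in it
are hypothetical, and the uapi constant assumes the patched headers from
this series.

  #include <assert.h>
  #include <bpf/libbpf.h>

  int main(void)
  {
          /* hypothetical object containing a "sk_reuseport/migrate" program */
          struct bpf_object *obj = bpf_object__open("migrate_prog.o");
          struct bpf_program *prog;

          assert(!libbpf_get_error(obj));

          prog = bpf_object__find_program_by_title(obj, "sk_reuseport/migrate");
          assert(prog);

          /* the new section prefix sets the attach type at open time */
          assert(bpf_program__get_expected_attach_type(prog) ==
                 BPF_SK_REUSEPORT_SELECT_OR_MIGRATE);

          bpf_object__close(obj);
          return 0;
  }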

Signed-off-by: Kuniyuki Iwashima 
---
 tools/lib/bpf/libbpf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 28baee7ba1ca..bbb3902a0e41 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8237,7 +8237,10 @@ static struct bpf_link *attach_iter(const struct 
bpf_sec_def *sec,
 
 static const struct bpf_sec_def section_defs[] = {
BPF_PROG_SEC("socket",  BPF_PROG_TYPE_SOCKET_FILTER),
-   BPF_PROG_SEC("sk_reuseport",BPF_PROG_TYPE_SK_REUSEPORT),
+   BPF_EAPROG_SEC("sk_reuseport/migrate",  BPF_PROG_TYPE_SK_REUSEPORT,
+   
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE),
+   BPF_EAPROG_SEC("sk_reuseport",  BPF_PROG_TYPE_SK_REUSEPORT,
+   BPF_SK_REUSEPORT_SELECT),
SEC_DEF("kprobe/", KPROBE,
.attach_fn = attach_kprobe),
BPF_PROG_SEC("uprobe/", BPF_PROG_TYPE_KPROBE),
-- 
2.17.2 (Apple Git-113)



[PATCH v1 bpf-next 06/11] bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.

2020-12-01 Thread Kuniyuki Iwashima
This commit adds a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT so
that the kernel can check whether the attached eBPF program is capable of
migrating sockets.

When the eBPF program is attached, the kernel runs it for socket migration
only if the expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.
The kernel will change the behaviour depending on the returned value:

  - SK_PASS with selected_sk, select it as a new listener
  - SK_PASS with selected_sk NULL, fall back to the random selection
  - SK_DROP, cancel the migration
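
A minimal sketch of those return semantics from the program's side; the map
layout and the always-index-0 policy are assumptions for illustration, not
the selftest added in patch 11/11.

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  struct {
          __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
          __uint(max_entries, 8);
          __type(key, __u32);
          __type(value, __u64);
  } reuseport_map SEC(".maps");

  SEC("sk_reuseport/migrate")
  int select_or_migrate(struct sk_reuseport_md *reuse_md)
  {
          __u32 index = 0;        /* hypothetical policy: always listener 0 */

          if (reuse_md->migration == BPF_SK_REUSEPORT_MIGRATE_NO)
                  return SK_PASS; /* normal SYN handling, no migration */

          /* SK_PASS with a successful selection: migrate to that listener. */
          if (!bpf_sk_select_reuseport(reuse_md, &reuseport_map, &index, 0))
                  return SK_PASS;

          /* Returning SK_PASS here (selected_sk NULL) would fall back to the
           * kernel's random pick; SK_DROP cancels the migration instead.
           */
          return SK_DROP;
  }

  char _license[] SEC("license") = "GPL";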

Link: 
https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6t...@kafai-mbp.dhcp.thefacebook.com/
Suggested-by: Martin KaFai Lau 
Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   | 2 ++
 kernel/bpf/syscall.c   | 8 
 tools/include/uapi/linux/bpf.h | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85278deff439..cfc207ae7782 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -241,6 +241,8 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
+   BPF_SK_REUSEPORT_SELECT,
+   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f3fe9f53f93c..a0796a8de5ea 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2036,6 +2036,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
+   case BPF_PROG_TYPE_SK_REUSEPORT:
+   switch (expected_attach_type) {
+   case BPF_SK_REUSEPORT_SELECT:
+   case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+   return 0;
+   default:
+   return -EINVAL;
+   }
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85278deff439..cfc207ae7782 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -241,6 +241,8 @@ enum bpf_attach_type {
BPF_XDP_CPUMAP,
BPF_SK_LOOKUP,
BPF_XDP,
+   BPF_SK_REUSEPORT_SELECT,
+   BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
__MAX_BPF_ATTACH_TYPE
 };
 
-- 
2.17.2 (Apple Git-113)



[PATCH v1 bpf-next 05/11] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-12-01 Thread Kuniyuki Iwashima
This patch renames reuseport_select_sock() to __reuseport_select_sock() and
adds two wrapper functions around it that pass the migration type defined
in the previous commit:

  reuseport_select_sock  : BPF_SK_REUSEPORT_MIGRATE_NO
  reuseport_select_migrated_sock : BPF_SK_REUSEPORT_MIGRATE_REQUEST

As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
requests when receiving the final ACK or sending a SYN+ACK. Therefore, this
patch also changes the code to call reuseport_select_migrated_sock() even
if the listening socket is TCP_CLOSE. If we can pick a listening socket out
of the reuseport group, we rewrite request_sock.rsk_listener and resume
processing the request.
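
A hedged sketch of how the two wrappers are expected to relate to
__reuseport_select_sock(); the exact parameter list of
__reuseport_select_sock() is an assumption here, not taken from the diff.

  /* sketch: assumed signature of __reuseport_select_sock() */
  struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
                                     struct sk_buff *skb, int hdr_len)
  {
          return __reuseport_select_sock(sk, hash, skb, hdr_len,
                                         BPF_SK_REUSEPORT_MIGRATE_NO);
  }

  struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
                                              struct sk_buff *skb)
  {
          return __reuseport_select_sock(sk, hash, skb, 0,
                                         BPF_SK_REUSEPORT_MIGRATE_REQUEST);
  }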

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/inet_connection_sock.h | 12 +++
 include/net/request_sock.h | 13 
 include/net/sock_reuseport.h   |  8 +++
 net/core/sock_reuseport.c  | 34 --
 net/ipv4/inet_connection_sock.c| 13 ++--
 net/ipv4/tcp_ipv4.c|  9 ++--
 net/ipv6/tcp_ipv6.c|  9 ++--
 7 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 2ea2d743f8fc..1e0958f5eb21 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock 
*sk)
 	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
 }
 
+static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
+struct sock *nsk,
+struct request_sock *req)
+{
+	reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
+			     &inet_csk(nsk)->icsk_accept_queue,
+			     req);
+   sock_put(sk);
+   sock_hold(nsk);
+   req->rsk_listener = nsk;
+}
+
 static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 {
 	return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..d18ba0b857cc 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -226,6 +226,19 @@ static inline void reqsk_queue_added(struct 
request_sock_queue *queue)
 	atomic_inc(&queue->qlen);
 }
 
+static inline void reqsk_queue_migrated(struct request_sock_queue *old_accept_queue,
+					struct request_sock_queue *new_accept_queue,
+					const struct request_sock *req)
+{
+	atomic_dec(&old_accept_queue->qlen);
+	atomic_inc(&new_accept_queue->qlen);
+
+	if (req->num_timeout == 0) {
+		atomic_dec(&old_accept_queue->young);
+		atomic_inc(&new_accept_queue->young);
+   }
+}
+
 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
 {
 	return atomic_read(&queue->qlen);
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 09a1b1539d4c..a48259a974be 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -32,10 +32,10 @@ extern int reuseport_alloc(struct sock *sk, bool 
bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
  bool bind_inany);
 extern struct sock *reuseport_detach_sock(struct sock *sk);
-extern struct sock *reuseport_select_sock(struct sock *sk,
- u32 hash,
- struct sk_buff *skb,
- int hdr_len);
+extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash,
+ struct sk_buff *skb, int hdr_len);
+extern struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
+  struct sk_buff *skb);
 extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog);
 extern int reuseport_detach_prog(struct sock *sk);
 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 60d7c1f28809..b4fe0829c9ab 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -202,7 +202,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
}
 
reuse->socks[reuse->num_socks] = sk;
-   /* paired with smp_rmb() in reuseport_select_sock() */
+   /* paired with smp_rmb() in __reuseport_select_sock() */
smp_wmb();
reuse->num_socks++;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
@@ -313,12 +313,13 @@ static struct sock *run_bpf_filter(struct sock_reuseport 
*reuse, u16 socks,
  *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
  *the skb does not yet point at the payload, this parameter represents
  * 

[PATCH v1 bpf-next 01/11] tcp: Keep TCP_CLOSE sockets in the reuseport group.

2020-12-01 Thread Kuniyuki Iwashima
This patch is a preparation patch to migrate incoming connections in the
later commits and adds a field (num_closed_socks) to the struct
sock_reuseport to keep TCP_CLOSE sockets in the reuseport group.

When we close a listening socket, to migrate its connections to another
listener in the same reuseport group, we have to handle two kinds of child
sockets: those that the listening socket has a reference to, and those
that it does not.

The former is the TCP_ESTABLISHED/TCP_SYN_RECV sockets, and they are in the
accept queue of their listening socket. So, we can pop them out and push
them into another listener's queue at close() or shutdown() syscalls. On
the other hand, the latter, the TCP_NEW_SYN_RECV socket is during the
three-way handshake and not in the accept queue. Thus, we cannot access
such sockets at close() or shutdown() syscalls. Accordingly, we have to
migrate immature sockets after their listening socket has been closed.

Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
sockets are freed at receiving the final ACK or retransmitting SYN+ACKs. At
that time, if we could select a new listener from the same reuseport group,
no connection would be aborted. However, it is impossible because
reuseport_detach_sock() sets NULL to sk_reuseport_cb and forbids access to
the reuseport group from closed sockets.

This patch allows TCP_CLOSE sockets to remain in the reuseport group and to
keep access to it while any child sockets still reference them. The point is
that reuseport_detach_sock() is called twice, from inet_unhash() and
sk_destruct(). At first, it moves the socket backwards in socks[] and
increments num_closed_socks. Later, when all migrated connections have been
accepted, it removes the socket from socks[], decrements num_closed_socks,
and sets sk_reuseport_cb to NULL.

With this change, closed sockets can keep sk_reuseport_cb until all child
requests have been freed or accepted. Consequently, calling listen() after
shutdown() can cause EADDRINUSE or EBUSY in reuseport_add_sock() or
inet_csk_bind_conflict(), which expect that such sockets should not have a
reuseport group. Therefore, this patch also loosens those validation rules
so that the socket can listen again if it has the same reuseport group as
the other listening sockets.
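
For example, with this change the following userspace sequence is expected
to work when the socket shares its reuseport group with other listeners.
This is a minimal sketch only: the port is arbitrary and error handling is
mostly omitted.

  #include <netinet/in.h>
  #include <stdio.h>
  #include <sys/socket.h>
  #include <unistd.h>

  int main(void)
  {
          struct sockaddr_in addr = {
                  .sin_family = AF_INET,
                  .sin_port = htons(8080),
                  .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
          };
          int optval = 1;
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &optval, sizeof(optval));
          bind(fd, (struct sockaddr *)&addr, sizeof(addr));
          listen(fd, 32);

          /* Queued children are migrated away (see the later patches). */
          shutdown(fd, SHUT_RDWR);

          /* The socket may still hold sk_reuseport_cb here, so the loosened
           * checks in reuseport_add_sock()/inet_csk_bind_conflict() are what
           * let this second listen() succeed.
           */
          if (listen(fd, 32) < 0)
                  perror("second listen()");

          close(fd);
          return 0;
  }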

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/sock_reuseport.h|  5 ++-
 net/core/sock_reuseport.c   | 79 +++--
 net/ipv4/inet_connection_sock.c |  7 ++-
 3 files changed, 74 insertions(+), 17 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 505f1e18e9bf..0e558ca7afbf 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
 struct sock_reuseport {
struct rcu_head rcu;
 
-   u16 max_socks;  /* length of socks */
-   u16 num_socks;  /* elements in socks */
+   u16 max_socks;  /* length of socks */
+   u16 num_socks;  /* elements in socks */
+	u16			num_closed_socks;	/* closed elements in socks */
/* The last synq overflow event timestamp of this
 * reuse->socks[] group.
 */
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index bbdd3c7b6cb5..fd133516ac0e 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -98,16 +98,21 @@ static struct sock_reuseport *reuseport_grow(struct 
sock_reuseport *reuse)
return NULL;
 
more_reuse->num_socks = reuse->num_socks;
+   more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
more_reuse->has_conns = reuse->has_conns;
+   more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
 
memcpy(more_reuse->socks, reuse->socks,
   reuse->num_socks * sizeof(struct sock *));
-   more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
+   memcpy(more_reuse->socks +
+  (more_reuse->max_socks - more_reuse->num_closed_socks),
+  reuse->socks + reuse->num_socks,
+  reuse->num_closed_socks * sizeof(struct sock *));
 
-   for (i = 0; i < reuse->num_socks; ++i)
+   for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
   more_reuse);
 
@@ -129,6 +134,25 @@ static void reuseport_free_rcu(struct rcu_head *head)
kfree(reuse);
 }
 
+static int reuseport_sock_index(struct sock_reuseport *reuse, struct sock *sk,
+   bool closed)
+{
+   i

[PATCH v1 bpf-next 04/11] tcp: Migrate TFO requests causing RST during TCP_SYN_RECV.

2020-12-01 Thread Kuniyuki Iwashima
A TFO request socket is only freed after BOTH 3WHS has completed (or
aborted) and the child socket has been accepted (or its listener has been
closed). Hence, depending on the order, there can be two kinds of request
sockets in the accept queue.

  3WHS -> accept : TCP_ESTABLISHED
  accept -> 3WHS : TCP_SYN_RECV

Unlike for a TCP_ESTABLISHED socket, accept() does not free the request
socket for a TCP_SYN_RECV socket; it is freed later by
reqsk_fastopen_remove(), which also accesses request_sock.rsk_listener.
So, in order to complete TFO socket migration, we have to point the request
at the current listener at accept() time, before reqsk_fastopen_remove().

Moreover, if TFO request caused RST before 3WHS has completed, it is held
in the listener's TFO queue to prevent DDoS attack. Thus, we also have to
migrate the requests in TFO queue.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 net/ipv4/inet_connection_sock.c | 35 -
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index b27241ea96bd..361efe55b1ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -500,6 +500,16 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, 
int *err, bool kern)
tcp_rsk(req)->tfo_listener) {
 		spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
+   if (req->rsk_listener != sk) {
+				/* TFO request was migrated to another listener so
+				 * the new listener must be used in reqsk_fastopen_remove()
+				 * to hold requests which cause RST.
+				 */
+   sock_put(req->rsk_listener);
+   sock_hold(sk);
+   req->rsk_listener = sk;
+   }
+
/* We are still waiting for the final ACK from 3WHS
 * so can't free req now. Instead, we set req->sk to
 * NULL to signify that the child socket is taken
@@ -954,7 +964,6 @@ static void inet_child_forget(struct sock *sk, struct 
request_sock *req,
 
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
-   BUG_ON(sk != req->rsk_listener);
 
/* Paranoid, to prevent race condition if
 * an inbound pkt destined for child is
@@ -995,6 +1004,7 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
 {
struct request_sock_queue *old_accept_queue, *new_accept_queue;
+   struct fastopen_queue *old_fastopenq, *new_fastopenq;
 
 	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
 	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
@@ -1019,6 +1029,29 @@ void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
 
 	spin_unlock(&new_accept_queue->rskq_lock);
 	spin_unlock(&old_accept_queue->rskq_lock);
+
+	old_fastopenq = &old_accept_queue->fastopenq;
+	new_fastopenq = &new_accept_queue->fastopenq;
+
+	spin_lock_bh(&old_fastopenq->lock);
+	spin_lock_bh(&new_fastopenq->lock);
+
+   new_fastopenq->qlen += old_fastopenq->qlen;
+   old_fastopenq->qlen = 0;
+
+   if (old_fastopenq->rskq_rst_head) {
+   if (new_fastopenq->rskq_rst_head)
+			old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+		else
+			old_fastopenq->rskq_rst_tail = new_fastopenq->rskq_rst_tail;
+
+   new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
+   old_fastopenq->rskq_rst_head = NULL;
+   old_fastopenq->rskq_rst_tail = NULL;
+   }
+
+	spin_unlock_bh(&new_fastopenq->lock);
+	spin_unlock_bh(&old_fastopenq->lock);
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
 
-- 
2.17.2 (Apple Git-113)



[PATCH v1 bpf-next 03/11] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-12-01 Thread Kuniyuki Iwashima
This patch lets reuseport_detach_sock() return a pointer of struct sock,
which is used only by inet_unhash(). If it is not NULL,
inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
sockets from the closing listener to the selected one.

Listening sockets hold incoming connections as a linked list of struct
request_sock in the accept queue, and each request has reference to a full
socket and its listener. In inet_csk_reqsk_queue_migrate(), we only unlink
the requests from the closing listener's queue and relink them to the head
of the new listener's queue. We do not process each request and its
reference to the listener, so the migration completes in O(1) time
complexity. However, in the case of TCP_SYN_RECV sockets, we take special
care in the next commit.

By default, the kernel selects a new listener randomly. In order to pick
out a different socket every time, we select the last element of socks[] as
the new listener. This behaviour is based on how the kernel moves sockets
in socks[]. (See also [1])

Basically, in order to redistribute sockets evenly, we have to use an eBPF
program, introduced in a later commit. However, as a side effect of this
default selection, the kernel can redistribute old requests evenly to new
listeners for the specific case where the application replaces listeners
by generations.

For example, we call listen() for four sockets (A, B, C, D), and close the
first two by turns. The sockets move in socks[] like below.

  socks[0] : A <-.  socks[0] : D  socks[0] : D
  socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
  socks[2] : C   |  socks[2] : C --'
  socks[3] : D --'

Then, if C and D have newer settings than A and B, and each socket has a
request (a, b, c, d) in their accept queue, we can redistribute old
requests evenly to new listeners.

  socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
  socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
  socks[2] : C (c)   |  socks[2] : C (c) --'
  socks[3] : D (d) --'

Here, (A, D) or (B, C) can have different application settings, but they
MUST have the same settings at the socket API level; otherwise, unexpected
errors may happen. For instance, if only the new listeners have
TCP_SAVE_SYN enabled, the old requests have no saved SYN data, so the
application will see an inconsistency and hit an error.

Therefore, if there are different kinds of sockets, we must attach an eBPF
program described in later commits.
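
A sketch of that generation-based replacement from userspace, matching the
socks[] diagrams above; the port is arbitrary and error handling is omitted.

  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <unistd.h>

  static int listener(void)
  {
          struct sockaddr_in addr = {
                  .sin_family = AF_INET,
                  .sin_port = htons(8080),
                  .sin_addr.s_addr = htonl(INADDR_ANY),
          };
          int one = 1;
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
          bind(fd, (struct sockaddr *)&addr, sizeof(addr));
          listen(fd, 32);
          return fd;
  }

  int main(void)
  {
          int a = listener(), b = listener();     /* old generation */
          int c = listener(), d = listener();     /* new generation */

          close(a);       /* a's queued requests go to the last socks[] entry */
          close(b);       /* likewise for b's queue */

          /* c and d keep serving; eventually close them too. */
          close(c);
          close(d);
          return 0;
  }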

Link: 
https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/inet_connection_sock.h |  1 +
 include/net/sock_reuseport.h   |  2 +-
 net/core/sock_reuseport.c  | 10 +-
 net/ipv4/inet_connection_sock.c| 30 ++
 net/ipv4/inet_hashtables.c |  9 +++--
 5 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct 
sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
  struct request_sock *req,
  struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 0e558ca7afbf..09a1b1539d4c 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -31,7 +31,7 @@ struct sock_reuseport {
 extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
  bool bind_inany);
-extern void reuseport_detach_sock(struct sock *sk);
+extern struct sock *reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
  u32 hash,
  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index fd133516ac0e..60d7c1f28809 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -216,9 +216,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
 }
 EXPORT_SYMBOL(reuseport_add_sock);
 
-void reuseport_detach_sock(struct sock *sk)
+struct sock *reuseport_detach_sock(struct sock *sk)
 {
struct sock_reuseport *reuse;
+   struct bpf_prog *prog;
+   struct sock *nsk = NULL;
int i;
 
  

[PATCH v1 bpf-next 02/11] bpf: Define migration types for SO_REUSEPORT.

2020-12-01 Thread Kuniyuki Iwashima
As noted in the preceding commit, there are two migration types. In
addition to that, the kernel will run the same eBPF program to select a
listener for SYN packets.

This patch defines three types to signal the kernel and the eBPF program if
it is receiving a new request or migrating ESTABLISHED/SYN_RECV sockets in
the accept queue or NEW_SYN_RECV socket during 3WHS.

Signed-off-by: Kuniyuki Iwashima 
---
 include/uapi/linux/bpf.h   | 14 ++
 tools/include/uapi/linux/bpf.h | 14 ++
 2 files changed, 28 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 162999b12790..85278deff439 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4380,6 +4380,20 @@ struct sk_msg_md {
__bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
 };
 
+/* Migration type for SO_REUSEPORT enabled TCP sockets.
+ *
+ * BPF_SK_REUSEPORT_MIGRATE_NO  : Select a listener for SYN packets.
+ * BPF_SK_REUSEPORT_MIGRATE_QUEUE   : Migrate ESTABLISHED and SYN_RECV sockets in
+ *                                    the accept queue at close() or shutdown().
+ * BPF_SK_REUSEPORT_MIGRATE_REQUEST : Migrate NEW_SYN_RECV socket at receiving the
+ *                                    final ACK of 3WHS or retransmitting SYN+ACKs.
+ */
+enum {
+   BPF_SK_REUSEPORT_MIGRATE_NO,
+   BPF_SK_REUSEPORT_MIGRATE_QUEUE,
+   BPF_SK_REUSEPORT_MIGRATE_REQUEST,
+};
+
 struct sk_reuseport_md {
/*
 * Start of directly accessible data. It begins from
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 162999b12790..85278deff439 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4380,6 +4380,20 @@ struct sk_msg_md {
__bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
 };
 
+/* Migration type for SO_REUSEPORT enabled TCP sockets.
+ *
+ * BPF_SK_REUSEPORT_MIGRATE_NO  : Select a listener for SYN packets.
+ * BPF_SK_REUSEPORT_MIGRATE_QUEUE   : Migrate ESTABLISHED and SYN_RECV sockets in
+ *                                    the accept queue at close() or shutdown().
+ * BPF_SK_REUSEPORT_MIGRATE_REQUEST : Migrate NEW_SYN_RECV socket at receiving the
+ *                                    final ACK of 3WHS or retransmitting SYN+ACKs.
+ */
+enum {
+   BPF_SK_REUSEPORT_MIGRATE_NO,
+   BPF_SK_REUSEPORT_MIGRATE_QUEUE,
+   BPF_SK_REUSEPORT_MIGRATE_REQUEST,
+};
+
 struct sk_reuseport_md {
/*
 * Start of directly accessible data. It begins from
-- 
2.17.2 (Apple Git-113)



[PATCH v1 bpf-next 00/11] Socket migration for SO_REUSEPORT.

2020-12-01 Thread Kuniyuki Iwashima
The SO_REUSEPORT option allows sockets to listen on the same port and to
accept connections evenly. However, there is a defect in the current
implementation[1]. When a SYN packet is received, the connection is tied to
a listening socket. Accordingly, when the listener is closed, in-flight
requests during the three-way handshake and child sockets in the accept
queue are dropped even if other listeners on the same port could accept
such connections.

This situation can happen when various server management tools restart
server (such as nginx) processes. For instance, when we change nginx
configurations and restart it, it spins up new workers that respect the new
configuration and closes all listeners on the old workers, so the in-flight
ACKs of the 3WHS are answered with RSTs.

The SO_REUSEPORT option is excellent to improve scalability. On the other
hand, as a trade-off, users have to know deeply how the kernel handles SYN
packets and implement connection draining by eBPF[2]:

  1. Stop routing SYN packets to the listener by eBPF.
  2. Wait for all timers to expire to complete requests
  3. Accept connections until EAGAIN, then close the listener.
  
or

  1. Start counting SYN packets and accept syscalls using eBPF map.
  2. Stop routing SYN packets.
  3. Accept connections up to the count, then close the listener.

Either way, we cannot close a listener immediately. Ideally, however, the
application should not need to drain the not-yet-accepted sockets, because
the 3WHS and tying a connection to a listener are purely kernel behaviour. The
root cause is within the kernel, so the issue should be addressed in kernel
space and should not be visible to user space. This patchset fixes it so
that users need not take care of kernel implementation and connection
draining. With this patchset, the kernel redistributes requests and
connections from a listener to others in the same reuseport group at/after
close() or shutdown() syscalls.

Although some software does connection draining, there are still merits in
migration. For some security reasons such as replacing TLS certificates, we
may want to apply new settings as soon as possible and/or we may not be
able to wait for connection draining. The sockets in the accept queue have
not started application sessions yet. So, if we do not drain such sockets,
they can be handled by the newer listeners and could have a longer
lifetime. It is difficult to drain all connections in every case, but we
can decrease such aborted connections by migration. In that sense,
migration is always better than draining. 

Moreover, auto-migration simplifies userspace logic and also works well in
a case where we cannot modify and build a server program to implement the
workaround.

Note that the source and destination listeners MUST have the same settings
at the socket API level; otherwise, applications may face inconsistency and
cause errors. In such a case, we have to use eBPF program to select a
specific listener or to cancel migration.


Link:

 [1] The SO_REUSEPORT socket option
 https://lwn.net/Articles/542629/

 [2] Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as drain 
mode
 
https://lore.kernel.org/netdev/1458828813.10868.65.ca...@edumazet-glaptop3.roam.corp.google.com/


Changelog:

 v1:
  * Remove the sysctl option
  * Enable migration if eBPF progam is not attached
  * Add expected_attach_type to check if eBPF program can migrate sockets
  * Add a field to tell migration type to eBPF program
  * Support BPF_FUNC_get_socket_cookie to get the cookie of sk
  * Allocate an empty skb if skb is NULL
  * Pass req_to_sk(req)->sk_hash because listener's hash is zero
  * Update commit messages and coverletter

 RFC v0:
 https://lore.kernel.org/netdev/20201117094023.3685-1-kun...@amazon.co.jp/


Kuniyuki Iwashima (11):
  tcp: Keep TCP_CLOSE sockets in the reuseport group.
  bpf: Define migration types for SO_REUSEPORT.
  tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.
  tcp: Migrate TFO requests causing RST during TCP_SYN_RECV.
  tcp: Migrate TCP_NEW_SYN_RECV requests.
  bpf: Introduce two attach types for BPF_PROG_TYPE_SK_REUSEPORT.
  libbpf: Set expected_attach_type for BPF_PROG_TYPE_SK_REUSEPORT.
  bpf: Add migration to sk_reuseport_(kern|md).
  bpf: Support bpf_get_socket_cookie_sock() for
BPF_PROG_TYPE_SK_REUSEPORT.
  bpf: Call bpf_run_sk_reuseport() for socket migration.
  bpf: Test BPF_SK_REUSEPORT_SELECT_OR_MIGRATE.

 include/linux/bpf.h   |   1 +
 include/linux/filter.h|   4 +-
 include/net/inet_connection_sock.h|  13 ++
 include/net/request_sock.h|  13 ++
 include/net/sock_reuseport.h  |  15 +-
 include/uapi/linux/bpf.h  |  25 +++
 kernel/bpf/syscall.c  |   8 +
 net/core/filter.c |  46 -
 net/core/sock_reuseport.c | 128 +++---
 net/i

Re: [RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-24 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Sun, 22 Nov 2020 16:40:20 -0800
> On Sat, Nov 21, 2020 at 07:13:22PM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Thu, 19 Nov 2020 17:53:46 -0800
> > > On Fri, Nov 20, 2020 at 07:09:22AM +0900, Kuniyuki Iwashima wrote:
> > > > From: Martin KaFai Lau 
> > > > Date: Wed, 18 Nov 2020 15:50:17 -0800
> > > > > On Tue, Nov 17, 2020 at 06:40:18PM +0900, Kuniyuki Iwashima wrote:
> > > > > > This patch lets reuseport_detach_sock() return a pointer of struct 
> > > > > > sock,
> > > > > > which is used only by inet_unhash(). If it is not NULL,
> > > > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > > > sockets from the closing listener to the selected one.
> > > > > > 
> > > > > > Listening sockets hold incoming connections as a linked list of 
> > > > > > struct
> > > > > > request_sock in the accept queue, and each request has reference to 
> > > > > > a full
> > > > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we 
> > > > > > unlink the
> > > > > > requests from the closing listener's queue and relink them to the 
> > > > > > head of
> > > > > > the new listener's queue. We do not process each request, so the 
> > > > > > migration
> > > > > > completes in O(1) time complexity. However, in the case of 
> > > > > > TCP_SYN_RECV
> > > > > > sockets, we will take special care in the next commit.
> > > > > > 
> > > > > > By default, we select the last element of socks[] as the new 
> > > > > > listener.
> > > > > > This behaviour is based on how the kernel moves sockets in socks[].
> > > > > > 
> > > > > > For example, we call listen() for four sockets (A, B, C, D), and 
> > > > > > close the
> > > > > > first two by turns. The sockets move in socks[] like below. (See 
> > > > > > also [1])
> > > > > > 
> > > > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > > > >   socks[2] : C   |  socks[2] : C --'
> > > > > >   socks[3] : D --'
> > > > > > 
> > > > > > Then, if C and D have newer settings than A and B, and each socket 
> > > > > > has a
> > > > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > > > requests evenly to new listeners.
> > > > > I don't think it should emphasize/claim there is a specific way that
> > > > > the kernel-pick here can redistribute the requests evenly.  It 
> > > > > depends on
> > > > > how the application close/listen.  The userspace can not expect the
> > > > > ordering of socks[] will behave in a certain way.
> > > > 
> > > > I've expected replacing listeners by generations as a general use case.
> > > > But exactly. Users should not expect the undocumented kernel internal.
> > > > 
> > > > 
> > > > > The primary redistribution policy has to depend on BPF which is the
> > > > > policy defined by the user based on its application logic (e.g. how
> > > > > its binary restart work).  The application (and bpf) knows which one
> > > > > is a dying process and can avoid distributing to it.
> > > > > 
> > > > > The kernel-pick could be an optional fallback but not a must.  If the 
> > > > > bpf
> > > > > prog is attached, I would even go further to call bpf to redistribute
> > > > > regardless of the sysctl, so I think the sysctl is not necessary.
> > > > 
> > > > I also think it is just an optional fallback, but to pick out a 
> > > > different
> > > > listener everytime, choosing the moved socket was reasonable. So the 
> > > > even
> > > > redistribution for a specific use case is a side effect of such socket
> > > > selection.
> > > > 
> > > > But, users should decide to use either way:
> > > >   (1) let the kernel select a new listener randomly
> > > >   (2) select a particular listener by eBPF
> &g

Re: [RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-21 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Thu, 19 Nov 2020 18:31:57 -0800
> On Fri, Nov 20, 2020 at 07:17:49AM +0900, Kuniyuki Iwashima wrote:
> > From:   Martin KaFai Lau 
> > Date:   Wed, 18 Nov 2020 17:49:13 -0800
> > > On Tue, Nov 17, 2020 at 06:40:15PM +0900, Kuniyuki Iwashima wrote:
> > > > The SO_REUSEPORT option allows sockets to listen on the same port and to
> > > > accept connections evenly. However, there is a defect in the current
> > > > implementation. When a SYN packet is received, the connection is tied 
> > > > to a
> > > > listening socket. Accordingly, when the listener is closed, in-flight
> > > > requests during the three-way handshake and child sockets in the accept
> > > > queue are dropped even if other listeners could accept such connections.
> > > > 
> > > > This situation can happen when various server management tools restart
> > > > server (such as nginx) processes. For instance, when we change nginx
> > > > configurations and restart it, it spins up new workers that respect the 
> > > > new
> > > > configuration and closes all listeners on the old workers, resulting in
> > > > in-flight ACK of 3WHS is responded by RST.
> > > > 
> > > > As a workaround for this issue, we can do connection draining by eBPF:
> > > > 
> > > >   1. Before closing a listener, stop routing SYN packets to it.
> > > >   2. Wait enough time for requests to complete 3WHS.
> > > >   3. Accept connections until EAGAIN, then close the listener.
> > > > 
> > > > Although this approach seems to work well, EAGAIN has nothing to do with
> > > > how many requests are still during 3WHS. Thus, we have to know the 
> > > > number
> > > It sounds like the application can already drain the established socket
> > > by accept()?  To solve the problem that you have,
> > > does it mean migrating req_sk (the in-progress 3WHS) is enough?
> > 
> > Ideally, the application needs to drain only the accepted sockets because
> > 3WHS and tying a connection to a listener are just kernel behaviour. Also,
> > there are some cases where we want to apply new configurations as soon as
> > possible such as replacing TLS certificates.
> > 
> > It is possible to drain the established sockets by accept(), but the
> > sockets in the accept queue have not started application sessions yet. So,
> > if we do not drain such sockets (or if the kernel happened to select
> > another listener), we can (could) apply the new settings much earlier.
> > 
> > Moreover, the established sockets may start long-standing connections so
> > that we cannot complete draining for a long time and may have to
> > force-close them (and they would have longer lifetime if they are migrated
> > to a new listener).
> > 
> > 
> > > Applications can already use the bpf prog to do (1) and divert
> > > the SYN to the newly started process.
> > > 
> > > If the application cares about service disruption,
> > > it usually needs to drain the fd(s) that it already has and
> > > finishes serving the pending request (e.g. https) on them anyway.
> > > The time taking to finish those could already be longer than it takes
> > > to drain the accept queue or finish off the 3WHS in reasonable time.
> > > or the application that you have does not need to drain the fd(s) 
> > > it already has and it can close them immediately?
> > 
> > In the point of view of service disruption, I agree with you.
> > 
> > However, I think that there are some situations where we want to apply new
> > configurations rather than to drain sockets with old configurations and
> > that if the kernel migrates sockets automatically, we can simplify user
> > programs.
> This configuration-update(/new-TLS-cert...etc) consideration will be useful
> if it is also included in the cover letter.

I will add this to the next cover letter.


> It sounds like the service that you have is draining the existing
> already-accepted fd(s) which are using the old configuration.
> Those existing fd(s) could also be long life.  Potentially those
> existing fd(s) will be in a much higher number than the
> to-be-accepted fd(s)?

In many cases, yes.


> or you meant in some cases it wants to migrate to the new configuration
> ASAP (e.g. for security reason) even it has to close all the
> already-accepted fds() which are using the old configuration??

And sometimes, yes.
As you expected, for some reasons including security, there are cases we
have t

Re: [RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-21 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Thu, 19 Nov 2020 17:53:46 -0800
> On Fri, Nov 20, 2020 at 07:09:22AM +0900, Kuniyuki Iwashima wrote:
> > From: Martin KaFai Lau 
> > Date: Wed, 18 Nov 2020 15:50:17 -0800
> > > On Tue, Nov 17, 2020 at 06:40:18PM +0900, Kuniyuki Iwashima wrote:
> > > > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > > > which is used only by inet_unhash(). If it is not NULL,
> > > > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > > > sockets from the closing listener to the selected one.
> > > > 
> > > > Listening sockets hold incoming connections as a linked list of struct
> > > > request_sock in the accept queue, and each request has reference to a 
> > > > full
> > > > socket and its listener. In inet_csk_reqsk_queue_migrate(), we unlink 
> > > > the
> > > > requests from the closing listener's queue and relink them to the head 
> > > > of
> > > > the new listener's queue. We do not process each request, so the 
> > > > migration
> > > > completes in O(1) time complexity. However, in the case of TCP_SYN_RECV
> > > > sockets, we will take special care in the next commit.
> > > > 
> > > > By default, we select the last element of socks[] as the new listener.
> > > > This behaviour is based on how the kernel moves sockets in socks[].
> > > > 
> > > > For example, we call listen() for four sockets (A, B, C, D), and close 
> > > > the
> > > > first two by turns. The sockets move in socks[] like below. (See also 
> > > > [1])
> > > > 
> > > >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> > > >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> > > >   socks[2] : C   |  socks[2] : C --'
> > > >   socks[3] : D --'
> > > > 
> > > > Then, if C and D have newer settings than A and B, and each socket has a
> > > > request (a, b, c, d) in their accept queue, we can redistribute old
> > > > requests evenly to new listeners.
> > > I don't think it should emphasize/claim there is a specific way that
> > > the kernel-pick here can redistribute the requests evenly.  It depends on
> > > how the application close/listen.  The userspace can not expect the
> > > ordering of socks[] will behave in a certain way.
> > 
> > I've expected replacing listeners by generations as a general use case.
> > But exactly. Users should not expect the undocumented kernel internal.
> > 
> > 
> > > The primary redistribution policy has to depend on BPF which is the
> > > policy defined by the user based on its application logic (e.g. how
> > > its binary restart work).  The application (and bpf) knows which one
> > > is a dying process and can avoid distributing to it.
> > > 
> > > The kernel-pick could be an optional fallback but not a must.  If the bpf
> > > prog is attached, I would even go further to call bpf to redistribute
> > > regardless of the sysctl, so I think the sysctl is not necessary.
> > 
> > I also think it is just an optional fallback, but to pick out a different
> > listener everytime, choosing the moved socket was reasonable. So the even
> > redistribution for a specific use case is a side effect of such socket
> > selection.
> > 
> > But, users should decide to use either way:
> >   (1) let the kernel select a new listener randomly
> >   (2) select a particular listener by eBPF
> > 
> > I will update the commit message like:
> > The kernel selects a new listener randomly, but as the side effect, it can
> > redistribute packets evenly for a specific case where an application
> > replaces listeners by generations.
> Since there is no feedback on sysctl, so may be something missed
> in the lines.

I'm sorry, I have missed this point while thinking about each reply...


> I don't think this migration logic should depend on a sysctl.
> At least not when a bpf prog is attached that is capable of doing
> migration, it is too fragile to ask user to remember to turn on
> the sysctl before attaching the bpf prog.
> 
> Your use case is to primarily based on bpf prog to pick or only based
> on kernel to do a random pick?

I think we have to care about both cases.

I think we can always enable the migration feature if eBPF prog is not
attached. On the other hand, if BPF_PROG_TYPE_SK_REUSEPORT prog is attached
to select a listener by some rules, along updating the kernel

Re: [RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-19 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Wed, 18 Nov 2020 17:49:13 -0800
> On Tue, Nov 17, 2020 at 06:40:15PM +0900, Kuniyuki Iwashima wrote:
> > The SO_REUSEPORT option allows sockets to listen on the same port and to
> > accept connections evenly. However, there is a defect in the current
> > implementation. When a SYN packet is received, the connection is tied to a
> > listening socket. Accordingly, when the listener is closed, in-flight
> > requests during the three-way handshake and child sockets in the accept
> > queue are dropped even if other listeners could accept such connections.
> > 
> > This situation can happen when various server management tools restart
> > server (such as nginx) processes. For instance, when we change nginx
> > configurations and restart it, it spins up new workers that respect the new
> > configuration and closes all listeners on the old workers, resulting in
> > in-flight ACK of 3WHS is responded by RST.
> > 
> > As a workaround for this issue, we can do connection draining by eBPF:
> > 
> >   1. Before closing a listener, stop routing SYN packets to it.
> >   2. Wait enough time for requests to complete 3WHS.
> >   3. Accept connections until EAGAIN, then close the listener.
> > 
> > Although this approach seems to work well, EAGAIN has nothing to do with
> > how many requests are still during 3WHS. Thus, we have to know the number
> It sounds like the application can already drain the established socket
> by accept()?  To solve the problem that you have,
> does it mean migrating req_sk (the in-progress 3WHS) is enough?

Ideally, the application needs to drain only the accepted sockets because
3WHS and tying a connection to a listener are just kernel behaviour. Also,
there are some cases where we want to apply new configurations as soon as
possible such as replacing TLS certificates.

It is possible to drain the established sockets by accept(), but the
sockets in the accept queue have not started application sessions yet. So,
if we do not drain such sockets (or if the kernel happened to select
another listener), we can (could) apply the new settings much earlier.

Moreover, the established sockets may start long-standing connections so
that we cannot complete draining for a long time and may have to
force-close them (and they would have longer lifetime if they are migrated
to a new listener).


> Applications can already use the bpf prog to do (1) and divert
> the SYN to the newly started process.
> 
> If the application cares about service disruption,
> it usually needs to drain the fd(s) that it already has and
> finishes serving the pending request (e.g. https) on them anyway.
> The time taking to finish those could already be longer than it takes
> to drain the accept queue or finish off the 3WHS in reasonable time.
> or the application that you have does not need to drain the fd(s) 
> it already has and it can close them immediately?

In the point of view of service disruption, I agree with you.

However, I think that there are some situations where we want to apply new
configurations rather than to drain sockets with old configurations and
that if the kernel migrates sockets automatically, we can simplify user
programs.



Re: [RFC PATCH bpf-next 7/8] bpf: Call bpf_run_sk_reuseport() for socket migration.

2020-11-19 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Wed, 18 Nov 2020 17:00:45 -0800
> On Tue, Nov 17, 2020 at 06:40:22PM +0900, Kuniyuki Iwashima wrote:
> > This patch makes it possible to select a new listener for socket migration
> > by eBPF.
> > 
> > The noteworthy point is that we select a listening socket in
> > reuseport_detach_sock() and reuseport_select_sock(), but we do not have
> > struct skb in the unhash path.
> > 
> > Since we cannot pass skb to the eBPF program, we run only the
> > BPF_PROG_TYPE_SK_REUSEPORT program by calling bpf_run_sk_reuseport() with
> > skb NULL. So, some fields derived from skb are also NULL in the eBPF
> > program.
> More things need to be considered here when skb is NULL.
> 
> Some helpers are probably assuming skb is not NULL.
> 
> Also, the sk_lookup in filter.c is actually passing a NULL skb to avoid
> doing the reuseport select.

Honestly, I have missed this point...
I wanted users to reuse the same eBPF program seamlessly, but it seems unsafe.


> > Moreover, we can cancel migration by returning SK_DROP. This feature is
> > useful when listeners have different settings at the socket API level or
> > when we want to free resources as soon as possible.
> > 
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  net/core/filter.c  | 26 +-
> >  net/core/sock_reuseport.c  | 23 ---
> >  net/ipv4/inet_hashtables.c |  2 +-
> >  3 files changed, 42 insertions(+), 9 deletions(-)
> > 
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 01e28f283962..ffc4591878b8 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -8914,6 +8914,22 @@ static u32 xdp_convert_ctx_access(enum 
> > bpf_access_type type,
> > SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
> >  BPF_FIELD_SIZEOF(NS, NF), 0)
> >  
> > +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF_OR_NULL(S, NS, F, NF, SIZE, 
> > OFF)  \
> > +   do {
> > \
> > +   *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,  
> > \
> > + si->src_reg, offsetof(S, F)); 
> > \
> > +   *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);  
> > \
> Although it may not matter much, always doing this check seems not very ideal
> considering the fast path will always have skb and only the slow
> path (accept-queue migrate) has skb is NULL.  I think the req_sk usually
> has the skb also except the timer one.

Yes, but the migration happens only when/after the listener is closed, so
I think it does not occur so frequently and will not be a problem.


> First thought is to create a temp skb but it has its own issues.
> or it may actually belong to a new prog type.  However, lets keep
> exploring possible options (including NULL skb).

I also thought of those two ideas, but the former would be a bit complicated,
and the latter makes users implement a new eBPF program. I did not want users
to struggle any more, so I selected the NULL skb. However, it is not safe, so
adding a new prog type seems to be the better way.


Re: [RFC PATCH bpf-next 6/8] bpf: Add cookie in sk_reuseport_md.

2020-11-19 Thread Kuniyuki Iwashima
From:   Martin KaFai Lau 
Date:   Wed, 18 Nov 2020 16:11:54 -0800
> On Tue, Nov 17, 2020 at 06:40:21PM +0900, Kuniyuki Iwashima wrote:
> > We will call sock_reuseport.prog for socket migration in the next commit,
> > so the eBPF program has to know which listener is closing in order to
> > select the new listener.
> > 
> > Currently, we can get a unique ID for each listener in the userspace by
> > calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
> > This patch exposes the ID to the eBPF program.
> > 
> > Reviewed-by: Benjamin Herrenschmidt 
> > Signed-off-by: Kuniyuki Iwashima 
> > ---
> >  include/linux/bpf.h| 1 +
> >  include/uapi/linux/bpf.h   | 1 +
> >  net/core/filter.c  | 8 
> >  tools/include/uapi/linux/bpf.h | 1 +
> >  4 files changed, 11 insertions(+)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 581b2a2e78eb..c0646eceffa2 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1897,6 +1897,7 @@ struct sk_reuseport_kern {
> > u32 hash;
> > u32 reuseport_id;
> > bool bind_inany;
> > +   u64 cookie;
> >  };
> >  bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type 
> > type,
> >   struct bpf_insn_access_aux *info);
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 162999b12790..3fcddb032838 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -4403,6 +4403,7 @@ struct sk_reuseport_md {
> > __u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
> > __u32 bind_inany;   /* Is sock bound to an INANY address? */
> > __u32 hash; /* A hash of the packet 4 tuples */
> > +   __u64 cookie;   /* ID of the listener in map */
> Instead of only adding the cookie of a sk, lets make the sk pointer available:
> 
>   __bpf_md_ptr(struct bpf_sock *, sk);
> 
> and then use the BPF_FUNC_get_socket_cookie to get the cookie.
> 
> Other fields of the sk can also be directly accessed too once
> the sk pointer is available.

Oh, I did not know BPF_FUNC_get_socket_cookie.
I will add the sk pointer and use the helper function in the next spin!
Thank you.
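
A minimal sketch of that combination, assuming the sk field and the cookie
helper support that the later series adds; the map layout and index policy
are hypothetical.

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* hypothetical map keyed by the closing listener's cookie */
  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(max_entries, 256);
          __type(key, __u64);
          __type(value, __u32);
  } migrate_map SEC(".maps");

  SEC("sk_reuseport/migrate")
  int prog_migrate(struct sk_reuseport_md *reuse_md)
  {
          __u64 cookie = bpf_get_socket_cookie(reuse_md->sk);
          __u32 *index = bpf_map_lookup_elem(&migrate_map, &cookie);

          if (!index)
                  return SK_DROP; /* unknown listener: cancel the migration */

          /* ... select reuseport_map[*index] here ... */
          return SK_PASS;
  }

  char _license[] SEC("license") = "GPL";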


Re: [RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-19 Thread Kuniyuki Iwashima
From: Martin KaFai Lau 
Date: Wed, 18 Nov 2020 15:50:17 -0800
> On Tue, Nov 17, 2020 at 06:40:18PM +0900, Kuniyuki Iwashima wrote:
> > This patch lets reuseport_detach_sock() return a pointer of struct sock,
> > which is used only by inet_unhash(). If it is not NULL,
> > inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
> > sockets from the closing listener to the selected one.
> > 
> > Listening sockets hold incoming connections as a linked list of struct
> > request_sock in the accept queue, and each request has reference to a full
> > socket and its listener. In inet_csk_reqsk_queue_migrate(), we unlink the
> > requests from the closing listener's queue and relink them to the head of
> > the new listener's queue. We do not process each request, so the migration
> > completes in O(1) time complexity. However, in the case of TCP_SYN_RECV
> > sockets, we will take special care in the next commit.
> > 
> > By default, we select the last element of socks[] as the new listener.
> > This behaviour is based on how the kernel moves sockets in socks[].
> > 
> > For example, we call listen() for four sockets (A, B, C, D), and close the
> > first two by turns. The sockets move in socks[] like below. (See also [1])
> > 
> >   socks[0] : A <-.  socks[0] : D  socks[0] : D
> >   socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
> >   socks[2] : C   |  socks[2] : C --'
> >   socks[3] : D --'
> > 
> > Then, if C and D have newer settings than A and B, and each socket has a
> > request (a, b, c, d) in their accept queue, we can redistribute old
> > requests evenly to new listeners.
> I don't think it should emphasize/claim there is a specific way that
> the kernel-pick here can redistribute the requests evenly.  It depends on
> how the application close/listen.  The userspace can not expect the
> ordering of socks[] will behave in a certain way.

I expected replacing listeners by generations to be a common use case.
But exactly, users should not rely on undocumented kernel internals.


> The primary redistribution policy has to depend on BPF which is the
> policy defined by the user based on its application logic (e.g. how
> its binary restart work).  The application (and bpf) knows which one
> is a dying process and can avoid distributing to it.
> 
> The kernel-pick could be an optional fallback but not a must.  If the bpf
> prog is attached, I would even go further to call bpf to redistribute
> regardless of the sysctl, so I think the sysctl is not necessary.

I also think it is just an optional fallback, but choosing the moved socket
was a reasonable way to pick out a different listener every time. So the
even redistribution for a specific use case is a side effect of that socket
selection.

But, users should decide to use either way:
  (1) let the kernel select a new listener randomly
  (2) select a particular listener by eBPF

I will update the commit message like this:
The kernel selects a new listener randomly, but as a side effect, it can
redistribute packets evenly for the specific case where an application
replaces listeners by generations.


Re: [RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-19 Thread Kuniyuki Iwashima
From:   Eric Dumazet 
Date:   Wed, 18 Nov 2020 17:25:44 +0100
> On 11/17/20 10:40 AM, Kuniyuki Iwashima wrote:
> > The SO_REUSEPORT option allows sockets to listen on the same port and to
> > accept connections evenly. However, there is a defect in the current
> > implementation. When a SYN packet is received, the connection is tied to a
> > listening socket. Accordingly, when the listener is closed, in-flight
> > requests during the three-way handshake and child sockets in the accept
> > queue are dropped even if other listeners could accept such connections.
> > 
> > This situation can happen when various server management tools restart
> > server (such as nginx) processes. For instance, when we change nginx
> > configurations and restart it, it spins up new workers that respect the new
> > configuration and closes all listeners on the old workers, resulting in
> > the in-flight ACKs of the 3WHS being answered with RSTs.
> > 
> 
> I know some programs are simply removing a listener from the group,
> so that they no longer handle new SYN packets,
> and wait until all timers or 3WHS have completed before closing them.
> 
> They pass fd of newly accepted children to more recent programs using af_unix 
> fd passing,
> while in this draining mode.

Just out of curiosity, could you tell me which software does this so that I
can study it further?


> Quite frankly, mixing eBPF in the picture is distracting.

I agree.
Also, I think eBPF itself is not necessary in many cases, and I want to
make user programs simpler with this patchset.

The SO_REUSEPORT implementation is excellent for improving scalability. On
the other hand, as a trade-off, users have to know in detail how the kernel
handles SYN packets and have to implement connection draining with eBPF.


> It seems you want some way to transfer request sockets (and/or not yet 
> accepted established ones)
> from fd1 to fd2, isn't it something that should be discussed independently ?

I understand you to be asking whether the issue and the way to transfer
sockets should be discussed independently. Please correct me if I have
misunderstood your question.

The kernel handles the 3WHS, and users cannot even observe it (without
eBPF). Many users believe SO_REUSEPORT should ideally distribute all
connections across the available listeners, but in reality some connections
can be aborted silently. Some users may think that if the kernel had
selected another listener, those connections would not have been dropped.

The root cause is within the kernel, so the issue should be addressed in
kernel space and should not be visible to userspace. In order not to make
users bother with implementing something new, I want to fix the root cause
by transferring sockets automatically so that users need not care about the
kernel implementation or connection draining.

Moreover, if possible, I did not want to mix eBPF into the issue. But there
may be cases where different applications listen on the same port and eBPF
routes packets to each of them by some rule. In such cases, redistributing
sockets without the user's intent would break the applications. This
patchset works in many cases, but to handle such cases, I added the eBPF
part.


RE: [RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-19 Thread Kuniyuki Iwashima
From:   David Laight 
Date:   Wed, 18 Nov 2020 09:18:24 +
> From: Kuniyuki Iwashima
> > Sent: 17 November 2020 09:40
> > 
> > The SO_REUSEPORT option allows sockets to listen on the same port and to
> > accept connections evenly. However, there is a defect in the current
> > implementation. When a SYN packet is received, the connection is tied to a
> > listening socket. Accordingly, when the listener is closed, in-flight
> > requests during the three-way handshake and child sockets in the accept
> > queue are dropped even if other listeners could accept such connections.
> > 
> > This situation can happen when various server management tools restart
> > server (such as nginx) processes. For instance, when we change nginx
> > configurations and restart it, it spins up new workers that respect the new
> > configuration and closes all listeners on the old workers, resulting in
> > the in-flight ACKs of the 3WHS being answered with RSTs.
> 
> Can't you do something to stop new connections being queued (like
> setting the 'backlog' to zero), then carry on doing accept()s
> for a guard time (or until the queue length is zero) before finally
> closing the listening socket.

Yes, but with eBPF.
Some ideas were suggested and discussed thoroughly in the thread below,
resulting in connection draining by eBPF being merged.
https://lore.kernel.org/netdev/1443313848-751-1-git-send-email-tolga.cey...@gmail.com/


Also, setting the backlog to zero does not work well.
https://lore.kernel.org/netdev/1447262610.17135.114.ca...@edumazet-glaptop2.roam.corp.google.com/

---8<---
From: Eric Dumazet 
Subject: Re: [PATCH 1/1] net: Add SO_REUSEPORT_LISTEN_OFF socket option as
 drain mode
Date: Wed, 11 Nov 2015 09:23:30 -0800
> Actually listen(fd, 0) is not going to work well :
> 
> For request_sock that were created (by incoming SYN packet) before this
> listen(fd, 0) call, the 3rd packet (ACK coming from client) would not be
> able to create a child attached to this listener.
> 
> sk_acceptq_is_full() test in tcp_v4_syn_recv_sock() would simply drop
> the thing.
---8<---


[RFC PATCH bpf-next 1/8] net: Introduce net.ipv4.tcp_migrate_req.

2020-11-17 Thread Kuniyuki Iwashima
This commit adds a new sysctl option: net.ipv4.tcp_migrate_req. If this
option is enabled and we call listen() for SO_REUSEPORT-enabled sockets and
then close one of them, we will be able to migrate its child sockets to
another listener.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 Documentation/networking/ip-sysctl.rst | 15 +++
 include/net/netns/ipv4.h   |  1 +
 net/ipv4/sysctl_net_ipv4.c |  9 +
 3 files changed, 25 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.rst 
b/Documentation/networking/ip-sysctl.rst
index dd2b12a32b73..4116771bf5ef 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -712,6 +712,21 @@ tcp_syncookies - INTEGER
network connections you can set this knob to 2 to enable
unconditionally generation of syncookies.
 
+tcp_migrate_req - INTEGER
+   By default, when a listening socket is closed, child sockets are also
+   closed. If it has SO_REUSEPORT enabled, the dropped connections should
+   have been accepted by other listeners on the same port. This option
+   makes it possible to migrate child sockets to another listener when
+   calling close() or shutdown().
+
+   Default: 0
+
+   Note that the source and destination listeners _must_ have the same
+   settings at the socket API level. If there are different kinds of
+   sockets on the port, disable this option or use
+   BPF_PROG_TYPE_SK_REUSEPORT program to select the correct socket by
+   bpf_sk_select_reuseport() or to cancel migration by returning SK_DROP.
+
 tcp_fastopen - INTEGER
Enable TCP Fast Open (RFC7413) to send and accept data in the opening
SYN packet.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 8e4fcac4df72..a3edc30d6a63 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -132,6 +132,7 @@ struct netns_ipv4 {
int sysctl_tcp_syn_retries;
int sysctl_tcp_synack_retries;
int sysctl_tcp_syncookies;
+   int sysctl_tcp_migrate_req;
int sysctl_tcp_reordering;
int sysctl_tcp_retries1;
int sysctl_tcp_retries2;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3e5f4f2e705e..6b76298fa271 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -933,6 +933,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler   = proc_dointvec
},
 #endif
+   {
+   .procname   = "tcp_migrate_req",
.data   = &init_net.ipv4.sysctl_tcp_migrate_req,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = SYSCTL_ZERO,
+   .extra2 = SYSCTL_ONE
+   },
{
.procname   = "tcp_reordering",
.data   = &init_net.ipv4.sysctl_tcp_reordering,
-- 
2.17.2 (Apple Git-113)
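
Since the next patch in this series samples the sysctl when the reuseport
group is allocated (see reuseport_alloc() below), the knob has to be set
before the first listen(). A minimal userspace sketch of doing that (the
helper name is arbitrary, error handling is trimmed, and the path is the
one documented above):

---8<---
#include <stdio.h>

/* Enable net.ipv4.tcp_migrate_req in the current netns before the
 * SO_REUSEPORT listeners are created. Sketch only; a real program
 * should check errors or set the sysctl from its init scripts.
 */
static int enable_tcp_migrate_req(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_migrate_req", "w");

	if (!f)
		return -1;
	fputs("1", f);
	fclose(f);
	return 0;
}
---8<---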



[RFC PATCH bpf-next 2/8] tcp: Keep TCP_CLOSE sockets in the reuseport group.

2020-11-17 Thread Kuniyuki Iwashima
This patch is a preparation for migrating incoming connections in the
later commits; it adds two fields (migrate_req and num_closed_socks) to
struct sock_reuseport to keep TCP_CLOSE sockets in the reuseport group.

If migrate_req is 1 and we close a listening socket, we can migrate its
connections to another listener in the same reuseport group. Then we have
to handle two kinds of child sockets: one that the listening socket has a
reference to, and one that it does not.

The former are the TCP_ESTABLISHED/TCP_SYN_RECV sockets, which sit in the
accept queue of their listening socket, so we can pop them out and push
them into another listener's queue at close() or shutdown() syscalls. The
latter, the TCP_NEW_SYN_RECV sockets, are still in the middle of the
three-way handshake and not in the accept queue, so we cannot access them
at close() or shutdown() syscalls. Accordingly, we have to migrate such
immature sockets after their listening socket has been closed.

Currently, if their listening socket has been closed, TCP_NEW_SYN_RECV
sockets are freed when the final ACK is received or SYN+ACKs are
retransmitted. At that time, if we could select a new listener from the
same reuseport group, no connection would be aborted. However, this is
impossible because reuseport_detach_sock() sets sk_reuseport_cb to NULL and
forbids access to the reuseport group from closed sockets.

This patch allows TCP_CLOSE sockets to remain in the reuseport group and
to keep access to it while any child socket still references them. The
point is that reuseport_detach_sock() is called twice, from inet_unhash()
and sk_destruct(). The first call moves the socket backwards in socks[]
and increments num_closed_socks. Later, when all migrated connections have
been accepted, the second call removes the socket from socks[], decrements
num_closed_socks, and sets sk_reuseport_cb to NULL.

With this change, closed sockets can keep sk_reuseport_cb until all child
requests have been freed or accepted. Consequently, calling listen() after
shutdown() can cause EADDRINUSE or EBUSY in reuseport_add_sock() or
inet_csk_bind_conflict(), which expect that such sockets do not have a
reuseport group. Therefore, this patch loosens those validation rules so
that the socket can listen again if it has the same reuseport group as the
other listening sockets.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/sock_reuseport.h|  6 ++-
 net/core/sock_reuseport.c   | 83 +++--
 net/ipv4/inet_connection_sock.c |  7 ++-
 3 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index 505f1e18e9bf..ade3af55c91f 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -13,8 +13,9 @@ extern spinlock_t reuseport_lock;
 struct sock_reuseport {
struct rcu_head rcu;
 
-   u16 max_socks;  /* length of socks */
-   u16 num_socks;  /* elements in socks */
+   u16 max_socks;  /* length of socks */
+   u16 num_socks;  /* elements in socks */
+   u16 num_closed_socks;   /* closed elements in socks */
/* The last synq overflow event timestamp of this
 * reuse->socks[] group.
 */
@@ -23,6 +24,7 @@ struct sock_reuseport {
unsigned intreuseport_id;
unsigned intbind_inany:1;
unsigned inthas_conns:1;
+   unsigned intmigrate_req:1;
struct bpf_prog __rcu   *prog;  /* optional BPF sock selector */
struct sock *socks[];   /* array of sock pointers */
 };
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index bbdd3c7b6cb5..01a8b4ba39d7 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -36,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int 
max_socks)
 int reuseport_alloc(struct sock *sk, bool bind_inany)
 {
struct sock_reuseport *reuse;
+   struct net *net = sock_net(sk);
int id, ret = 0;
 
/* bh lock used since this function call may precede hlist lock in
@@ -75,6 +76,8 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse->socks[0] = sk;
reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
+   reuse->migrate_req = sk->sk_protocol == IPPROTO_TCP ?
+   net->ipv4.sysctl_tcp_migrate_req : 0;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
 
 out:
@@ -98,16 +101,22 @@ static struct sock_reuseport *reuseport_grow(struct 
sock_reuseport *reuse)
return NULL;
 
more_reuse->num_socks = reuse->num_socks;
+   more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport

[RFC PATCH bpf-next 6/8] bpf: Add cookie in sk_reuseport_md.

2020-11-17 Thread Kuniyuki Iwashima
We will call sock_reuseport.prog for socket migration in the next commit,
so the eBPF program has to know which listener is closing in order to
select the new listener.

Currently, we can get a unique ID for each listener in the userspace by
calling bpf_map_lookup_elem() for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map.
This patch exposes the ID to the eBPF program.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/linux/bpf.h| 1 +
 include/uapi/linux/bpf.h   | 1 +
 net/core/filter.c  | 8 
 tools/include/uapi/linux/bpf.h | 1 +
 4 files changed, 11 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 581b2a2e78eb..c0646eceffa2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1897,6 +1897,7 @@ struct sk_reuseport_kern {
u32 hash;
u32 reuseport_id;
bool bind_inany;
+   u64 cookie;
 };
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
  struct bpf_insn_access_aux *info);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 162999b12790..3fcddb032838 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4403,6 +4403,7 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   __u64 cookie;   /* ID of the listener in map */
 };
 
 #define BPF_TAG_SIZE   8
diff --git a/net/core/filter.c b/net/core/filter.c
index 2ca5eecebacf..01e28f283962 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9862,6 +9862,7 @@ static void bpf_init_reuseport_kern(struct 
sk_reuseport_kern *reuse_kern,
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
reuse_kern->bind_inany = reuse->bind_inany;
+   reuse_kern->cookie = sock_gen_cookie(sk);
 }
 
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock 
*sk,
@@ -10010,6 +10011,9 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
 
+   case bpf_ctx_range(struct sk_reuseport_md, cookie):
+   return size == sizeof(__u64);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
@@ -10082,6 +10086,10 @@ static u32 sk_reuseport_convert_ctx_access(enum 
bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+   case offsetof(struct sk_reuseport_md, cookie):
+   SK_REUSEPORT_LOAD_FIELD(cookie);
+   break;
}
 
return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 162999b12790..3fcddb032838 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4403,6 +4403,7 @@ struct sk_reuseport_md {
__u32 ip_protocol;  /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
__u32 bind_inany;   /* Is sock bound to an INANY address? */
__u32 hash; /* A hash of the packet 4 tuples */
+   __u64 cookie;   /* ID of the listener in map */
 };
 
 #define BPF_TAG_SIZE   8
-- 
2.17.2 (Apple Git-113)
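
On the userspace side mentioned in the commit message, looking up an index
in a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY returns the stored listener's socket
cookie, i.e. the same ID the new field exposes to the program. A small
sketch (the helper name is arbitrary; sockarray_fd and idx are assumed to
come from the caller):

---8<---
#include <bpf/bpf.h>
#include <linux/types.h>

/* Return the cookie of the listener stored at 'idx' in a
 * BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; this matches the value that
 * sk_reuseport_md->cookie exposes to the eBPF program.
 */
static int listener_cookie(int sockarray_fd, int idx, __u64 *cookie)
{
	return bpf_map_lookup_elem(sockarray_fd, &idx, cookie);
}
---8<---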



[RFC PATCH bpf-next 4/8] tcp: Migrate TFO requests causing RST during TCP_SYN_RECV.

2020-11-17 Thread Kuniyuki Iwashima
A TFO request socket is only freed after BOTH 3WHS has completed (or
aborted) and the child socket has been accepted (or its listener closed).
Hence, depending on the order, there can be two kinds of request sockets in
the accept queue.

  3WHS -> accept : TCP_ESTABLISHED
  accept -> 3WHS : TCP_SYN_RECV

Unlike for a TCP_ESTABLISHED socket, accept() does not free the request
socket of a TCP_SYN_RECV socket. It is freed later at
reqsk_fastopen_remove(), which also accesses request_sock.rsk_listener. So,
in order to complete TFO socket migration, we have to set the current
listener in the request at accept() before reqsk_fastopen_remove().

Moreover, if a TFO request caused an RST before the 3WHS completed, it is
held in the listener's TFO queue to prevent a DDoS attack. Thus, we also
have to migrate the requests in the TFO queue.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 net/ipv4/inet_connection_sock.c | 35 -
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 583db7e2b1da..398c5c708bc5 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -500,6 +500,16 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
tcp_rsk(req)->tfo_listener) {
spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
+   if (req->rsk_listener != sk) {
+   /* TFO request was migrated to another listener so
+* the new listener must be used in reqsk_fastopen_remove()
+* to hold requests which cause RST.
+*/
+   sock_put(req->rsk_listener);
+   sock_hold(sk);
+   req->rsk_listener = sk;
+   }
+
/* We are still waiting for the final ACK from 3WHS
 * so can't free req now. Instead, we set req->sk to
 * NULL to signify that the child socket is taken
@@ -954,7 +964,6 @@ static void inet_child_forget(struct sock *sk, struct 
request_sock *req,
 
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
-   BUG_ON(sk != req->rsk_listener);
 
/* Paranoid, to prevent race condition if
 * an inbound pkt destined for child is
@@ -995,6 +1004,7 @@ EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
 {
struct request_sock_queue *old_accept_queue, *new_accept_queue;
+   struct fastopen_queue *old_fastopenq, *new_fastopenq;

old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
@@ -1019,6 +1029,29 @@ void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)

spin_unlock(&new_accept_queue->rskq_lock);
spin_unlock(&old_accept_queue->rskq_lock);
+
+   old_fastopenq = &old_accept_queue->fastopenq;
+   new_fastopenq = &new_accept_queue->fastopenq;
+
+   spin_lock_bh(&old_fastopenq->lock);
+   spin_lock_bh(&new_fastopenq->lock);
+
+   new_fastopenq->qlen += old_fastopenq->qlen;
+   old_fastopenq->qlen = 0;
+
+   if (old_fastopenq->rskq_rst_head) {
+   if (new_fastopenq->rskq_rst_head)
+   old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+   else
+   old_fastopenq->rskq_rst_tail = new_fastopenq->rskq_rst_tail;
+
+   new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
+   old_fastopenq->rskq_rst_head = NULL;
+   old_fastopenq->rskq_rst_tail = NULL;
+   }
+
+   spin_unlock_bh(&new_fastopenq->lock);
+   spin_unlock_bh(&old_fastopenq->lock);
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
 
-- 
2.17.2 (Apple Git-113)
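
As a side note, the TFO path above is only exercised by listeners that opt
in to Fast Open; a listener typically does so with the TCP_FASTOPEN socket
option. A sketch (the helper name and the queue length 16 are arbitrary
example values, not taken from this series):

---8<---
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Enable TCP Fast Open on a listener so that TFO request sockets
 * (the kind this patch migrates) can exist; qlen caps the number of
 * pending TFO requests and 16 is just an example.
 */
static int enable_tfo(int listen_fd)
{
	int qlen = 16;

	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN,
			  &qlen, sizeof(qlen));
}
---8<---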



[RFC PATCH bpf-next 7/8] bpf: Call bpf_run_sk_reuseport() for socket migration.

2020-11-17 Thread Kuniyuki Iwashima
This patch makes it possible to select a new listener for socket migration
by eBPF.

The noteworthy point is that we select a listening socket in
reuseport_detach_sock() and reuseport_select_sock(), but we do not have a
struct sk_buff in the unhash path.

Since we cannot pass an skb to the eBPF program, we run only the
BPF_PROG_TYPE_SK_REUSEPORT program by calling bpf_run_sk_reuseport() with a
NULL skb. Consequently, the fields derived from the skb are also NULL in
the eBPF program.

Moreover, we can cancel migration by returning SK_DROP. This feature is
useful when listeners have different settings at the socket API level or
when we want to free resources as soon as possible.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 net/core/filter.c  | 26 +-
 net/core/sock_reuseport.c  | 23 ---
 net/ipv4/inet_hashtables.c |  2 +-
 3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 01e28f283962..ffc4591878b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8914,6 +8914,22 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type 
type,
SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
 BPF_FIELD_SIZEOF(NS, NF), 0)
 
+#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF_OR_NULL(S, NS, F, NF, SIZE, OFF) \
+   do {  \
+   *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,  \
+ si->src_reg, offsetof(S, F)); \
+   *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);  \
+   *insn++ = BPF_LDX_MEM(  \
+   SIZE, si->dst_reg, si->dst_reg, \
+   bpf_target_off(NS, NF, sizeof_field(NS, NF), \
+  target_size) \
+   + OFF); \
+   } while (0)
+
+#define SOCK_ADDR_LOAD_NESTED_FIELD_OR_NULL(S, NS, F, NF)  \
+   SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF_OR_NULL(S, NS, F, NF,  \
+BPF_FIELD_SIZEOF(NS, NF), 0)
+
 /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
  * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
  *
@@ -9858,7 +9874,7 @@ static void bpf_init_reuseport_kern(struct 
sk_reuseport_kern *reuse_kern,
reuse_kern->skb = skb;
reuse_kern->sk = sk;
reuse_kern->selected_sk = NULL;
-   reuse_kern->data_end = skb->data + skb_headlen(skb);
+   reuse_kern->data_end = skb ? skb->data + skb_headlen(skb) : NULL;
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
reuse_kern->bind_inany = reuse->bind_inany;
@@ -10039,10 +10055,10 @@ sk_reuseport_is_valid_access(int off, int size,
})
 
 #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
-   SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,   \
-   struct sk_buff, \
-   skb,\
-   SKB_FIELD)
+   SOCK_ADDR_LOAD_NESTED_FIELD_OR_NULL(struct sk_reuseport_kern,   \
+   struct sk_buff, \
+   skb,\
+   SKB_FIELD)
 
 #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)   \
SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,   \
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 74a46197854b..903f78ab35c3 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -224,6 +224,7 @@ struct sock *reuseport_detach_sock(struct sock *sk)
 {
struct sock_reuseport *reuse;
struct sock *nsk = NULL;
+   struct bpf_prog *prog;
int i;
 
spin_lock_bh(&reuseport_lock);
@@ -249,8 +250,16 @@ struct sock *reuseport_detach_sock(struct sock *sk)
reuse->socks[i] = reuse->socks[reuse->num_socks];
 
if (reuse->migrate_req) {
-   if (reuse->num_socks)
-   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+   if (reuse->num_socks) {
+   prog = rcu_dereference(reuse->prog);
+   if (prog && prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+   nsk = bpf_run_sk_reuseport(reuse, sk, 
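
Putting the pieces of this patch together, a program used with this series
has to cope with the NULL skb on the migration path. A rough sketch only:
the len == 0 check relies on the _OR_NULL loads above reading skb-derived
fields as zero when skb is NULL, and the map names are illustrative.

---8<---
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
	__uint(max_entries, 256);
	__type(key, int);
	__type(value, __u64);
} reuseport_map SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 256);
	__type(key, __u64);
	__type(value, int);
} migrate_map SEC(".maps");

SEC("sk_reuseport")
int migrate_or_select(struct sk_reuseport_md *reuse_md)
{
	__u64 cookie;
	int *target;

	/* On the migration path skb is NULL, so skb-derived fields such
	 * as len read as 0; a real packet always has a non-zero len.
	 */
	if (reuse_md->len)
		return SK_PASS;		/* normal lookup: keep kernel choice */

	cookie = reuse_md->cookie;	/* closing listener, from patch 6/8 */
	target = bpf_map_lookup_elem(&migrate_map, &cookie);
	if (!target)
		return SK_DROP;		/* no rule: cancel this migration */

	if (bpf_sk_select_reuseport(reuse_md, &reuseport_map, target, 0))
		return SK_DROP;

	return SK_PASS;
}

char _license[] SEC("license") = "GPL";
---8<---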

[RFC PATCH bpf-next 8/8] bpf: Test BPF_PROG_TYPE_SK_REUSEPORT for socket migration.

2020-11-17 Thread Kuniyuki Iwashima
This patch adds a test for net.ipv4.tcp_migrate_req with eBPF.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 .../bpf/prog_tests/migrate_reuseport.c| 175 ++
 .../bpf/progs/test_migrate_reuseport_kern.c   |  53 ++
 2 files changed, 228 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
 create mode 100644 
tools/testing/selftests/bpf/progs/test_migrate_reuseport_kern.c

diff --git a/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c 
b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
new file mode 100644
index ..fb182e575371
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check if we can migrate child sockets.
+ *
+ *   1. call listen() for 5 server sockets.
+ *   2. update a map to migrate all child sockets
+ *to the last server socket (map[cookie] = 4)
+ *   3. call connect() for 25 client sockets.
+ *   4. call close() for the first 4 server sockets.
+ *   5. call receive() for the last server socket.
+ *
+ * Author: Kuniyuki Iwashima 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NUM_SOCKS 5
+#define LOCALHOST "127.0.0.1"
+#define err_exit(condition, message) \
+   do {  \
+   if (condition) {  \
+   perror("ERROR: " message " ");\
+   setup_sysctl(0);  \
+   exit(1);  \
+   } \
+   } while (0)
+
+__u64 server_fds[NUM_SOCKS];
+int prog_fd, map_fd, migrate_map_fd;
+
+void setup_sysctl(int value)
+{
+   FILE *file;
+
+   file = fopen("/proc/sys/net/ipv4/tcp_migrate_req", "w");
+   fprintf(file, "%d", value);
+   fclose(file);
+}
+
+void setup_bpf(void)
+{
+   struct bpf_object *obj;
+   struct bpf_program *prog;
+   struct bpf_map *map, *migrate_map;
+   int err;
+
+   obj = bpf_object__open("test_migrate_reuseport_kern.o");
+   err_exit(libbpf_get_error(obj), "opening BPF object file failed");
+
+   err = bpf_object__load(obj);
+   err_exit(err, "loading BPF object failed");
+
+   prog = bpf_program__next(NULL, obj);
+   err_exit(!prog, "loading BPF program failed");
+
+   map = bpf_object__find_map_by_name(obj, "reuseport_map");
+   err_exit(!map, "loading BPF reuseport_map failed");
+
+   migrate_map = bpf_object__find_map_by_name(obj, "migrate_map");
+   err_exit(!map, "loading BPF migrate_map failed");
+
+   prog_fd = bpf_program__fd(prog);
+   map_fd = bpf_map__fd(map);
+   migrate_map_fd = bpf_map__fd(migrate_map);
+}
+
+void test_listen(void)
+{
+   struct sockaddr_in addr;
+   socklen_t addr_len = sizeof(addr);
+   int i, err, optval = 1, migrated_to = NUM_SOCKS - 1;
+   __u64 value;
+
+   addr.sin_family = AF_INET;
+   addr.sin_port = htons(80);
+   inet_pton(AF_INET, LOCALHOST, &addr.sin_addr.s_addr);
+
+   for (i = 0; i < NUM_SOCKS; i++) {
+   server_fds[i] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+   err_exit(server_fds[i] == -1, "socket() for listener sockets failed");
+
+   err = setsockopt(server_fds[i], SOL_SOCKET, SO_REUSEPORT,
+&optval, sizeof(optval));
+   err_exit(err == -1, "setsockopt() for SO_REUSEPORT failed");
+
+   if (i == 0) {
+   err = setsockopt(server_fds[i], SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
+&prog_fd, sizeof(prog_fd));
+   err_exit(err == -1, "setsockopt() for SO_ATTACH_REUSEPORT_EBPF failed");
+   }
+
+   err = bind(server_fds[i], (struct sockaddr *)&addr, addr_len);
+   err_exit(err == -1, "bind() failed");
+
+   err = listen(server_fds[i], 32);
+   err_exit(err == -1, "listen() failed");
+
+   err = bpf_map_update_elem(map_fd, &i, &server_fds[i], BPF_NOEXIST);
+   err_exit(err == -1, "updating BPF reuseport_map failed");
+
+   err = bpf_map_lookup_elem(map_fd, &i, &value);
+   err_exit(err == -1, "looking up BPF reuseport_map failed");
+
+   printf("fd[%d] (cookie: %llu) -> fd[%d]\n", i, value, migrated_to);
+   err = bpf_map_update_elem(migrate_map_fd, &value, &migrated_to, BPF_NOEXIST);
+   err_exit(err == -1, "updating BPF migrate_map failed");
+   }
+}
+
+void t

[RFC PATCH bpf-next 3/8] tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.

2020-11-17 Thread Kuniyuki Iwashima
This patch lets reuseport_detach_sock() return a pointer of struct sock,
which is used only by inet_unhash(). If it is not NULL,
inet_csk_reqsk_queue_migrate() migrates TCP_ESTABLISHED/TCP_SYN_RECV
sockets from the closing listener to the selected one.

Listening sockets hold incoming connections as a linked list of struct
request_sock in the accept queue, and each request has reference to a full
socket and its listener. In inet_csk_reqsk_queue_migrate(), we unlink the
requests from the closing listener's queue and relink them to the head of
the new listener's queue. We do not process each request, so the migration
completes in O(1) time complexity. However, in the case of TCP_SYN_RECV
sockets, we will take special care in the next commit.

By default, we select the last element of socks[] as the new listener.
This behaviour is based on how the kernel moves sockets in socks[].

For example, we call listen() for four sockets (A, B, C, D), and close the
first two by turns. The sockets move in socks[] like below. (See also [1])

  socks[0] : A <-.  socks[0] : D  socks[0] : D
  socks[1] : B   |  =>  socks[1] : B <-.  =>  socks[1] : C
  socks[2] : C   |  socks[2] : C --'
  socks[3] : D --'

Then, if C and D have newer settings than A and B, and each socket has a
request (a, b, c, d) in their accept queue, we can redistribute old
requests evenly to new listeners.

  socks[0] : A (a) <-.  socks[0] : D (a + d)  socks[0] : D (a + d)
  socks[1] : B (b)   |  =>  socks[1] : B (b) <-.  =>  socks[1] : C (b + c)
  socks[2] : C (c)   |  socks[2] : C (c) --'
  socks[3] : D (d) --'

Here, (A, D) or (B, C) can have different application settings, but they
MUST have the same settings at the socket API level; otherwise, unexpected
errors may happen. For instance, if only the new listeners have
TCP_SAVE_SYN, the old requests do not hold SYN data, so the application
will face an inconsistency and hit an error.

Therefore, if there are different kinds of sockets on the port, we must
disable this feature or use the eBPF program described in later commits.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
Link: 
https://lore.kernel.org/netdev/CAEfhGiyG8Y_amDZ2C8dQoQqjZJMHjTY76b=KBkTKcBtA=dh...@mail.gmail.com/
---
 include/net/inet_connection_sock.h |  1 +
 include/net/sock_reuseport.h   |  2 +-
 net/core/sock_reuseport.c  |  8 +++-
 net/ipv4/inet_connection_sock.c| 30 ++
 net/ipv4/inet_hashtables.c |  9 +++--
 5 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct 
sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
  struct request_sock *req,
  struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h
index ade3af55c91f..ece1c70ca907 100644
--- a/include/net/sock_reuseport.h
+++ b/include/net/sock_reuseport.h
@@ -32,7 +32,7 @@ struct sock_reuseport {
 extern int reuseport_alloc(struct sock *sk, bool bind_inany);
 extern int reuseport_add_sock(struct sock *sk, struct sock *sk2,
  bool bind_inany);
-extern void reuseport_detach_sock(struct sock *sk);
+extern struct sock *reuseport_detach_sock(struct sock *sk);
 extern struct sock *reuseport_select_sock(struct sock *sk,
  u32 hash,
  struct sk_buff *skb,
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 01a8b4ba39d7..74a46197854b 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -220,9 +220,10 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, 
bool bind_inany)
 }
 EXPORT_SYMBOL(reuseport_add_sock);
 
-void reuseport_detach_sock(struct sock *sk)
+struct sock *reuseport_detach_sock(struct sock *sk)
 {
struct sock_reuseport *reuse;
+   struct sock *nsk = NULL;
int i;
 
spin_lock_bh(&reuseport_lock);
@@ -248,6 +249,9 @@ void reuseport_detach_sock(struct sock *sk)
reuse->socks[i] = reuse->socks[reuse->num_socks];
 
if (reuse->migrate_req) {
+   if (reuse->num_socks)
+   nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+
reuse->num_closed_socks++;
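
To make the kernel-pick fallback above concrete, here is a tiny userspace
simulation (an illustration only, not kernel code) of the "last listener
fills the hole" bookkeeping and of how the migration target falls out of
it, matching the A/B/C/D diagram in the commit message:

---8<---
#include <stdio.h>

static int num_socks = 4;
static char socks[4] = { 'A', 'B', 'C', 'D' };

/* Mimic reuseport_detach_sock(): the last listener fills the freed slot,
 * and the kernel-pick migration target is the socket now in that slot,
 * or its left neighbour if the detached socket was the last one.
 */
static char detach(int i)
{
	num_socks--;
	socks[i] = socks[num_socks];
	return i == num_socks ? socks[i - 1] : socks[i];
}

int main(void)
{
	char target;

	target = detach(0);	/* close A */
	printf("close A -> migrate to %c, socks: %.*s\n", target, num_socks, socks);

	target = detach(1);	/* close B */
	printf("close B -> migrate to %c, socks: %.*s\n", target, num_socks, socks);

	return 0;
}
---8<---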
   

[RFC PATCH bpf-next 0/8] Socket migration for SO_REUSEPORT.

2020-11-17 Thread Kuniyuki Iwashima
The SO_REUSEPORT option allows sockets to listen on the same port and to
accept connections evenly. However, there is a defect in the current
implementation. When a SYN packet is received, the connection is tied to a
listening socket. Accordingly, when the listener is closed, in-flight
requests during the three-way handshake and child sockets in the accept
queue are dropped even if other listeners could accept such connections.

This situation can happen when various server management tools restart
server (such as nginx) processes. For instance, when we change nginx
configurations and restart it, it spins up new workers that respect the new
configuration and closes all listeners on the old workers, resulting in
the in-flight ACKs of the 3WHS being answered with RSTs.

As a workaround for this issue, we can do connection draining by eBPF:

  1. Before closing a listener, stop routing SYN packets to it.
  2. Wait enough time for requests to complete 3WHS.
  3. Accept connections until EAGAIN, then close the listener (sketched below).
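
For step 3, the drain might look like the following sketch (the listener is
assumed to be non-blocking and to no longer receive SYN packets after step
1; pass_to_new_worker() is a placeholder for the application's fd passing):

---8<---
#include <errno.h>
#include <unistd.h>
#include <sys/socket.h>

void pass_to_new_worker(int fd);	/* placeholder, application specific */

/* Step 3 of the workaround: accept whatever already sits in the queue,
 * then close the listener. Assumes step 1 (no new SYNs routed here) is
 * handled by the SK_REUSEPORT program and lfd is O_NONBLOCK.
 */
static void drain_and_close(int lfd)
{
	for (;;) {
		int fd = accept(lfd, NULL, NULL);

		if (fd < 0) {
			if (errno == EINTR)
				continue;
			break;	/* EAGAIN: the accept queue looks empty */
		}
		pass_to_new_worker(fd);
	}
	close(lfd);
}
---8<---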

Although this approach seems to work well, EAGAIN has nothing to do with
how many requests are still in the middle of the 3WHS. Thus, we have to
know the number of such requests by counting SYN packets with eBPF to
complete the connection draining.

  1. Start counting SYN packets and accept syscalls using eBPF map.
  2. Stop routing SYN packets.
  3. Accept connections up to the count, then close the listener.

In cases where eBPF is used only for connection draining, it seems a bit
expensive. Moreover, there are situations where we cannot modify and
rebuild a server program to implement the workaround. This patchset
introduces a new sysctl option to free userland programs from this kernel
issue. If we enable net.ipv4.tcp_migrate_req before creating a reuseport
group, we can redistribute requests and connections from a listener to the
others in the same reuseport group at close() or shutdown() syscalls.

Note that the source and destination listeners MUST have the same settings
at the socket API level; otherwise, applications may face inconsistencies
and hit errors. In such a case, we have to use an eBPF program to select a
specific listener or to cancel the migration.

Kuniyuki Iwashima (8):
  net: Introduce net.ipv4.tcp_migrate_req.
  tcp: Keep TCP_CLOSE sockets in the reuseport group.
  tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.
  tcp: Migrate TFO requests causing RST during TCP_SYN_RECV.
  tcp: Migrate TCP_NEW_SYN_RECV requests.
  bpf: Add cookie in sk_reuseport_md.
  bpf: Call bpf_run_sk_reuseport() for socket migration.
  bpf: Test BPF_PROG_TYPE_SK_REUSEPORT for socket migration.

 Documentation/networking/ip-sysctl.rst|  15 ++
 include/linux/bpf.h   |   1 +
 include/net/inet_connection_sock.h|  13 ++
 include/net/netns/ipv4.h  |   1 +
 include/net/request_sock.h|  13 ++
 include/net/sock_reuseport.h  |   8 +-
 include/uapi/linux/bpf.h  |   1 +
 net/core/filter.c |  34 +++-
 net/core/sock_reuseport.c | 110 +--
 net/ipv4/inet_connection_sock.c   |  84 -
 net/ipv4/inet_hashtables.c|   9 +-
 net/ipv4/sysctl_net_ipv4.c|   9 +
 net/ipv4/tcp_ipv4.c   |   9 +-
 net/ipv6/tcp_ipv6.c   |   9 +-
 tools/include/uapi/linux/bpf.h|   1 +
 .../bpf/prog_tests/migrate_reuseport.c| 175 ++
 .../bpf/progs/test_migrate_reuseport_kern.c   |  53 ++
 17 files changed, 511 insertions(+), 34 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/migrate_reuseport.c
 create mode 100644 
tools/testing/selftests/bpf/progs/test_migrate_reuseport_kern.c

-- 
2.17.2 (Apple Git-113)



[RFC PATCH bpf-next 5/8] tcp: Migrate TCP_NEW_SYN_RECV requests.

2020-11-17 Thread Kuniyuki Iwashima
As mentioned before, we have to select a new listener for TCP_NEW_SYN_RECV
requests when receiving the final ACK or sending a SYN+ACK. Therefore, this
patch changes the code to call reuseport_select_sock() even if the
listening socket is TCP_CLOSE. If we can pick out a listening socket from
the reuseport group, we rewrite request_sock.rsk_listener and resume
processing the request.

Note that we call reuseport_select_sock() with a NULL skb so that it
selects a listener randomly by hash. There are two reasons to do so. First,
we do not remember from which listener to which listener we migrated
sockets at close() or shutdown() syscalls, so we redistribute the requests
evenly. We will cover the second reason in a later commit.

Reviewed-by: Benjamin Herrenschmidt 
Signed-off-by: Kuniyuki Iwashima 
---
 include/net/inet_connection_sock.h | 12 
 include/net/request_sock.h | 13 +
 net/ipv4/inet_connection_sock.c| 12 ++--
 net/ipv4/tcp_ipv4.c|  9 +++--
 net/ipv6/tcp_ipv6.c|  9 +++--
 5 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/include/net/inet_connection_sock.h 
b/include/net/inet_connection_sock.h
index 2ea2d743f8fc..1e0958f5eb21 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -272,6 +272,18 @@ static inline void inet_csk_reqsk_queue_added(struct sock 
*sk)
reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
 }
 
+static inline void inet_csk_reqsk_queue_migrated(struct sock *sk,
+struct sock *nsk,
+struct request_sock *req)
+{
+   reqsk_queue_migrated(&inet_csk(sk)->icsk_accept_queue,
+&inet_csk(nsk)->icsk_accept_queue,
+req);
+   sock_put(sk);
+   sock_hold(nsk);
+   req->rsk_listener = nsk;
+}
+
 static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 {
return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 29e41ff3ec93..d18ba0b857cc 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -226,6 +226,19 @@ static inline void reqsk_queue_added(struct 
request_sock_queue *queue)
atomic_inc(&queue->qlen);
 }
 
+static inline void reqsk_queue_migrated(struct request_sock_queue *old_accept_queue,
+   struct request_sock_queue *new_accept_queue,
+   const struct request_sock *req)
+{
+   atomic_dec(&old_accept_queue->qlen);
+   atomic_inc(&new_accept_queue->qlen);
+
+   if (req->num_timeout == 0) {
+   atomic_dec(&old_accept_queue->young);
+   atomic_inc(&new_accept_queue->young);
+   }
+}
+
 static inline int reqsk_queue_len(const struct request_sock_queue *queue)
 {
return atomic_read(&queue->qlen);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 398c5c708bc5..8be20e3fff4f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -743,8 +743,16 @@ static void reqsk_timer_handler(struct timer_list *t)
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
int max_syn_ack_retries, qlen, expire = 0, resend = 0;

-   if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
-   goto drop;
+   if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+   sk_listener = reuseport_select_sock(sk_listener, sk_listener->sk_hash, NULL, 0);
+   if (!sk_listener) {
+   sk_listener = req->rsk_listener;
+   goto drop;
+   }
+   inet_csk_reqsk_queue_migrated(req->rsk_listener, sk_listener, req);
+   icsk = inet_csk(sk_listener);
+   queue = &icsk->icsk_accept_queue;
+   }

max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
/* Normally all the openreqs are young and become mature
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c2d5132c523c..7219984584ae 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1957,8 +1957,13 @@ int tcp_v4_rcv(struct sk_buff *skb)
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
-   inet_csk_reqsk_queue_drop_and_put(sk, req);
-   goto lookup;
+   nsk = reuseport_select_sock(sk, sk->sk_hash, NULL, 0);
+   if (!nsk) {
+   inet_csk_reqsk_queue_drop_and_put(sk, req);
+   goto lookup;
+   }
+   inet_csk_reqsk_queue_migrated(sk, nsk, req);
+   sk = nsk;
}
 

Re: linux-next: manual merge of the bpf-next tree with the net tree

2020-07-22 Thread Kuniyuki Iwashima
From:   Jakub Sitnicki 
Date:   Wed, 22 Jul 2020 14:17:05 +0200
> On Wed, Jul 22, 2020 at 05:21 AM CEST, Stephen Rothwell wrote:
> > Hi all,
> >
> > Today's linux-next merge of the bpf-next tree got conflicts in:
> >
> >   net/ipv4/udp.c
> >   net/ipv6/udp.c
> >
> > between commit:
> >
> >   efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.")
> >
> > from the net tree and commits:
> >
> >   7629c73a1466 ("udp: Extract helper for selecting socket from reuseport group")
> >   2a08748cd384 ("udp6: Extract helper for selecting socket from reuseport group")
> >
> > from the bpf-next tree.
> >
> > I fixed it up (I wasn't sure how to proceed, so I used the latter
> > version) and can carry the fix as necessary. This is now fixed as far
> > as linux-next is concerned, but any non trivial conflicts should be
> > mentioned to your upstream maintainer when your tree is submitted for
> > merging.  You may also want to consider cooperating with the maintainer
> > of the conflicting tree to minimise any particularly complex conflicts.
> 
> This one is a bit tricky.
> 
> Looking at how code in udp[46]_lib_lookup2 evolved, first:
> 
>   acdcecc61285 ("udp: correct reuseport selection with connected sockets")
> 
> 1) excluded connected UDP sockets from the reuseport group during lookup, and
> 2) limited fast reuseport return to groups with no connected sockets,
> 
> The second change had an unintended side effect of discarding the reuseport
> socket selection when the reuseport group contained connected sockets.
> 
> Then, recent
> 
>   efc6b6f6c311 ("udp: Improve load balancing for SO_REUSEPORT.")
> 
> rectified it by recording reuseport socket selection as lookup result
> candidate, in case fast reuseport return did not happen because
> reuseport group had connected sockets.
> 
> I believe that the changes in commit efc6b6f6c311 can be rewritten as below
> to the same effect, by realizing that we are always setting the 'result'
> if 'score > badness'. Either to what reuseport_select_sock() returned or
> to 'sk' that scored higher than current 'badness' threshold.

Good point!
It looks good to me.


> ---8<---
> static struct sock *udp4_lib_lookup2(struct net *net,
>__be32 saddr, __be16 sport,
>__be32 daddr, unsigned int hnum,
>int dif, int sdif,
>struct udp_hslot *hslot2,
>struct sk_buff *skb)
> {
>   struct sock *sk, *result;
>   int score, badness;
>   u32 hash = 0;
> 
>   result = NULL;
>   badness = 0;
>   udp_portaddr_for_each_entry_rcu(sk, >head) {
>   score = compute_score(sk, net, saddr, sport,
> daddr, hnum, dif, sdif);
>   if (score > badness) {
>   result = NULL;
>   if (sk->sk_reuseport &&
>   sk->sk_state != TCP_ESTABLISHED) {
>   hash = udp_ehashfn(net, daddr, hnum,
>  saddr, sport);
>   result = reuseport_select_sock(sk, hash, skb,
>  sizeof(struct udphdr));
>   if (result && !reuseport_has_conns(sk, false))
>   return result;
>   }
>   if (!result)
>   result = sk;
>   badness = score;
>   }
>   }
>   return result;
> }
> ---8<---
> 
> From there, it is now easier to resolve the conflict with
> 
>   7629c73a1466 ("udp: Extract helper for selecting socket from reuseport group")
>   2a08748cd384 ("udp6: Extract helper for selecting socket from reuseport group")
> 
> which extract the 'if (sk->sk_reuseport && sk->sk_state !=
> TCP_ESTABLISHED)' block into a helper called lookup_reuseport().
> 
> To merge the two, we need to pull the reuseport_has_conns() check up
> from lookup_reuseport() and back into udp[46]_lib_lookup2(), because now
> we want to record reuseport socket selection even if reuseport group has
> connections.
> 
> The only other call site of lookup_reuseport() is in
> udp[46]_lookup_run_bpf(). We don't want to discard the reuseport
> selected socket if group has connections there either, so no changes are
> needed. And, now that I think about it, the current behavior in
> udp[46]_lookup_run_bpf() is not right.
> 
> The end result for udp4 will look like:
> 
> ---8<---
> static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
>   struct sk_buff *skb,
>   __be32 saddr, __be16 sport,
>   __be32 daddr, unsigned short hnum)
> {
>   struct sock *reuse_sk = NULL;
>   u32 hash;
> 
>