> First, thanks for the patch.
> 
> While the reuseport could cure (or hide if you will) the unbalancing you
> see it makes sense to get better understanding what exactly is going on.
>  So far we haven't seen such weird behaviour ourself neither received
> reports about such uneven connections distribution among nginx workers.
> 
> Any chances you have accept_mutex and/or multi_accept?  Any other ideas?

Unfortunately I'm not 100% sure what's causing it, but it's pretty easy for us 
to reproduce, even on our development machines. First, to confirm that there's 
no accept_mutex or multi_accept in our config:

```
# grep accept /etc/nginx/mail.conf
# 
```

And here's what a cut down version of our config looks like.

```
worker_processes  auto;
worker_shutdown_timeout 5m;

events {
    use epoll;
    worker_connections  65536;
}
...
mail {
    auth_http http://unix:/var/run/nginx/mail_auth.sock:/nginx/;
    imap_client_buffer  16k;
    imap_capabilities "IMAP4" "IMAP4rev1" "LITERAL+" "ENABLE" "UIDPLUS" 
"SASL-IR" "NAMESPACE" "CONDSTORE" "SORT" "LIST-EXTENDED" "QRESYNC" "MOVE" 
"SPECIAL-USE" "CREATE-SPECIAL-USE" "IDLE";
    ssl_session_cache shared:sslcache:50m;
    ssl_session_timeout 30m;

    server {
      listen 10.a.b.c:993 ssl reuseport;
      auth_http_header "ServerHostname" "imap.foo";
      ssl_prefer_server_ciphers on;
      ssl_protocols ...
      ssl_ciphers ...;
      ssl_certificate      ...;
      ssl_certificate_key  ...;
      protocol imap;
      proxy on;
      proxy_timeout  1h;
    }
```

With that on a development machine which has 4 vcpus we see:

```
# ps auxw | grep nginx | grep mail
root        3839  0.0  0.0  68472  1372 ?        Ss   08:16   0:00 nginx: 
master process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody      3841  0.0  0.0  95732  3572 ?        S    08:16   0:01 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody      3842  0.0  0.0  95732  3284 ?        S    08:16   0:01 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody      3843  0.0  0.0  95796  4096 ?        S    08:16   0:01 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody      3846  0.0  0.0  95732  3092 ?        S    08:16   0:01 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
```

Now let's just create 1000 SSL connections and see how they get distributed 
between those procs.

```
# perl -e 'use IO::Socket::SSL; for (1..1000) { push @s, 
IO::Socket::SSL->new("imap.foo:993"); } print "done\n"; sleep 1000;'
done
^Z
[3]+  Stopped
# for i in 3841 3842 3843 3846; do echo "$i - " `ls /proc/$i/fd | wc -l`; done
3841 -  335
3842 -  295
3843 -  293
3846 -  320
```

Reasonably even.

Now let's change `listen 10.a.b.c:993 ssl reuseport` to 
`listen 10.a.b.c:993 ssl` and restart.

```
# ps auxw | grep nginx | grep mail
root      559885  0.0  0.0  68472  3104 ?        Ss   21:01   0:00 nginx: 
master process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody    559886  0.0  0.3  95620 30448 ?        S    21:01   0:00 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody    559887  0.0  0.3  95620 30448 ?        S    21:01   0:00 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody    559888  0.0  0.3  95620 30448 ?        S    21:01   0:00 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
nobody    559889  0.0  0.3  95620 30448 ?        S    21:01   0:00 nginx: 
worker process /usr/local/nginx/sbin/nginx -c /etc/nginx/mail.conf
# perl -e 'use IO::Socket::SSL; for (1..1000) { push @s, 
IO::Socket::SSL->new("imap.foo:993"); } print "done\n"; sleep 1000;'
done
^Z
[5]+  Stopped
# for i in 559886 559887 559888 559889; do echo "$i - " `ls /proc/$i/fd | wc 
-l`; done
559886 -  1054
559887 -  57
559888 -  60
559889 -  57
```

And as you can see, a completely uneven distribution of connections between 
processes! This doesn't just occur on our development machines either (i.e. 
it's not related to the source IP or anything); it also occurs on production 
systems with connections arriving from real world customers and clients 
scattered around the world.

This is a fairly standard Debian buster distribution, though we use a 
backported newer kernel, and a recent version of nginx.

```
# uname -a
Linux xyz 5.10.0-0.bpo.4-amd64 #1 SMP Debian 5.10.19-1~bpo10+1 (2021-03-13) 
x86_64 GNU/Linux
# /usr/local/nginx/sbin/nginx -v
nginx version: nginx/1.20.1
```

As you can see, without the reuseport option, this causes severe scalability 
problems for us.

Even without that though, it would just be nice to have some more consistency 
of the `listen` options between http/stream/mail modules as well.

-- 
Rob Mueller
[email protected]
_______________________________________________
nginx-devel mailing list
[email protected]
http://mailman.nginx.org/mailman/listinfo/nginx-devel

Reply via email to