Re: [PATCH virt] virt: fix uninit-value in vhost_vsock_dev_open

2024-05-05 Thread Arseniy Krasnov
> But now that it's explained, the bugfix as proposed is incomplete:
> userspace can set features twice and the second time will leak
> old VIRTIO_VSOCK_F_SEQPACKET bit value.
> 
> And I am pretty sure the Fixes tag is wrong.
> 
> So I wrote this, but I actually don't have a set for
> seqpacket to test this. Arseny could you help test maybe?
> Thanks!

Hi! Sorry for the late reply! Just ran the vsock test suite with this patch -
seems everything is ok!

> 
> 
> commit bcc17a060d93b198d8a17a9b87b593f41337ee28
> Author: Michael S. Tsirkin 
> Date:   Mon Apr 22 10:03:13 2024 -0400
> 
> vhost/vsock: always initialize seqpacket_allow
> 
> There are two issues around seqpacket_allow:
> 1. seqpacket_allow is not initialized when socket is
> created. Thus if features are never set, it will be
> read uninitialized.
> 2. if VIRTIO_VSOCK_F_SEQPACKET is set and then cleared,
> then seqpacket_allow will not be cleared appropriately
> (existing apps I know about don't usually do this but
> it's legal and there's no way to be sure no one relies
> on this).
> 
> To fix:
> - initialize seqpacket_allow after allocation
> - set it unconditionally in set_features
> 
> Reported-by: syzbot+6c21aeb59d0e82eb2...@syzkaller.appspotmail.com
> Reported-by: Jeongjun Park 
> Fixes: ced7b713711f ("vhost/vsock: support SEQPACKET for transport").
> Cc: Arseny Krasnov 
> Cc: David S. Miller 
> Cc: Stefan Hajnoczi 
> Signed-off-by: Michael S. Tsirkin 

Acked-by: Arseniy Krasnov 

> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index ec20ecff85c7..bf664ec9341b 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -667,6 +667,7 @@ static int vhost_vsock_dev_open(struct inode *inode, 
> struct file *file)
>   }
>  
>   vsock->guest_cid = 0; /* no CID assigned yet */
> + vsock->seqpacket_allow = false;
>  
>   atomic_set(&vsock->queued_replies, 0);
>  
> @@ -810,8 +811,7 @@ static int vhost_vsock_set_features(struct vhost_vsock 
> *vsock, u64 features)
>   goto err;
>   }
>  
> - if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
> - vsock->seqpacket_allow = true;
> + vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET);
>  
>   for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>   vq = &vsock->vqs[i];





[PATCH net-next v1] vsock/test: print type for SOCK_SEQPACKET

2024-01-24 Thread Arseniy Krasnov
SOCK_SEQPACKET is supported for virtio transport, so do not interpret
such type of socket as unknown.

Signed-off-by: Arseniy Krasnov 
---
 tools/testing/vsock/vsock_diag_test.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/vsock/vsock_diag_test.c 
b/tools/testing/vsock/vsock_diag_test.c
index 5e6049226b77..17aeba7cbd14 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -39,6 +39,8 @@ static const char *sock_type_str(int type)
return "DGRAM";
case SOCK_STREAM:
return "STREAM";
+   case SOCK_SEQPACKET:
+   return "SEQPACKET";
default:
return "INVALID TYPE";
}
-- 
2.25.1




[PATCH net-next v2] vsock/test: add '--peer-port' input argument

2024-01-22 Thread Arseniy Krasnov
Implement port for given CID as input argument instead of using
hardcoded value '1234'. This allows to run different test instances
on a single CID. Port argument is not required parameter and if it is
not set, then default value will be '1234' - thus we preserve previous
behaviour.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Reword usage message.
  * Add commas after last field in 'opts' declaration.
  * 'RFC' -> 'net-next'.

 tools/testing/vsock/util.c|  17 +++-
 tools/testing/vsock/util.h|   4 +
 tools/testing/vsock/vsock_diag_test.c |  21 +++--
 tools/testing/vsock/vsock_test.c  | 102 +-
 tools/testing/vsock/vsock_test_zerocopy.c |  12 +--
 tools/testing/vsock/vsock_uring_test.c|  17 +++-
 6 files changed, 115 insertions(+), 58 deletions(-)

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index ae2b33c21c45..554b290fefdc 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -33,8 +33,7 @@ void init_signals(void)
signal(SIGPIPE, SIG_IGN);
 }
 
-/* Parse a CID in string representation */
-unsigned int parse_cid(const char *str)
+static unsigned int parse_uint(const char *str, const char *err_str)
 {
char *endptr = NULL;
unsigned long n;
@@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str)
errno = 0;
n = strtoul(str, &endptr, 10);
if (errno || *endptr != '\0') {
-   fprintf(stderr, "malformed CID \"%s\"\n", str);
+   fprintf(stderr, "malformed %s \"%s\"\n", err_str, str);
exit(EXIT_FAILURE);
}
return n;
 }
 
+/* Parse a CID in string representation */
+unsigned int parse_cid(const char *str)
+{
+   return parse_uint(str, "CID");
+}
+
+/* Parse a port in string representation */
+unsigned int parse_port(const char *str)
+{
+   return parse_uint(str, "port");
+}
+
 /* Wait for the remote to close the connection */
 void vsock_wait_remote_close(int fd)
 {
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index 03c88d0cb861..e95e62485959 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -12,10 +12,13 @@ enum test_mode {
TEST_MODE_SERVER
 };
 
+#define DEFAULT_PEER_PORT  1234
+
 /* Test runner options */
 struct test_opts {
enum test_mode mode;
unsigned int peer_cid;
+   unsigned int peer_port;
 };
 
 /* A test case definition.  Test functions must print failures to stderr and
@@ -35,6 +38,7 @@ struct test_case {
 
 void init_signals(void);
 unsigned int parse_cid(const char *str);
+unsigned int parse_port(const char *str);
 int vsock_stream_connect(unsigned int cid, unsigned int port);
 int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_diag_test.c 
b/tools/testing/vsock/vsock_diag_test.c
index fa927ad16f8a..9d61b1f1c4c3 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct 
test_opts *opts)
} addr = {
.svm = {
.svm_family = AF_VSOCK,
-   .svm_port = 1234,
+   .svm_port = opts->peer_port,
.svm_cid = VMADDR_CID_ANY,
},
};
@@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts 
*opts)
LIST_HEAD(sockets);
struct vsock_stat *st;
 
-   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
@@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts 
*opts)
LIST_HEAD(sockets);
int client_fd;
 
-   client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
if (client_fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
@@ -461,6 +461,11 @@ static const struct option longopts[] = {
.has_arg = required_argument,
.val = 'p',
},
+   {
+   .name = "peer-port",
+   .has_arg = required_argument,
+   .val = 'q',
+   },
{
.name = "list",
.has_arg = no_argument,
@@ -481,7 +486,7 @@ static const struct option longopts[] = {
 
 static void usage(void)
 {
-   fprintf(stderr, "Usage: vsock_diag_test [--help] 
[--control-host=] --control-port= --mode=client|server 
--peer-cid= [--list] [--skip=]\n"
+   fprintf(stderr, "Usage: vsock_di

Re: [RFC PATCH v1] vsock/test: add '--peer-port' input argument

2024-01-15 Thread Arseniy Krasnov



On 15.01.2024 14:54, Stefano Garzarella wrote:
> Hi Arseniy,
> thanks for this patch!
> 
> On Sat, Jan 13, 2024 at 12:21:10AM +0300, Arseniy Krasnov wrote:
>> Implement port for given CID as input argument instead of using
>> hardcoded value '1234'. This allows to run different test instances
>> on a single CID. Port argument is not required parameter and if it is
>> not set, then default value will be '1234' - thus we preserve previous
>> behaviour.
>>
>> Signed-off-by: Arseniy Krasnov 
>> ---
>> tools/testing/vsock/util.c    | 17 +++-
>> tools/testing/vsock/util.h    |  4 +
>> tools/testing/vsock/vsock_diag_test.c | 18 -
>> tools/testing/vsock/vsock_test.c  | 96 +--
>> tools/testing/vsock/vsock_test_zerocopy.c | 12 +--
>> tools/testing/vsock/vsock_uring_test.c    | 16 +++-
>> 6 files changed, 107 insertions(+), 56 deletions(-)
>>
>> diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
>> index ae2b33c21c45..554b290fefdc 100644
>> --- a/tools/testing/vsock/util.c
>> +++ b/tools/testing/vsock/util.c
>> @@ -33,8 +33,7 @@ void init_signals(void)
>> signal(SIGPIPE, SIG_IGN);
>> }
>>
>> -/* Parse a CID in string representation */
>> -unsigned int parse_cid(const char *str)
>> +static unsigned int parse_uint(const char *str, const char *err_str)
>> {
>> char *endptr = NULL;
>> unsigned long n;
>> @@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str)
>> errno = 0;
>> n = strtoul(str, &endptr, 10);
>> if (errno || *endptr != '\0') {
>> -    fprintf(stderr, "malformed CID \"%s\"\n", str);
>> +    fprintf(stderr, "malformed %s \"%s\"\n", err_str, str);
>>     exit(EXIT_FAILURE);
>> }
>> return n;
>> }
>>
>> +/* Parse a CID in string representation */
>> +unsigned int parse_cid(const char *str)
>> +{
>> +    return parse_uint(str, "CID");
>> +}
>> +
>> +/* Parse a port in string representation */
>> +unsigned int parse_port(const char *str)
>> +{
>> +    return parse_uint(str, "port");
>> +}
>> +
>> /* Wait for the remote to close the connection */
>> void vsock_wait_remote_close(int fd)
>> {
>> diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
>> index 03c88d0cb861..e95e62485959 100644
>> --- a/tools/testing/vsock/util.h
>> +++ b/tools/testing/vsock/util.h
>> @@ -12,10 +12,13 @@ enum test_mode {
>> TEST_MODE_SERVER
>> };
>>
>> +#define DEFAULT_PEER_PORT    1234
>> +
>> /* Test runner options */
>> struct test_opts {
>> enum test_mode mode;
>> unsigned int peer_cid;
>> +    unsigned int peer_port;
>> };
>>
>> /* A test case definition.  Test functions must print failures to stderr and
>> @@ -35,6 +38,7 @@ struct test_case {
>>
>> void init_signals(void);
>> unsigned int parse_cid(const char *str);
>> +unsigned int parse_port(const char *str);
>> int vsock_stream_connect(unsigned int cid, unsigned int port);
>> int vsock_bind_connect(unsigned int cid, unsigned int port,
>>    unsigned int bind_port, int type);
>> diff --git a/tools/testing/vsock/vsock_diag_test.c 
>> b/tools/testing/vsock/vsock_diag_test.c
>> index fa927ad16f8a..5e6049226b77 100644
>> --- a/tools/testing/vsock/vsock_diag_test.c
>> +++ b/tools/testing/vsock/vsock_diag_test.c
>> @@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct 
>> test_opts *opts)
>> } addr = {
>>     .svm = {
>>     .svm_family = AF_VSOCK,
>> -    .svm_port = 1234,
>> +    .svm_port = opts->peer_port,
>>     .svm_cid = VMADDR_CID_ANY,
>>     },
>> };
>> @@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts 
>> *opts)
>> LIST_HEAD(sockets);
>> struct vsock_stat *st;
>>
>> -    fd = vsock_stream_connect(opts->peer_cid, 1234);
>> +    fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
>> if (fd < 0) {
>>     perror("connect");
>>     exit(EXIT_FAILURE);
>> @@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts 
>> *opts)
>> LIST_HEAD(sockets);
>> int client_fd;
>>
>> -    client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
>> +    client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL

[RFC PATCH v1] vsock/test: add '--peer-port' input argument

2024-01-12 Thread Arseniy Krasnov
Implement port for given CID as input argument instead of using
hardcoded value '1234'. This allows to run different test instances
on a single CID. Port argument is not required parameter and if it is
not set, then default value will be '1234' - thus we preserve previous
behaviour.

Signed-off-by: Arseniy Krasnov 
---
 tools/testing/vsock/util.c| 17 +++-
 tools/testing/vsock/util.h|  4 +
 tools/testing/vsock/vsock_diag_test.c | 18 -
 tools/testing/vsock/vsock_test.c  | 96 +--
 tools/testing/vsock/vsock_test_zerocopy.c | 12 +--
 tools/testing/vsock/vsock_uring_test.c| 16 +++-
 6 files changed, 107 insertions(+), 56 deletions(-)

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index ae2b33c21c45..554b290fefdc 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -33,8 +33,7 @@ void init_signals(void)
signal(SIGPIPE, SIG_IGN);
 }
 
-/* Parse a CID in string representation */
-unsigned int parse_cid(const char *str)
+static unsigned int parse_uint(const char *str, const char *err_str)
 {
char *endptr = NULL;
unsigned long n;
@@ -42,12 +41,24 @@ unsigned int parse_cid(const char *str)
errno = 0;
n = strtoul(str, &endptr, 10);
if (errno || *endptr != '\0') {
-   fprintf(stderr, "malformed CID \"%s\"\n", str);
+   fprintf(stderr, "malformed %s \"%s\"\n", err_str, str);
exit(EXIT_FAILURE);
}
return n;
 }
 
+/* Parse a CID in string representation */
+unsigned int parse_cid(const char *str)
+{
+   return parse_uint(str, "CID");
+}
+
+/* Parse a port in string representation */
+unsigned int parse_port(const char *str)
+{
+   return parse_uint(str, "port");
+}
+
 /* Wait for the remote to close the connection */
 void vsock_wait_remote_close(int fd)
 {
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index 03c88d0cb861..e95e62485959 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -12,10 +12,13 @@ enum test_mode {
TEST_MODE_SERVER
 };
 
+#define DEFAULT_PEER_PORT  1234
+
 /* Test runner options */
 struct test_opts {
enum test_mode mode;
unsigned int peer_cid;
+   unsigned int peer_port;
 };
 
 /* A test case definition.  Test functions must print failures to stderr and
@@ -35,6 +38,7 @@ struct test_case {
 
 void init_signals(void);
 unsigned int parse_cid(const char *str);
+unsigned int parse_port(const char *str);
 int vsock_stream_connect(unsigned int cid, unsigned int port);
 int vsock_bind_connect(unsigned int cid, unsigned int port,
   unsigned int bind_port, int type);
diff --git a/tools/testing/vsock/vsock_diag_test.c 
b/tools/testing/vsock/vsock_diag_test.c
index fa927ad16f8a..5e6049226b77 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -342,7 +342,7 @@ static void test_listen_socket_server(const struct 
test_opts *opts)
} addr = {
.svm = {
.svm_family = AF_VSOCK,
-   .svm_port = 1234,
+   .svm_port = opts->peer_port,
.svm_cid = VMADDR_CID_ANY,
},
};
@@ -378,7 +378,7 @@ static void test_connect_client(const struct test_opts 
*opts)
LIST_HEAD(sockets);
struct vsock_stat *st;
 
-   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
@@ -403,7 +403,7 @@ static void test_connect_server(const struct test_opts 
*opts)
LIST_HEAD(sockets);
int client_fd;
 
-   client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   client_fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
if (client_fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
@@ -461,6 +461,11 @@ static const struct option longopts[] = {
.has_arg = required_argument,
.val = 'p',
},
+   {
+   .name = "peer-port",
+   .has_arg = required_argument,
+   .val = 'q',
+   },
{
.name = "list",
.has_arg = no_argument,
@@ -481,7 +486,7 @@ static const struct option longopts[] = {
 
 static void usage(void)
 {
-   fprintf(stderr, "Usage: vsock_diag_test [--help] 
[--control-host=] --control-port= --mode=client|server 
--peer-cid= [--list] [--skip=]\n"
+   fprintf(stderr, "Usage: vsock_diag_test [--help] 
[--control-host=] --control-port= --mode=client|server 
--peer-cid= [--peer-port=] [--list] [--skip=]\n"
"\n"

Re: [PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback

2023-12-14 Thread Arseniy Krasnov



On 14.12.2023 13:29, Michael S. Tsirkin wrote:
> On Thu, Dec 14, 2023 at 12:19:46PM +0300, Arseniy Krasnov wrote:
>> Do not return if transport callback for SO_RCVLOWAT is set (only in
>> error case). In this case we don't need to set 'sk_rcvlowat' field in
>> each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
>> is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
>> to 'notify_set_rcvlowat'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> Reviewed-by: Stefano Garzarella 
>> Acked-by: Michael S. Tsirkin 
> 
> Maybe squash this with patch 2/4?

Done in v10

Thanks, Arseniy

> 
>> ---
>>  Changelog:
>>  v3 -> v4:
>>   * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
>>   * Commit message updated.
>>
>>  include/net/af_vsock.h   | 2 +-
>>  net/vmw_vsock/af_vsock.c | 9 +++--
>>  net/vmw_vsock/hyperv_transport.c | 4 ++--
>>  3 files changed, 10 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> index e302c0e804d0..535701efc1e5 100644
>> --- a/include/net/af_vsock.h
>> +++ b/include/net/af_vsock.h
>> @@ -137,7 +137,6 @@ struct vsock_transport {
>>  u64 (*stream_rcvhiwat)(struct vsock_sock *);
>>  bool (*stream_is_active)(struct vsock_sock *);
>>  bool (*stream_allow)(u32 cid, u32 port);
>> -int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* SEQ_PACKET. */
>>  ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>> @@ -168,6 +167,7 @@ struct vsock_transport {
>>  struct vsock_transport_send_notify_data *);
>>  /* sk_lock held by the caller */
>>  void (*notify_buffer_size)(struct vsock_sock *, u64 *);
>> +int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* Shutdown. */
>>  int (*shutdown)(struct vsock_sock *, int);
>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> index 816725af281f..54ba7316f808 100644
>> --- a/net/vmw_vsock/af_vsock.c
>> +++ b/net/vmw_vsock/af_vsock.c
>> @@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int 
>> val)
>>  
>>  transport = vsk->transport;
>>  
>> -if (transport && transport->set_rcvlowat)
>> -return transport->set_rcvlowat(vsk, val);
>> +if (transport && transport->notify_set_rcvlowat) {
>> +int err;
>> +
>> +err = transport->notify_set_rcvlowat(vsk, val);
>> +if (err)
>> +return err;
>> +}
>>  
>>  WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
>>  return 0;
> 
> 
> 
> I would s
> 
>> diff --git a/net/vmw_vsock/hyperv_transport.c 
>> b/net/vmw_vsock/hyperv_transport.c
>> index 7cb1a9d2cdb4..e2157e387217 100644
>> --- a/net/vmw_vsock/hyperv_transport.c
>> +++ b/net/vmw_vsock/hyperv_transport.c
>> @@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
>> ssize_t written,
>>  }
>>  
>>  static
>> -int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>>  {
>>  return -EOPNOTSUPP;
>>  }
>> @@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
>>  .notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
>>  .notify_send_post_enqueue = hvs_notify_send_post_enqueue,
>>  
>> -.set_rcvlowat = hvs_set_rcvlowat
>> +.notify_set_rcvlowat  = hvs_notify_set_rcvlowat
>>  };
>>  
>>  static bool hvs_check_transport(struct vsock_sock *vsk)
>> -- 
>> 2.25.1
> 



[PATCH net-next v10 1/3] virtio/vsock: fix logic which reduces credit update messages

2023-12-14 Thread Arseniy Krasnov
Add one more condition for sending credit update during dequeue from
stream socket: when number of bytes in the rx queue is smaller than
SO_RCVLOWAT value of the socket. This is actual for non-default value
of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up user for reading data (in corner case it is also
possible to stuck both tx and rx sides, this is why 'Fixes' is used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v7 -> v8:
  * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index c8e162c9d1df..7eabe5219ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -557,6 +557,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -600,7 +602,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}
 
-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
	spin_unlock_bh(&vvs->rx_lock);
 
@@ -610,9 +615,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1




[PATCH net-next v10 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Arseniy Krasnov
Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
O_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Rename 'set_rcvlowat' callback to 'notify_set_rcvlowat' and set
'sk->sk_rcvlowat' only in one place (i.e. 'vsock_set_rcvlowat'), so the
transport doesn't need to do it.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v9 -> v10:
  * This is squash of 0002 and 0003 from v9.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 include/net/af_vsock.h  |  2 +-
 net/vmw_vsock/af_vsock.c|  9 ++--
 net/vmw_vsock/hyperv_transport.c|  4 ++--
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 8 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
diff --git a/net/vmw_vsock/virtio_transport.c b/n

[PATCH net-next v10 0/3] send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/
Link to v9:
https://lore.kernel.org/netdev/20231214091947.395892-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.
v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v7 -> v8:
 * See per-patch changelog after ---.
v8 -> v9:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * Add 'Fixes' tag for the current 0002.
 * Reorder patches by moving two fixes first.
v9 -> v10:
 * Squash 0002 and 0003 and update commit message in result.

Arseniy Krasnov (3):
  virtio/vsock: fix logic which reduces credit update messages
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  43 +-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 229 insertions(+), 8 deletions(-)

-- 
2.25.1




[PATCH net-next v10 3/3] vsock/test: two tests to check credit update logic

2023-12-14 Thread Arseniy Krasnov
Both tests are almost same, only differs in two 'if' conditions, so
implemented in a single function. Tests check, that credit update
message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
  * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
in server part. This is needed to ensure that 'poll()' wake up us
when number of bytes ready to read is equal to SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes in our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  &recv_buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t re

Re: [PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback

2023-12-14 Thread Arseniy Krasnov



On 14.12.2023 13:29, Michael S. Tsirkin wrote:
> On Thu, Dec 14, 2023 at 12:19:46PM +0300, Arseniy Krasnov wrote:
>> Do not return if transport callback for SO_RCVLOWAT is set (only in
>> error case). In this case we don't need to set 'sk_rcvlowat' field in
>> each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
>> is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
>> to 'notify_set_rcvlowat'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> Reviewed-by: Stefano Garzarella 
>> Acked-by: Michael S. Tsirkin 
> 
> Maybe squash this with patch 2/4?

You mean just do 'git squash' without updating commit message manually?

Thanks, Arseniy

> 
>> ---
>>  Changelog:
>>  v3 -> v4:
>>   * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
>>   * Commit message updated.
>>
>>  include/net/af_vsock.h   | 2 +-
>>  net/vmw_vsock/af_vsock.c | 9 +++--
>>  net/vmw_vsock/hyperv_transport.c | 4 ++--
>>  3 files changed, 10 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>> index e302c0e804d0..535701efc1e5 100644
>> --- a/include/net/af_vsock.h
>> +++ b/include/net/af_vsock.h
>> @@ -137,7 +137,6 @@ struct vsock_transport {
>>  u64 (*stream_rcvhiwat)(struct vsock_sock *);
>>  bool (*stream_is_active)(struct vsock_sock *);
>>  bool (*stream_allow)(u32 cid, u32 port);
>> -int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* SEQ_PACKET. */
>>  ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>> @@ -168,6 +167,7 @@ struct vsock_transport {
>>  struct vsock_transport_send_notify_data *);
>>  /* sk_lock held by the caller */
>>  void (*notify_buffer_size)(struct vsock_sock *, u64 *);
>> +int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
>>  
>>  /* Shutdown. */
>>  int (*shutdown)(struct vsock_sock *, int);
>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>> index 816725af281f..54ba7316f808 100644
>> --- a/net/vmw_vsock/af_vsock.c
>> +++ b/net/vmw_vsock/af_vsock.c
>> @@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int 
>> val)
>>  
>>  transport = vsk->transport;
>>  
>> -if (transport && transport->set_rcvlowat)
>> -return transport->set_rcvlowat(vsk, val);
>> +if (transport && transport->notify_set_rcvlowat) {
>> +int err;
>> +
>> +err = transport->notify_set_rcvlowat(vsk, val);
>> +if (err)
>> +return err;
>> +}
>>  
>>  WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
>>  return 0;
> 
> 
> 
> I would s
> 
>> diff --git a/net/vmw_vsock/hyperv_transport.c 
>> b/net/vmw_vsock/hyperv_transport.c
>> index 7cb1a9d2cdb4..e2157e387217 100644
>> --- a/net/vmw_vsock/hyperv_transport.c
>> +++ b/net/vmw_vsock/hyperv_transport.c
>> @@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
>> ssize_t written,
>>  }
>>  
>>  static
>> -int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>>  {
>>  return -EOPNOTSUPP;
>>  }
>> @@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
>>  .notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
>>  .notify_send_post_enqueue = hvs_notify_send_post_enqueue,
>>  
>> -.set_rcvlowat = hvs_set_rcvlowat
>> +.notify_set_rcvlowat  = hvs_notify_set_rcvlowat
>>  };
>>  
>>  static bool hvs_check_transport(struct vsock_sock *vsk)
>> -- 
>> 2.25.1
> 



[PATCH net-next v9 1/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-14 Thread Arseniy Krasnov
Add one more condition for sending credit update during dequeue from
stream socket: when number of bytes in the rx queue is smaller than
SO_RCVLOWAT value of the socket. This is actual for non-default value
of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up user for reading data (in corner case it is also
possible to stuck both tx and rx sides, this is why 'Fixes' is used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v7 -> v8:
  * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index c8e162c9d1df..7eabe5219ef7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -557,6 +557,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -600,7 +602,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}
 
-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
	spin_unlock_bh(&vvs->rx_lock);
 
@@ -610,9 +615,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1




[PATCH net-next v9 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Arseniy Krasnov
Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
 v4 -> v5:
  * Do not change callbacks order in transport structures.
 v5 -> v6:
  * Reorder callbacks in transport structures.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v8 -> v9:
  * Add 'Fixes' tag.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 5 files changed, 34 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 7eabe5219ef7..9d2305fdc65c 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1690,6 +1690,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side. Also
+* don't send credit update when peer already knows actual value -
+* such transmission will be useless.
+*/
+   send_update = (vvs->rx_bytes < val) &&
+ (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vso

[PATCH net-next v9 0/4] send credit update during setting SO_RCVLOWAT

2023-12-14 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=9bab51bd662be4c3ebb18a28879981d69f3ef15a

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20231211211658.2904268-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.
v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v7 -> v8:
 * See per-patch changelog after ---.
v8 -> v9:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * Add 'Fixes' tag for the current 0002.
 * Reorder patches by moving two fixes first.

Arseniy Krasnov (4):
  virtio/vsock: fix logic which reduces credit update messages
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock: update SO_RCVLOWAT setting callback
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  43 +-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 229 insertions(+), 8 deletions(-)

-- 
2.25.1




[PATCH net-next v9 4/4] vsock/test: two tests to check credit update logic

2023-12-14 Thread Arseniy Krasnov
Both tests are almost same, only differs in two 'if' conditions, so
implemented in a single function. Tests check, that credit update
message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
  * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
in server part. This is needed to ensure that 'poll()' wake up us
when number of bytes ready to read is equal to SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes in our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  &recv_buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t re

[PATCH net-next v9 3/4] vsock: update SO_RCVLOWAT setting callback

2023-12-14 Thread Arseniy Krasnov
Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
to 'notify_set_rcvlowat'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
Acked-by: Michael S. Tsirkin 
---
 Changelog:
 v3 -> v4:
  * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
  * Commit message updated.

 include/net/af_vsock.h   | 2 +-
 net/vmw_vsock/af_vsock.c | 9 +++--
 net/vmw_vsock/hyperv_transport.c | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
-- 
2.25.1




Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-13 Thread Arseniy Krasnov



On 13.12.2023 18:13, Michael S. Tsirkin wrote:
> On Wed, Dec 13, 2023 at 10:05:44AM -0500, Michael S. Tsirkin wrote:
>> On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote:
>>>
>>>
>>> On 13.12.2023 11:43, Stefano Garzarella wrote:
>>>> On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:
>>>>>
>>>>>
>>>>> On 12.12.2023 19:12, Michael S. Tsirkin wrote:
>>>>>> On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 12.12.2023 18:54, Michael S. Tsirkin wrote:
>>>>>>>> On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:
>>>>>>>>> Hello,
>>>>>>>>>
>>>>>>>>>    DESCRIPTION
>>>>>>>>>
>>>>>>>>> This patchset fixes old problem with hungup of both rx/tx sides and 
>>>>>>>>> adds
>>>>>>>>> test for it. This happens due to non-default SO_RCVLOWAT value and
>>>>>>>>> deferred credit update in virtio/vsock. Link to previous old patchset:
>>>>>>>>> https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/
>>>>>>>>
>>>>>>>>
>>>>>>>> Patchset:
>>>>>>>>
>>>>>>>> Acked-by: Michael S. Tsirkin 
>>>>>>>
>>>>>>> Thanks!
>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> But I worry whether we actually need 3/8 in net not in net-next.
>>>>>>>
>>>>>>> Because of "Fixes" tag ? I think this problem is not critical and 
>>>>>>> reproducible
>>>>>>> only in special cases, but i'm not familiar with netdev process so 
>>>>>>> good, so I don't
>>>>>>> have strong opinion. I guess @Stefano knows better.
>>>>>>>
>>>>>>> Thanks, Arseniy
>>>>>>
>>>>>> Fixes means "if you have that other commit then you need this commit
>>>>>> too". I think as a minimum you need to rearrange patches to make the
>>>>>> fix go in first. We don't want a regression followed by a fix.
>>>>>
>>>>> I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, 
>>>>> because this
>>>>> patch fixes problem that is not related with the new patches from this 
>>>>> patchset.
>>>>
>>>> I agree, patch 3 is for sure net material (I'm fine with both 
>>>> rearrangement or send it separately), but IMHO also patch 2 could be.
>>>> I think with the same fixes tag, since before commit b89d882dc9fc 
>>>> ("vsock/virtio: reduce credit update messages") we sent a credit update
>>>> for every bytes we read, so we should not have this problem, right?
>>>
>>> Agree for 2, so I think I can rearrange: two fixes go first, then current 
>>> 0001, and then tests. And send it as V9 for 'net' only ?
>>>
>>> Thanks, Arseniy
>>
>>
>> hmm why not net-next?
> 
> Oh I missed your previous discussion. I think everything in net-next is
> safer.  Having said that, I won't nack it net, either.

So, summarizing all above:
1) This patchset entirely goes to net-next as v9
2) I reorder patches like 3 - 2 - 1 - 4, e.g. two fixes goes first with Fixes 
tag
3) Add Acked-by: Michael S. Tsirkin  to each patch

@Michael, @Stefano ?

Thanks, Arseniy

> 
>>>>
>>>> So, maybe all the series could be "net".
>>>>
>>>> Thanks,
>>>> Stefano
>>>>
> 



Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-13 Thread Arseniy Krasnov



On 13.12.2023 12:41, Stefano Garzarella wrote:
> On Wed, Dec 13, 2023 at 12:08:27PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 13.12.2023 11:43, Stefano Garzarella wrote:
>>> On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:
>>>>
>>>>
>>>> On 12.12.2023 19:12, Michael S. Tsirkin wrote:
>>>>> On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:
>>>>>>
>>>>>>
>>>>>> On 12.12.2023 18:54, Michael S. Tsirkin wrote:
>>>>>>> On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:
>>>>>>>> Hello,
>>>>>>>>
>>>>>>>>    DESCRIPTION
>>>>>>>>
>>>>>>>> This patchset fixes old problem with hungup of both rx/tx sides and 
>>>>>>>> adds
>>>>>>>> test for it. This happens due to non-default SO_RCVLOWAT value and
>>>>>>>> deferred credit update in virtio/vsock. Link to previous old patchset:
>>>>>>>> https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/
>>>>>>>
>>>>>>>
>>>>>>> Patchset:
>>>>>>>
>>>>>>> Acked-by: Michael S. Tsirkin 
>>>>>>
>>>>>> Thanks!
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> But I worry whether we actually need 3/8 in net not in net-next.
>>>>>>
>>>>>> Because of "Fixes" tag ? I think this problem is not critical and 
>>>>>> reproducible
>>>>>> only in special cases, but i'm not familiar with netdev process so good, 
>>>>>> so I don't
>>>>>> have strong opinion. I guess @Stefano knows better.
>>>>>>
>>>>>> Thanks, Arseniy
>>>>>
>>>>> Fixes means "if you have that other commit then you need this commit
>>>>> too". I think as a minimum you need to rearrange patches to make the
>>>>> fix go in first. We don't want a regression followed by a fix.
>>>>
>>>> I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, 
>>>> because this
>>>> patch fixes problem that is not related with the new patches from this 
>>>> patchset.
>>>
>>> I agree, patch 3 is for sure net material (I'm fine with both rearrangement 
>>> or send it separately), but IMHO also patch 2 could be.
>>> I think with the same fixes tag, since before commit b89d882dc9fc 
>>> ("vsock/virtio: reduce credit update messages") we sent a credit update
>>> for every bytes we read, so we should not have this problem, right?
>>
>> Agree for 2, so I think I can rearrange: two fixes go first, then current 
>> 0001, and then tests. And send it as V9 for 'net' only ?
> 
> Maybe you can add this to patch 1 if we want it on net:
> 
> Fixes: e38f22c860ed ("vsock: SO_RCVLOWAT transport set callback")
> 
> Then I think that patch should go before patch 2, so we don't need to
> touch that code multiple times.
> 
> so, IMHO the order should be the actual order or 3 - 1 - 2 - 4.
> 
> Another option is to send just 2 & 3 to net, and the rest (1 & 4) to 
> net-next. IMHO should be fine to send the entire series to net with the fixes 
> tag also in patch 1.

Ok, agree that it is good to send whole patchset to net without splitting it.

> 
> Net maintainers and Michael might have a different advice.

Ok

> 
> Thanks,
> Stefano
> 



Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-13 Thread Arseniy Krasnov



On 13.12.2023 11:43, Stefano Garzarella wrote:
> On Tue, Dec 12, 2023 at 08:43:07PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 12.12.2023 19:12, Michael S. Tsirkin wrote:
>>> On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:
>>>>
>>>>
>>>> On 12.12.2023 18:54, Michael S. Tsirkin wrote:
>>>>> On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:
>>>>>> Hello,
>>>>>>
>>>>>>    DESCRIPTION
>>>>>>
>>>>>> This patchset fixes old problem with hungup of both rx/tx sides and adds
>>>>>> test for it. This happens due to non-default SO_RCVLOWAT value and
>>>>>> deferred credit update in virtio/vsock. Link to previous old patchset:
>>>>>> https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/
>>>>>
>>>>>
>>>>> Patchset:
>>>>>
>>>>> Acked-by: Michael S. Tsirkin 
>>>>
>>>> Thanks!
>>>>
>>>>>
>>>>>
>>>>> But I worry whether we actually need 3/8 in net not in net-next.
>>>>
>>>> Because of "Fixes" tag ? I think this problem is not critical and 
>>>> reproducible
>>>> only in special cases, but i'm not familiar with netdev process so good, 
>>>> so I don't
>>>> have strong opinion. I guess @Stefano knows better.
>>>>
>>>> Thanks, Arseniy
>>>
>>> Fixes means "if you have that other commit then you need this commit
>>> too". I think as a minimum you need to rearrange patches to make the
>>> fix go in first. We don't want a regression followed by a fix.
>>
>> I see, ok, @Stefano WDYT? I think rearrange doesn't break anything, because 
>> this
>> patch fixes problem that is not related with the new patches from this 
>> patchset.
> 
> I agree, patch 3 is for sure net material (I'm fine with both rearrangement 
> or send it separately), but IMHO also patch 2 could be.
> I think with the same fixes tag, since before commit b89d882dc9fc 
> ("vsock/virtio: reduce credit update messages") we sent a credit update
> for every bytes we read, so we should not have this problem, right?

Agree for 2, so I think I can rearrange: two fixes go first, then current 0001, 
and then tests. And send it as V9 for 'net' only ?

Thanks, Arseniy

> 
> So, maybe all the series could be "net".
> 
> Thanks,
> Stefano
> 



Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-12 Thread Arseniy Krasnov



On 12.12.2023 19:12, Michael S. Tsirkin wrote:
> On Tue, Dec 12, 2023 at 06:59:03PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 12.12.2023 18:54, Michael S. Tsirkin wrote:
>>> On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:
>>>> Hello,
>>>>
>>>>DESCRIPTION
>>>>
>>>> This patchset fixes old problem with hungup of both rx/tx sides and adds
>>>> test for it. This happens due to non-default SO_RCVLOWAT value and
>>>> deferred credit update in virtio/vsock. Link to previous old patchset:
>>>> https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/
>>>
>>>
>>> Patchset:
>>>
>>> Acked-by: Michael S. Tsirkin 
>>
>> Thanks!
>>
>>>
>>>
>>> But I worry whether we actually need 3/8 in net not in net-next.
>>
>> Because of "Fixes" tag ? I think this problem is not critical and 
>> reproducible
>> only in special cases, but i'm not familiar with netdev process so good, so 
>> I don't
>> have strong opinion. I guess @Stefano knows better.
>>
>> Thanks, Arseniy
> 
> Fixes means "if you have that other commit then you need this commit
> too". I think as a minimum you need to rearrange patches to make the
> fix go in first. We don't want a regression followed by a fix.

I see, ok, @Stefano WDYT? I think rearranging doesn't break anything, because this
patch fixes a problem that is not related to the new patches from this patchset.

Thanks, Arseniy

> 
>>>
>>> Thanks!
>>>
>>>> Here is what happens step by step:
>>>>
>>>>   TEST
>>>>
>>>> INITIAL CONDITIONS
>>>>
>>>> 1) Vsock buffer size is 128KB.
>>>> 2) Maximum packet size is also 64KB as defined in header (yes it is
>>>>hardcoded, just to remind about that value).
>>>> 3) SO_RCVLOWAT is default, e.g. 1 byte.
>>>>
>>>>
>>>>  STEPS
>>>>
>>>> SENDER  RECEIVER
>>>> 1) sends 128KB + 1 byte in a
>>>>single buffer. 128KB will
>>>>be sent, but for 1 byte
>>>>sender will wait for free
>>>>space at peer. Sender goes
>>>>to sleep.
>>>>
>>>>
>>>> 2) reads 64KB, credit update not sent
>>>> 3) sets SO_RCVLOWAT to 64KB + 1
>>>> 4) poll() -> wait forever, there is
>>>>only 64KB available to read.
>>>>
>>>> So in step 4) receiver also goes to sleep, waiting for enough data or
>>>> connection shutdown message from the sender. Idea to fix it is that rx
>>>> kicks tx side to continue transmission (and may be close connection)
>>>> when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
>>>> this value is bigger than number of available bytes to read.
>>>>
>>>> I've added small test for this, but not sure as it uses hardcoded value
>>>> for maximum packet length, this value is defined in kernel header and
>>>> used to control deferred credit update. And as this is not available to
>>>> userspace, I can't control test parameters correctly (if one day this
>>>> define will be changed - test may become useless). 
>>>>
>>>> Head for this patchset is:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=021b0c952f226236f2edf89c737efb9a28d1422d
>>>>
>>>> Link to v1:
>>>> https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
>>>> Link to v2:
>>>> https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
>>>> Link to v3:
>>>> https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
>>>> Link to v4:
>>>> https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
>>>> Link to v5:
>>>> https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
>>>> Link to v6:
>>>> https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
>>>> Link to v7:
&g

Re: [PATCH net-next v8 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-12 Thread Arseniy Krasnov



On 12.12.2023 19:11, Michael S. Tsirkin wrote:
> On Tue, Dec 12, 2023 at 06:50:39PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 12.12.2023 18:54, Michael S. Tsirkin wrote:
>>> On Tue, Dec 12, 2023 at 12:16:57AM +0300, Arseniy Krasnov wrote:
>>>> Add one more condition for sending credit update during dequeue from
>>>> stream socket: when number of bytes in the rx queue is smaller than
>>>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>>>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>>>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>>>> queue to wake up user for reading data (in corner case it is also
>>>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
>>>
>>> I don't get what does "to stuck both tx and rx sides" mean.
>>
>> I meant situation when tx waits for the free space, while rx doesn't send
>> credit update, just waiting for more data. Sorry for my English :)
>>
>>> Besides being agrammatical, is there a way to do this without
>>> playing with SO_RCVLOWAT?
>>
>> No, this may happen only with non-default SO_RCVLOWAT values (e.g. != 1)
>>
>> Thanks, Arseniy 
> 
> I am split on whether we need the Fixes tag. I guess if the other side
> is vhost with SO_RCVLOWAT then it might be stuck and it might apply
> without SO_RCVLOWAT on the local kernel?

IIUC your question: this problem applies to any transport: g2h, h2g and
loopback.

Thanks, Arseniy

> 
> 
>>>
>>>>
>>>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>>>> Signed-off-by: Arseniy Krasnov 
>>>> ---
>>>>  Changelog:
>>>>  v6 -> v7:
>>>>   * Handle wrap of 'fwd_cnt'.
>>>>   * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
>>>>  v7 -> v8:
>>>>   * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.
>>>>
>>>>  net/vmw_vsock/virtio_transport_common.c | 13 ++---
>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>>> b/net/vmw_vsock/virtio_transport_common.c
>>>> index e137d740804e..8572f94bba88 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>>struct virtio_vsock_sock *vvs = vsk->trans;
>>>>size_t bytes, total = 0;
>>>>struct sk_buff *skb;
>>>> +  u32 fwd_cnt_delta;
>>>> +  bool low_rx_bytes;
>>>>int err = -EFAULT;
>>>>u32 free_space;
>>>>  
>>>> @@ -601,7 +603,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>>}
>>>>}
>>>>  
>>>> -  free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>>>> +  fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
>>>> +  free_space = vvs->buf_alloc - fwd_cnt_delta;
>>>> +  low_rx_bytes = (vvs->rx_bytes <
>>>> +  sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
>>>>  
>>>>spin_unlock_bh(>rx_lock);
>>>>  
>>>> @@ -611,9 +616,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>> * too high causes extra messages. Too low causes transmitter
>>>> * stalls. As stalls are in theory more expensive than extra
>>>> * messages, we set the limit to a high value. TODO: experiment
>>>> -   * with different values.
>>>> +   * with different values. Also send credit update message when
>>>> +   * number of bytes in rx queue is not enough to wake up reader.
>>>> */
>>>> -  if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>>>> +  if (fwd_cnt_delta &&
>>>> +  (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
>>>>virtio_transport_send_credit_update(vsk);
>>>>  
>>>>return total;
>>>> -- 
>>>> 2.25.1
>>>
> 



Re: [PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-12 Thread Arseniy Krasnov



On 12.12.2023 18:54, Michael S. Tsirkin wrote:
> On Tue, Dec 12, 2023 at 12:16:54AM +0300, Arseniy Krasnov wrote:
>> Hello,
>>
>>DESCRIPTION
>>
>> This patchset fixes old problem with hungup of both rx/tx sides and adds
>> test for it. This happens due to non-default SO_RCVLOWAT value and
>> deferred credit update in virtio/vsock. Link to previous old patchset:
>> https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/
> 
> 
> Patchset:
> 
> Acked-by: Michael S. Tsirkin 

Thanks!

> 
> 
> But I worry whether we actually need 3/8 in net not in net-next.

Because of the "Fixes" tag? I think this problem is not critical and reproducible
only in special cases, but I'm not that familiar with the netdev process, so I 
don't
have a strong opinion. I guess @Stefano knows better.

Thanks, Arseniy

> 
> Thanks!
> 
>> Here is what happens step by step:
>>
>>   TEST
>>
>> INITIAL CONDITIONS
>>
>> 1) Vsock buffer size is 128KB.
>> 2) Maximum packet size is also 64KB as defined in header (yes it is
>>hardcoded, just to remind about that value).
>> 3) SO_RCVLOWAT is default, e.g. 1 byte.
>>
>>
>>  STEPS
>>
>> SENDER  RECEIVER
>> 1) sends 128KB + 1 byte in a
>>single buffer. 128KB will
>>be sent, but for 1 byte
>>sender will wait for free
>>space at peer. Sender goes
>>to sleep.
>>
>>
>> 2) reads 64KB, credit update not sent
>> 3) sets SO_RCVLOWAT to 64KB + 1
>> 4) poll() -> wait forever, there is
>>only 64KB available to read.
>>
>> So in step 4) receiver also goes to sleep, waiting for enough data or
>> connection shutdown message from the sender. Idea to fix it is that rx
>> kicks tx side to continue transmission (and may be close connection)
>> when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
>> this value is bigger than number of available bytes to read.
>>
>> I've added small test for this, but not sure as it uses hardcoded value
>> for maximum packet length, this value is defined in kernel header and
>> used to control deferred credit update. And as this is not available to
>> userspace, I can't control test parameters correctly (if one day this
>> define will be changed - test may become useless). 
>>
>> Head for this patchset is:
>> https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=021b0c952f226236f2edf89c737efb9a28d1422d
>>
>> Link to v1:
>> https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
>> Link to v2:
>> https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
>> Link to v3:
>> https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
>> Link to v4:
>> https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
>> Link to v5:
>> https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
>> Link to v6:
>> https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
>> Link to v7:
>> https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/
>>
>> Changelog:
>> v1 -> v2:
>>  * Patchset rebased and tested on new HEAD of net-next (see hash above).
>>  * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
>>callback in 'af_vsock.c' when transport callback is set - with that
>>we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
>>not copy-paste it to every transport. It was discussed in v1.
>>  * See per-patch changelog after ---.
>> v2 -> v3:
>>  * See changelog after --- in 0003 only (0001 and 0002 still same).
>> v3 -> v4:
>>  * Patchset rebased and tested on new HEAD of net-next (see hash above).
>>  * See per-patch changelog after ---.
>> v4 -> v5:
>>  * Change patchset tag 'RFC' -> 'net-next'.
>>  * See per-patch changelog after ---.
>> v5 -> v6:
>>  * New patch 0003 which sends credit update during reading bytes from
>>socket.
>>  * See per-patch changelog after ---.
>> v6 -> v7:
>>  * Patchset rebased and tested on new HEAD of net-next (see hash above).
>>  * See per-p

Re: [PATCH net-next v8 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-12 Thread Arseniy Krasnov



On 12.12.2023 18:54, Michael S. Tsirkin wrote:
> On Tue, Dec 12, 2023 at 12:16:57AM +0300, Arseniy Krasnov wrote:
>> Add one more condition for sending credit update during dequeue from
>> stream socket: when number of bytes in the rx queue is smaller than
>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>> queue to wake up user for reading data (in corner case it is also
>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
> 
> I don't get what does "to stuck both tx and rx sides" mean.

I meant the situation when tx waits for free space, while rx doesn't send a
credit update and just waits for more data. Sorry for my English :)

> Besides being agrammatical, is there a way to do this without
> playing with SO_RCVLOWAT?

No, this may happen only with non-default SO_RCVLOWAT values (e.g. != 1)

Thanks, Arseniy 

> 
>>
>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>> Signed-off-by: Arseniy Krasnov 
>> ---
>>  Changelog:
>>  v6 -> v7:
>>   * Handle wrap of 'fwd_cnt'.
>>   * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
>>  v7 -> v8:
>>   * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.
>>
>>  net/vmw_vsock/virtio_transport_common.c | 13 ++---
>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>> b/net/vmw_vsock/virtio_transport_common.c
>> index e137d740804e..8572f94bba88 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>  struct virtio_vsock_sock *vvs = vsk->trans;
>>  size_t bytes, total = 0;
>>  struct sk_buff *skb;
>> +u32 fwd_cnt_delta;
>> +bool low_rx_bytes;
>>  int err = -EFAULT;
>>  u32 free_space;
>>  
>> @@ -601,7 +603,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>  }
>>  }
>>  
>> -free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>> +fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
>> +free_space = vvs->buf_alloc - fwd_cnt_delta;
>> +low_rx_bytes = (vvs->rx_bytes <
>> +sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
>>  
>>  spin_unlock_bh(>rx_lock);
>>  
>> @@ -611,9 +616,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>   * too high causes extra messages. Too low causes transmitter
>>   * stalls. As stalls are in theory more expensive than extra
>>   * messages, we set the limit to a high value. TODO: experiment
>> - * with different values.
>> + * with different values. Also send credit update message when
>> + * number of bytes in rx queue is not enough to wake up reader.
>>   */
>> -if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>> +if (fwd_cnt_delta &&
>> +(free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
>>  virtio_transport_send_credit_update(vsk);
>>  
>>  return total;
>> -- 
>> 2.25.1
> 



[PATCH net-next v8 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-11 Thread Arseniy Krasnov
Add one more condition for sending a credit update during dequeue from a
stream socket: when the number of bytes in the rx queue is smaller than the
SO_RCVLOWAT value of the socket. This is relevant for a non-default value
of SO_RCVLOWAT (e.g. not 1) - the idea is to "kick" the peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up the user for reading data (in a corner case it is also
possible for both tx and rx sides to get stuck, which is why 'Fixes' is used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
   * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
 v7 -> v8:
  * Remove unneeded/wrong handling of wrap for 'fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..8572f94bba88 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -601,7 +603,10 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}
 
-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
spin_unlock_bh(>rx_lock);
 
@@ -611,9 +616,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1




[PATCH net-next v8 4/4] vsock/test: two tests to check credit update logic

2023-12-11 Thread Arseniy Krasnov
Both tests are almost the same, differing only in two 'if' conditions, so they
are implemented in a single function. The tests check that a credit update
message is sent:

1) During setting the SO_RCVLOWAT value of the socket.
2) When the number of 'rx_bytes' becomes smaller than the SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
   * Replace 'recv_buf()' with 'recv(MSG_DONTWAIT)' in the last read operation
 in the server part. This is needed to ensure that 'poll()' wakes us up
 when the number of bytes ready to read is equal to the SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes if our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEE

[PATCH net-next v8 0/4] send credit update during setting SO_RCVLOWAT

2023-12-11 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes an old problem with a hang-up of both rx/tx sides and adds
a test for it. This happens due to a non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) the receiver also goes to sleep, waiting for enough data or a
connection shutdown message from the sender. The idea to fix it is that rx
kicks the tx side to continue transmission (and maybe close the connection)
when rx changes the number of bytes it is to be woken up at (e.g. SO_RCVLOWAT)
and this value is bigger than the number of available bytes to read.

I've added a small test for this, but I'm not sure about it, as it uses a
hardcoded value for the maximum packet length; this value is defined in a
kernel header and used to control deferred credit update. And as this is not
available to userspace, I can't control the test parameters correctly (if one
day this define is changed, the test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=021b0c952f226236f2edf89c737efb9a28d1422d

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/
Link to v7:
https://lore.kernel.org/netdev/20231206211849.2707151-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.
v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v7 -> v8:
 * See per-patch changelog after ---.

Arseniy Krasnov (4):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  virtio/vsock: fix logic which reduces credit update messages
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  43 +-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 229 insertions(+), 8 deletions(-)

-- 
2.25.1




[PATCH net-next v8 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-11 Thread Arseniy Krasnov
Send a credit update message when SO_RCVLOWAT is updated and it is bigger
than the number of bytes in the rx queue. It is needed because 'poll()' will
wait until the number of bytes in the rx queue is not smaller than
SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual hang-up
of tx/rx is possible: the sender waits for free space and the receiver is
waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
 v4 -> v5:
  * Do not change callbacks order in transport structures.
 v5 -> v6:
  * Reorder callbacks in transport structures.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 5 files changed, 34 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..e137d740804e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(>rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side. Also
+* don't send credit update when peer already knows actual value -
+* such transmission will be useless.
+*/
+   send_update = (vvs->rx_bytes < val) &&
+ (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+   spin_unlock_bh(>rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,6 +96,7 @@ static struct virtio_transport loopback_transport = {
.notify_send_pre_enqueue  = 
virtio_tr

[PATCH net-next v8 1/4] vsock: update SO_RCVLOWAT setting callback

2023-12-11 Thread Arseniy Krasnov
Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
to 'notify_set_rcvlowat'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v3 -> v4:
  * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
  * Commit message updated.

 include/net/af_vsock.h   | 2 +-
 net/vmw_vsock/af_vsock.c | 9 +++--
 net/vmw_vsock/hyperv_transport.c | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
-- 
2.25.1




Re: [PATCH net-next v7 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-11 Thread Arseniy Krasnov



On 11.12.2023 15:01, Stefano Garzarella wrote:
> On Thu, Dec 07, 2023 at 01:50:05AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 07.12.2023 01:08, Michael S. Tsirkin wrote:
>>> On Thu, Dec 07, 2023 at 12:52:51AM +0300, Arseniy Krasnov wrote:
>>>>
>>>>
>>>> On 07.12.2023 00:53, Michael S. Tsirkin wrote:
>>>>> On Thu, Dec 07, 2023 at 12:18:48AM +0300, Arseniy Krasnov wrote:
>>>>>> Add one more condition for sending credit update during dequeue from
>>>>>> stream socket: when number of bytes in the rx queue is smaller than
>>>>>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>>>>>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>>>>>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>>>>>> queue to wake up user for reading data (in corner case it is also
>>>>>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
>>>>>> Also handle case when 'fwd_cnt' wraps, while 'last_fwd_cnt' is still
>>>>>> not.
>>>>>>
>>>>>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>>>>>> Signed-off-by: Arseniy Krasnov 
>>>>>> ---
>>>>>>  Changelog:
>>>>>>  v6 -> v7:
>>>>>>   * Handle wrap of 'fwd_cnt'.
>>>>>>   * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
>>>>>>
>>>>>>  net/vmw_vsock/virtio_transport_common.c | 18 +++---
>>>>>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>>>>> b/net/vmw_vsock/virtio_transport_common.c
>>>>>> index e137d740804e..39f8660d825d 100644
>>>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>>>> @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>>>> *vsk,
>>>>>>  struct virtio_vsock_sock *vvs = vsk->trans;
>>>>>>  size_t bytes, total = 0;
>>>>>>  struct sk_buff *skb;
>>>>>> +    u32 fwd_cnt_delta;
>>>>>> +    bool low_rx_bytes;
>>>>>>  int err = -EFAULT;
>>>>>>  u32 free_space;
>>>>>>
>>>>>> @@ -601,7 +603,15 @@ virtio_transport_stream_do_dequeue(struct 
>>>>>> vsock_sock *vsk,
>>>>>>  }
>>>>>>  }
>>>>>>
>>>>>> -    free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>>>>>> +    /* Handle wrap of 'fwd_cnt'. */
>>>>>> +    if (vvs->fwd_cnt < vvs->last_fwd_cnt)
>>>>>> +    fwd_cnt_delta = vvs->fwd_cnt + (U32_MAX - vvs->last_fwd_cnt);
>>>>>
>>>>> Are you sure there's no off by one here? for example if fwd_cnt is 0
>>>>> and last_fwd_cnt is 0xf then apparently delta is 0.
>>>>
>>>> Seems yes, I need +1 here
>>>
>>> And then you will get a nop, because assigning U32_MAX + 1 to u32
>>> gives you 0. Adding () does nothing to change the result,
>>> + and - are commutative.
>>
>> Ahh, unsigned here, yes.
> 
> Ooops, sorry I was confused here!
> 
>>
>> @Stefano, what did You mean about wrapping here?
>>
>> I think Michael is right, for example
> 
> Yep, I agree!
> Sorry for this wrong suggestion!

Got it! I'll remove it, no problem 

Thanks, Arseniy

> 
> Stefano
> 
>>
>> vvs->fwd_cnt wraps and now == 5
>> vvs->last_fwd_cnt == 0x
>>
>> now delta before this patch will be 6 - correct value
>>
>> May be I didn't get your idea, so implement it very naive?
>>
>> Thanks, Arseniy
>>
>>>
>>>
>>>>>
>>>>>
>>>>>> +    else
>>>>>> +    fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
>>>>>
>>>>> I actually don't see what is wrong with just
>>>>> fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt
>>>>> 32 bit unsigned math will I think handle wrap around correctly.
>>>>>
>>>>> And given buf_alloc is also u32 - I don't see where the bug is in
>>>>> the original code.
>>>>
>>>> I think problem is when fwd_cnt wraps, while last_fwd_cnt is not. In this
>>>> case fwd_cnt_delta will be too big, so we won't send credit update which
>>>> leads to stall for sender
>>>>
>>>> Thanks, Arseniy
>>>
>>> Care coming up with an example?
>>>
>>>
>>>>>
>>>>>
>>>>>> +
>>>>>> +    free_space = vvs->buf_alloc - fwd_cnt_delta;
>>>>>> +    low_rx_bytes = (vvs->rx_bytes <
>>>>>> +    sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
>>>>>>
>>>>>>  spin_unlock_bh(>rx_lock);
>>>>>>
>>>>>> @@ -611,9 +621,11 @@ virtio_transport_stream_do_dequeue(struct 
>>>>>> vsock_sock *vsk,
>>>>>>   * too high causes extra messages. Too low causes transmitter
>>>>>>   * stalls. As stalls are in theory more expensive than extra
>>>>>>   * messages, we set the limit to a high value. TODO: experiment
>>>>>> - * with different values.
>>>>>> + * with different values. Also send credit update message when
>>>>>> + * number of bytes in rx queue is not enough to wake up reader.
>>>>>>   */
>>>>>> -    if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>>>>>> +    if (fwd_cnt_delta &&
>>>>>> +    (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
>>>>>>  virtio_transport_send_credit_update(vsk);
>>>>>>
>>>>>>  return total;
>>>>>> -- 
>>>>>> 2.25.1
>>>>>
>>>
>>
> 



Re: [PATCH net-next v7 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-06 Thread Arseniy Krasnov



On 07.12.2023 01:08, Michael S. Tsirkin wrote:
> On Thu, Dec 07, 2023 at 12:52:51AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 07.12.2023 00:53, Michael S. Tsirkin wrote:
>>> On Thu, Dec 07, 2023 at 12:18:48AM +0300, Arseniy Krasnov wrote:
>>>> Add one more condition for sending credit update during dequeue from
>>>> stream socket: when number of bytes in the rx queue is smaller than
>>>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>>>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>>>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>>>> queue to wake up user for reading data (in corner case it is also
>>>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
>>>> Also handle case when 'fwd_cnt' wraps, while 'last_fwd_cnt' is still
>>>> not.
>>>>
>>>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>>>> Signed-off-by: Arseniy Krasnov 
>>>> ---
>>>>  Changelog:
>>>>  v6 -> v7:
>>>>   * Handle wrap of 'fwd_cnt'.
>>>>   * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
>>>>
>>>>  net/vmw_vsock/virtio_transport_common.c | 18 +++---
>>>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>>> b/net/vmw_vsock/virtio_transport_common.c
>>>> index e137d740804e..39f8660d825d 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>>struct virtio_vsock_sock *vvs = vsk->trans;
>>>>size_t bytes, total = 0;
>>>>struct sk_buff *skb;
>>>> +  u32 fwd_cnt_delta;
>>>> +  bool low_rx_bytes;
>>>>int err = -EFAULT;
>>>>u32 free_space;
>>>>  
>>>> @@ -601,7 +603,15 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>>}
>>>>}
>>>>  
>>>> -  free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>>>> +  /* Handle wrap of 'fwd_cnt'. */
>>>> +  if (vvs->fwd_cnt < vvs->last_fwd_cnt)
>>>> +  fwd_cnt_delta = vvs->fwd_cnt + (U32_MAX - vvs->last_fwd_cnt);
>>>
>>> Are you sure there's no off by one here? for example if fwd_cnt is 0
>>> and last_fwd_cnt is 0xf then apparently delta is 0.
>>
>> Seems yes, I need +1 here
> 
> And then you will get a nop, because assigning U32_MAX + 1 to u32
> gives you 0. Adding () does nothing to change the result,
> + and - are commutative.

Ahh, unsigned here, yes.

@Stefano, what did You mean about wrapping here?

I think Michael is right, for example

vvs->fwd_cnt wraps and now == 5
vvs->last_fwd_cnt == 0x

now delta before this patch will be 6 - correct value

May be I didn't get your idea, so implement it very naive?

Thanks, Arseniy

> 
> 
>>>
>>>
>>>> +  else
>>>> +  fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
>>>
>>> I actually don't see what is wrong with just
>>> fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt
>>> 32 bit unsigned math will I think handle wrap around correctly.
>>>
>>> And given buf_alloc is also u32 - I don't see where the bug is in
>>> the original code.
>>
>> I think problem is when fwd_cnt wraps, while last_fwd_cnt is not. In this
>> case fwd_cnt_delta will be too big, so we won't send credit update which
>> leads to stall for sender
>>
>> Thanks, Arseniy
> 
> Care coming up with an example?
> 
> 
>>>
>>>
>>>> +
>>>> +  free_space = vvs->buf_alloc - fwd_cnt_delta;
>>>> +  low_rx_bytes = (vvs->rx_bytes <
>>>> +  sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
>>>>  
>>>>spin_unlock_bh(>rx_lock);
>>>>  
>>>> @@ -611,9 +621,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>> * too high causes extra messages. Too low causes transmitter
>>>> * stalls. As stalls are in theory more expensive than extra
>>>> * messages, we set the limit to a high value. TODO: experiment
>>>> -   * with different values.
>>>> +   * with different values. Also send credit update message when
>>>> +   * number of bytes in rx queue is not enough to wake up reader.
>>>> */
>>>> -  if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>>>> +  if (fwd_cnt_delta &&
>>>> +  (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
>>>>virtio_transport_send_credit_update(vsk);
>>>>  
>>>>return total;
>>>> -- 
>>>> 2.25.1
>>>
> 



Re: [PATCH net-next v7 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-06 Thread Arseniy Krasnov



On 07.12.2023 00:53, Michael S. Tsirkin wrote:
> On Thu, Dec 07, 2023 at 12:18:48AM +0300, Arseniy Krasnov wrote:
>> Add one more condition for sending credit update during dequeue from
>> stream socket: when number of bytes in the rx queue is smaller than
>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>> queue to wake up user for reading data (in corner case it is also
>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
>> Also handle case when 'fwd_cnt' wraps, while 'last_fwd_cnt' is still
>> not.
>>
>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>> Signed-off-by: Arseniy Krasnov 
>> ---
>>  Changelog:
>>  v6 -> v7:
>>   * Handle wrap of 'fwd_cnt'.
>>   * Do to send credit update when 'fwd_cnt' == 'last_fwd_cnt'.
>>
>>  net/vmw_vsock/virtio_transport_common.c | 18 +++---
>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>> b/net/vmw_vsock/virtio_transport_common.c
>> index e137d740804e..39f8660d825d 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>  struct virtio_vsock_sock *vvs = vsk->trans;
>>  size_t bytes, total = 0;
>>  struct sk_buff *skb;
>> +u32 fwd_cnt_delta;
>> +bool low_rx_bytes;
>>  int err = -EFAULT;
>>  u32 free_space;
>>  
>> @@ -601,7 +603,15 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>  }
>>  }
>>  
>> -free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>> +/* Handle wrap of 'fwd_cnt'. */
>> +if (vvs->fwd_cnt < vvs->last_fwd_cnt)
>> +fwd_cnt_delta = vvs->fwd_cnt + (U32_MAX - vvs->last_fwd_cnt);
> 
> Are you sure there's no off by one here? for example if fwd_cnt is 0
> and last_fwd_cnt is 0xf then apparently delta is 0.

Seems yes, I need +1 here

> 
> 
>> +else
>> +fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
> 
> I actually don't see what is wrong with just
>   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt
> 32 bit unsigned math will I think handle wrap around correctly.
> 
> And given buf_alloc is also u32 - I don't see where the bug is in
> the original code.

I think problem is when fwd_cnt wraps, while last_fwd_cnt is not. In this
case fwd_cnt_delta will be too big, so we won't send credit update which
leads to stall for sender

Thanks, Arseniy

> 
> 
>> +
>> +free_space = vvs->buf_alloc - fwd_cnt_delta;
>> +low_rx_bytes = (vvs->rx_bytes <
>> +sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
>>  
>>  spin_unlock_bh(>rx_lock);
>>  
>> @@ -611,9 +621,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>   * too high causes extra messages. Too low causes transmitter
>>   * stalls. As stalls are in theory more expensive than extra
>>   * messages, we set the limit to a high value. TODO: experiment
>> - * with different values.
>> + * with different values. Also send credit update message when
>> + * number of bytes in rx queue is not enough to wake up reader.
>>   */
>> -if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>> +if (fwd_cnt_delta &&
>> +(free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
>>  virtio_transport_send_credit_update(vsk);
>>  
>>  return total;
>> -- 
>> 2.25.1
> 



[PATCH net-next v7 4/4] vsock/test: two tests to check credit update logic

2023-12-06 Thread Arseniy Krasnov
Both tests are almost same, only differs in two 'if' conditions, so
implemented in a single function. Tests check, that credit update
message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
  * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
in server part. This is needed to ensure that 'poll()' wake up us
when number of bytes ready to read is equal to SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes in our rx queue becomes <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEE

[PATCH net-next v7 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-06 Thread Arseniy Krasnov
Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hangup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
 v4 -> v5:
  * Do not change callbacks order in transport structures.
 v5 -> v6:
  * Reorder callbacks in transport structures.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 5 files changed, 34 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..e137d740804e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(>rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side. Also
+* don't send credit update when peer already knows actual value -
+* such transmission will be useless.
+*/
+   send_update = (vvs->rx_bytes < val) &&
+ (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+   spin_unlock_bh(>rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,6 +96,7 @@ static struct virtio_transport loopback_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,

[PATCH net-next v7 0/4] send credit update during setting SO_RCVLOWAT

2023-12-06 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=021b0c952f226236f2edf89c737efb9a28d1422d

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/
Link to v6:
https://lore.kernel.org/netdev/20231205064806.2851305-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.
v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.

Arseniy Krasnov (4):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  virtio/vsock: fix logic which reduces credit update messages
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  48 ++-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 234 insertions(+), 8 deletions(-)

-- 
2.25.1




[PATCH net-next v7 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-06 Thread Arseniy Krasnov
Add one more condition for sending credit update during dequeue from
stream socket: when number of bytes in the rx queue is smaller than
SO_RCVLOWAT value of the socket. This is actual for non-default value
of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up user for reading data (in corner case it is also
possible to stuck both tx and rx sides, this is why 'Fixes' is used).
Also handle case when 'fwd_cnt' wraps, while 'last_fwd_cnt' is still
not.

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v6 -> v7:
  * Handle wrap of 'fwd_cnt'.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.

 net/vmw_vsock/virtio_transport_common.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..39f8660d825d 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   u32 fwd_cnt_delta;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -601,7 +603,15 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
}
 
-   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   /* Handle wrap of 'fwd_cnt'. */
+   if (vvs->fwd_cnt < vvs->last_fwd_cnt)
+   fwd_cnt_delta = vvs->fwd_cnt + (U32_MAX - vvs->last_fwd_cnt);
+   else
+   fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
+
+   free_space = vvs->buf_alloc - fwd_cnt_delta;
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
spin_unlock_bh(>rx_lock);
 
@@ -611,9 +621,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (fwd_cnt_delta &&
+   (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || low_rx_bytes))
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1




[PATCH net-next v7 1/4] vsock: update SO_RCVLOWAT setting callback

2023-12-06 Thread Arseniy Krasnov
Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
to 'notify_set_rcvlowat'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v3 -> v4:
  * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
  * Commit message updated.

 include/net/af_vsock.h   | 2 +-
 net/vmw_vsock/af_vsock.c | 9 +++--
 net/vmw_vsock/hyperv_transport.c | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
-- 
2.25.1




Re: [PATCH net] vsock/virtio: fix "comparison of distinct pointer types lacks a cast" warning

2023-12-06 Thread Arseniy Krasnov



On 06.12.2023 19:41, Stefano Garzarella wrote:
> After backporting commit 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY
> flag support") in CentOS Stream 9, CI reported the following error:
> 
> In file included from ./include/linux/kernel.h:17,
>  from ./include/linux/list.h:9,
>  from ./include/linux/preempt.h:11,
>  from ./include/linux/spinlock.h:56,
>  from net/vmw_vsock/virtio_transport_common.c:9:
> net/vmw_vsock/virtio_transport_common.c: In function 
> ‘virtio_transport_can_zcopy‘:
> ./include/linux/minmax.h:20:35: error: comparison of distinct pointer 
> types lacks a cast [-Werror]
>20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
>   |   ^~
> ./include/linux/minmax.h:26:18: note: in expansion of macro ‘__typecheck‘
>26 | (__typecheck(x, y) && __no_side_effects(x, y))
>   |  ^~~
> ./include/linux/minmax.h:36:31: note: in expansion of macro ‘__safe_cmp‘
>36 | __builtin_choose_expr(__safe_cmp(x, y), \
>   |   ^~
> ./include/linux/minmax.h:45:25: note: in expansion of macro 
> ‘__careful_cmp‘
>45 | #define min(x, y)   __careful_cmp(x, y, <)
>   | ^
> net/vmw_vsock/virtio_transport_common.c:63:37: note: in expansion of 
> macro ‘min‘
>63 | int pages_to_send = min(pages_in_iov, 
> MAX_SKB_FRAGS);
> 
> We could solve it by using min_t(), but this operation seems entirely
> unnecessary, because we also pass MAX_SKB_FRAGS to iov_iter_npages(),
> which performs almost the same check, returning at most MAX_SKB_FRAGS
> elements. So, let's eliminate this unnecessary comparison.
> 
> Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support")
> Cc: avkras...@salutedevices.com
> Signed-off-by: Stefano Garzarella 
> ---

Reviewed-by: Arseniy Krasnov 

>  net/vmw_vsock/virtio_transport_common.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/net/vmw_vsock/virtio_transport_common.c 
> b/net/vmw_vsock/virtio_transport_common.c
> index f6dc896bf44c..c8e162c9d1df 100644
> --- a/net/vmw_vsock/virtio_transport_common.c
> +++ b/net/vmw_vsock/virtio_transport_common.c
> @@ -59,8 +59,7 @@ static bool virtio_transport_can_zcopy(const struct 
> virtio_transport *t_ops,
>   t_ops = virtio_transport_get_ops(info->vsk);
>  
>   if (t_ops->can_msgzerocopy) {
> - int pages_in_iov = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
> - int pages_to_send = min(pages_in_iov, MAX_SKB_FRAGS);
> + int pages_to_send = iov_iter_npages(iov_iter, MAX_SKB_FRAGS);
>  
>   /* +1 is for packet header. */
>   return t_ops->can_msgzerocopy(pages_to_send + 1);



Re: [PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-05 Thread Arseniy Krasnov



On 05.12.2023 17:21, Stefano Garzarella wrote:
> On Tue, Dec 05, 2023 at 03:07:47PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 05.12.2023 13:54, Stefano Garzarella wrote:
>>> On Tue, Dec 05, 2023 at 09:48:05AM +0300, Arseniy Krasnov wrote:
>>>> Add one more condition for sending credit update during dequeue from
>>>> stream socket: when number of bytes in the rx queue is smaller than
>>>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>>>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>>>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>>>> queue to wake up user for reading data (in corner case it is also
>>>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
>>>>
>>>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>>>> Signed-off-by: Arseniy Krasnov 
>>>> ---
>>>> net/vmw_vsock/virtio_transport_common.c | 9 +++--
>>>> 1 file changed, 7 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>>> b/net/vmw_vsock/virtio_transport_common.c
>>>> index e137d740804e..461c89882142 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>> struct virtio_vsock_sock *vvs = vsk->trans;
>>>> size_t bytes, total = 0;
>>>> struct sk_buff *skb;
>>>> +    bool low_rx_bytes;
>>>> int err = -EFAULT;
>>>> u32 free_space;
>>>>
>>>> @@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>>>> *vsk,
>>>> }
>>>>
>>>> free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>>>> +    low_rx_bytes = (vvs->rx_bytes <
>>>> +    sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
>>>
>>> As in the previous patch, should we avoid the update it if `fwd_cnt` and 
>>> `last_fwd_cnt` are the same?
>>>
>>> Now I'm thinking if it is better to add that check directly in 
>>> virtio_transport_send_credit_update().
>>
>> Good point, but I think, that it is better to keep this check here, because 
>> access to 'fwd_cnt' and 'last_fwd_cnt'
>> requires taking rx_lock - so I guess it is better to avoid taking this lock 
>> every time in 'virtio_transport_send_credit_update()'.
> 
> Yeah, I agree.
> 
>> So may be we can do something like:
>>
>>
>> fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
>> free_space = vvs->buf_alloc - fwd_cnt_delta;
> 
> Pre-existing issue, but should we handle the wrap (e.g. fwd_cnt wrapped, but 
> last_fwd_cnt not yet?). Maybe in that case we can foce the status
> update.

Agree, I'll add this logic!

> 
>>
>> and then, after lock is released:
>>
>> if (fwd_cnt_delta && (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
>>    low_rx_bytes))
>>    virtio_transport_send_credit_update(vsk);
>>
>> WDYT?
> 
> Yep, I agree.
> 
>>
>> Also, I guess that next idea to update this optimization(in next patchset), 
>> is to make
>> threshold depends on vvs->buf_alloc. Because if someone changes minimum 
>> buffer size to
>> for example 32KB, and then sets buffer size to 32KB, then free_space will be 
>> always
>> non-zero, thus optimization is off now and credit update is sent on every 
>> read.
> 
> But does it make sense to allow a buffer smaller than 
> VIRTIO_VSOCK_MAX_PKT_BUF_SIZE?
> 
> Maybe we should fail in virtio_transport_notify_buffer_size() or use it as 
> minimum.

Yes, currently there is no limitation in this transport callback - only for 
maximum.

Thanks, Arseniy

> 
> Stefano
> 



Re: [PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-05 Thread Arseniy Krasnov



On 05.12.2023 13:54, Stefano Garzarella wrote:
> On Tue, Dec 05, 2023 at 09:48:05AM +0300, Arseniy Krasnov wrote:
>> Add one more condition for sending credit update during dequeue from
>> stream socket: when number of bytes in the rx queue is smaller than
>> SO_RCVLOWAT value of the socket. This is actual for non-default value
>> of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
>> transmission, because we need at least SO_RCVLOWAT bytes in our rx
>> queue to wake up user for reading data (in corner case it is also
>> possible to stuck both tx and rx sides, this is why 'Fixes' is used).
>>
>> Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
>> Signed-off-by: Arseniy Krasnov 
>> ---
>> net/vmw_vsock/virtio_transport_common.c | 9 +++--
>> 1 file changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>> b/net/vmw_vsock/virtio_transport_common.c
>> index e137d740804e..461c89882142 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>> struct virtio_vsock_sock *vvs = vsk->trans;
>> size_t bytes, total = 0;
>> struct sk_buff *skb;
>> +    bool low_rx_bytes;
>> int err = -EFAULT;
>> u32 free_space;
>>
>> @@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>> }
>>
>> free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
>> +    low_rx_bytes = (vvs->rx_bytes <
>> +    sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
> 
> As in the previous patch, should we avoid the update it if `fwd_cnt` and 
> `last_fwd_cnt` are the same?
> 
> Now I'm thinking if it is better to add that check directly in 
> virtio_transport_send_credit_update().

Good point, but I think, that it is better to keep this check here, because 
access to 'fwd_cnt' and 'last_fwd_cnt'
requires taking rx_lock - so I guess it is better to avoid taking this lock 
every time in 'virtio_transport_send_credit_update()'.
So may be we can do something like:


fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
free_space = vvs->buf_alloc - fwd_cnt_delta;

and then, after lock is released:

if (fwd_cnt_delta && (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
low_rx_bytes))
virtio_transport_send_credit_update(vsk);

WDYT?

Also, I guess that next idea to update this optimization(in next patchset), is 
to make
threshold depends on vvs->buf_alloc. Because if someone changes minimum buffer 
size to
for example 32KB, and then sets buffer size to 32KB, then free_space will be 
always
non-zero, thus optimization is off now and credit update is sent on every read.

Thanks, Arseniy

> 
> Stefano
> 
>>
>> spin_unlock_bh(>rx_lock);
>>
>> @@ -611,9 +614,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock 
>> *vsk,
>>  * too high causes extra messages. Too low causes transmitter
>>  * stalls. As stalls are in theory more expensive than extra
>>  * messages, we set the limit to a high value. TODO: experiment
>> - * with different values.
>> + * with different values. Also send credit update message when
>> + * number of bytes in rx queue is not enough to wake up reader.
>>  */
>> -    if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
>> +    if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
>> +    low_rx_bytes)
>>     virtio_transport_send_credit_update(vsk);
>>
>> return total;
>> -- 
>> 2.25.1
>>
> 



[PATCH net-next v6 3/4] virtio/vsock: fix logic which reduces credit update messages

2023-12-04 Thread Arseniy Krasnov
Add one more condition for sending credit update during dequeue from
stream socket: when number of bytes in the rx queue is smaller than
SO_RCVLOWAT value of the socket. This is actual for non-default value
of SO_RCVLOWAT (e.g. not 1) - idea is to "kick" peer to continue data
transmission, because we need at least SO_RCVLOWAT bytes in our rx
queue to wake up user for reading data (in a corner case it is also
possible for both tx and rx sides to get stuck, this is why 'Fixes' is used).

Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages")
Signed-off-by: Arseniy Krasnov 
---
 net/vmw_vsock/virtio_transport_common.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e137d740804e..461c89882142 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -558,6 +558,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
size_t bytes, total = 0;
struct sk_buff *skb;
+   bool low_rx_bytes;
int err = -EFAULT;
u32 free_space;
 
@@ -602,6 +603,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
}
 
free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+   low_rx_bytes = (vvs->rx_bytes <
+   sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
 
spin_unlock_bh(>rx_lock);
 
@@ -611,9 +614,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 * too high causes extra messages. Too low causes transmitter
 * stalls. As stalls are in theory more expensive than extra
 * messages, we set the limit to a high value. TODO: experiment
-* with different values.
+* with different values. Also send credit update message when
+* number of bytes in rx queue is not enough to wake up reader.
 */
-   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
+   low_rx_bytes)
virtio_transport_send_credit_update(vsk);
 
return total;
-- 
2.25.1




[PATCH net-next v6 0/4] send credit update during setting SO_RCVLOWAT

2023-12-04 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define is changed, the test may become useless).

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f1be1e04c76bb9c44789d3575bba4418cf0ea359

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/
Link to v5:
https://lore.kernel.org/netdev/20231130130840.253733-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.
v5 -> v6:
 * New patch 0003 which sends credit update during reading bytes from
   socket.
 * See per-patch changelog after ---.

Arseniy Krasnov (4):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  virtio/vsock: fix logic which reduces credit update messages
  vsock/test: two tests to check credit update logic

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  39 +-
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 175 
 9 files changed, 226 insertions(+), 7 deletions(-)

-- 
2.25.1




[PATCH net-next v6 2/4] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-04 Thread Arseniy Krasnov
Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
 v4 -> v5:
  * Do not change callbacks order in transport structures.
 v5 -> v6:
  * Reorder callbacks in transport structures.
  * Do not send credit update when 'fwd_cnt' == 'last_fwd_cnt'.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 30 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 5 files changed, 34 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ec20ecff85c7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,6 +449,7 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..f495b9e5186b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,6 +537,7 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
.read_skb = virtio_transport_read_skb,
},
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..e137d740804e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,36 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(>rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side. Also
+* don't send credit update when peer already knows actual value -
+* such transmission will be useless.
+*/
+   send_update = (vvs->rx_bytes < val) &&
+ (vvs->fwd_cnt != vvs->last_fwd_cnt);
+
+   spin_unlock_bh(>rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..6dea6119f5b2 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,6 +96,7 @@ static struct virtio_transport loopback_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,

[PATCH net-next v6 4/4] vsock/test: two tests to check credit update logic

2023-12-04 Thread Arseniy Krasnov
Both tests are almost same, only differs in two 'if' conditions, so
implemented in a single function. Tests check, that credit update
message is sent:

1) During setting SO_RCVLOWAT value of the socket.
2) When number of 'rx_bytes' become smaller than SO_RCVLOWAT value.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.
 v3 -> v4:
  * Replace 'recv_buf()' to 'recv(MSG_DONTWAIT)' in last read operation
in server part. This is needed to ensure that 'poll()' wake up us
when number of bytes ready to read is equal to SO_RCVLOWAT value.
 v4 -> v5:
  * Use 'recv_buf(MSG_DONTWAIT)' instead of 'recv(MSG_DONTWAIT)'.
 v5 -> v6:
  * Add second test which checks, that credit update is sent during
reading data from socket.
  * Update commit message.

 tools/testing/vsock/vsock_test.c | 175 +++
 1 file changed, 175 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 01fa816868bc..66246d81d654 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1232,6 +1232,171 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_credit_update_test(const struct test_opts *opts,
+  bool low_rx_bytes_test)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   if (low_rx_bytes_test) {
+   /* Set new SO_RCVLOWAT here. This enables sending credit
+* update when number of bytes if our rx queue become <
+* SO_RCVLOWAT value.
+*/
+   recv_buf_size = 1 + VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+

[PATCH net-next v6 1/4] vsock: update SO_RCVLOWAT setting callback

2023-12-04 Thread Arseniy Krasnov
Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
to 'notify_set_rcvlowat'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v3 -> v4:
  * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
  * Commit message updated.

 include/net/af_vsock.h   | 2 +-
 net/vmw_vsock/af_vsock.c | 9 +++--
 net/vmw_vsock/hyperv_transport.c | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
-- 
2.25.1




Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-02 Thread Arseniy Krasnov



On 02.12.2023 23:22, Michael S. Tsirkin wrote:
> On Fri, Dec 01, 2023 at 01:40:41PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 01.12.2023 12:48, Stefano Garzarella wrote:
>>> On Fri, Dec 01, 2023 at 11:35:56AM +0300, Arseniy Krasnov wrote:
>>>>
>>>>
>>>> On 01.12.2023 11:27, Stefano Garzarella wrote:
>>>>> On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote:
>>>>>> On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote:
>>>>>>> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
>>>>>>>> On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>>>>>>>>> On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>>>>>>>>>> Send credit update message when SO_RCVLOWAT is updated and it is 
>>>>>>>>>>> bigger
>>>>>>>>>>> than number of bytes in rx queue. It is needed, because 'poll()' 
>>>>>>>>>>> will
>>>>>>>>>>> wait until number of bytes in rx queue will be not smaller than
>>>>>>>>>>> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual 
>>>>>>>>>>> hungup
>>>>>>>>>>> for tx/rx is possible: sender waits for free space and receiver is
>>>>>>>>>>> waiting data in 'poll()'.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Arseniy Krasnov 
>>>>>>>>>>> ---
>>>>>>>>>>>   Changelog:
>>>>>>>>>>>   v1 -> v2:
>>>>>>>>>>>    * Update commit message by removing 'This patch adds XXX' manner.
>>>>>>>>>>>    * Do not initialize 'send_update' variable - set it directly 
>>>>>>>>>>> during
>>>>>>>>>>>  first usage.
>>>>>>>>>>>   v3 -> v4:
>>>>>>>>>>>    * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 
>>>>>>>>>>> chars.
>>>>>>>>>>>   v4 -> v5:
>>>>>>>>>>>    * Do not change callbacks order in transport structures.
>>>>>>>>>>>
>>>>>>>>>>>   drivers/vhost/vsock.c   |  1 +
>>>>>>>>>>>   include/linux/virtio_vsock.h    |  1 +
>>>>>>>>>>>   net/vmw_vsock/virtio_transport.c    |  1 +
>>>>>>>>>>>   net/vmw_vsock/virtio_transport_common.c | 27 
>>>>>>>>>>> +
>>>>>>>>>>>   net/vmw_vsock/vsock_loopback.c  |  1 +
>>>>>>>>>>>   5 files changed, 31 insertions(+)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>>>>>>>>> index f75731396b7e..4146f80db8ac 100644
>>>>>>>>>>> --- a/drivers/vhost/vsock.c
>>>>>>>>>>> +++ b/drivers/vhost/vsock.c
>>>>>>>>>>> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport 
>>>>>>>>>>> = {
>>>>>>>>>>>   .notify_buffer_size   = 
>>>>>>>>>>> virtio_transport_notify_buffer_size,
>>>>>>>>>>>
>>>>>>>>>>>   .read_skb = virtio_transport_read_skb,
>>>>>>>>>>> +    .notify_set_rcvlowat  = 
>>>>>>>>>>> virtio_transport_notify_set_rcvlowat
>>>>>>>>>>>   },
>>>>>>>>>>>
>>>>>>>>>>>   .send_pkt = vhost_transport_send_pkt,
>>>>>>>>>>> diff --git a/include/linux/virtio_vsock.h 
>>>>>>>>>>> b/include/linux/virtio_vsock.h
>>>>>>>>>>> index ebb3ce63d64d..c82089dee0c8 100644
>>>>>>>>>>> --- a/inclu

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-01 Thread Arseniy Krasnov



On 01.12.2023 12:48, Stefano Garzarella wrote:
> On Fri, Dec 01, 2023 at 11:35:56AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 01.12.2023 11:27, Stefano Garzarella wrote:
>>> On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote:
>>>> On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote:
>>>>> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
>>>>> > On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>>>> > >
>>>>> > >
>>>>> > > On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>>>> > > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>>>> > > >> Send credit update message when SO_RCVLOWAT is updated and it is 
>>>>> > > >> bigger
>>>>> > > >> than number of bytes in rx queue. It is needed, because 'poll()' 
>>>>> > > >> will
>>>>> > > >> wait until number of bytes in rx queue will be not smaller than
>>>>> > > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual 
>>>>> > > >> hungup
>>>>> > > >> for tx/rx is possible: sender waits for free space and receiver is
>>>>> > > >> waiting data in 'poll()'.
>>>>> > > >>
>>>>> > > >> Signed-off-by: Arseniy Krasnov 
>>>>> > > >> ---
>>>>> > > >>  Changelog:
>>>>> > > >>  v1 -> v2:
>>>>> > > >>   * Update commit message by removing 'This patch adds XXX' manner.
>>>>> > > >>   * Do not initialize 'send_update' variable - set it directly 
>>>>> > > >>during
>>>>> > > >> first usage.
>>>>> > > >>  v3 -> v4:
>>>>> > > >>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 
>>>>> > > >>chars.
>>>>> > > >>  v4 -> v5:
>>>>> > > >>   * Do not change callbacks order in transport structures.
>>>>> > > >>
>>>>> > > >>  drivers/vhost/vsock.c   |  1 +
>>>>> > > >>  include/linux/virtio_vsock.h    |  1 +
>>>>> > > >>  net/vmw_vsock/virtio_transport.c    |  1 +
>>>>> > > >>  net/vmw_vsock/virtio_transport_common.c | 27 
>>>>> > > >>+
>>>>> > > >>  net/vmw_vsock/vsock_loopback.c  |  1 +
>>>>> > > >>  5 files changed, 31 insertions(+)
>>>>> > > >>
>>>>> > > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>>> > > >> index f75731396b7e..4146f80db8ac 100644
>>>>> > > >> --- a/drivers/vhost/vsock.c
>>>>> > > >> +++ b/drivers/vhost/vsock.c
>>>>> > > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport 
>>>>> > > >> = {
>>>>> > > >>  .notify_buffer_size   = 
>>>>> > > >>virtio_transport_notify_buffer_size,
>>>>> > > >>
>>>>> > > >>  .read_skb = virtio_transport_read_skb,
>>>>> > > >> +    .notify_set_rcvlowat  = 
>>>>> > > >> virtio_transport_notify_set_rcvlowat
>>>>> > > >>  },
>>>>> > > >>
>>>>> > > >>  .send_pkt = vhost_transport_send_pkt,
>>>>> > > >> diff --git a/include/linux/virtio_vsock.h 
>>>>> > > >> b/include/linux/virtio_vsock.h
>>>>> > > >> index ebb3ce63d64d..c82089dee0c8 100644
>>>>> > > >> --- a/include/linux/virtio_vsock.h
>>>>> > > >> +++ b/include/linux/virtio_vsock.h
>>>>> > > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>>>>> > > >> virtio_vsock_sock *vvs, u32 credit);
>>>>> > > >>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>>>> > > >>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head 
>>>>> > > >>*list

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-12-01 Thread Arseniy Krasnov



On 01.12.2023 11:27, Stefano Garzarella wrote:
> On Thu, Nov 30, 2023 at 12:40:43PM -0500, Michael S. Tsirkin wrote:
>> On Thu, Nov 30, 2023 at 03:11:19PM +0100, Stefano Garzarella wrote:
>>> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
>>> > On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>> > >
>>> > >
>>> > > On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>> > > > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>> > > >> Send credit update message when SO_RCVLOWAT is updated and it is 
>>> > > >> bigger
>>> > > >> than number of bytes in rx queue. It is needed, because 'poll()' will
>>> > > >> wait until number of bytes in rx queue will be not smaller than
>>> > > >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual 
>>> > > >> hungup
>>> > > >> for tx/rx is possible: sender waits for free space and receiver is
>>> > > >> waiting data in 'poll()'.
>>> > > >>
>>> > > >> Signed-off-by: Arseniy Krasnov 
>>> > > >> ---
>>> > > >>  Changelog:
>>> > > >>  v1 -> v2:
>>> > > >>   * Update commit message by removing 'This patch adds XXX' manner.
>>> > > >>   * Do not initialize 'send_update' variable - set it directly during
>>> > > >> first usage.
>>> > > >>  v3 -> v4:
>>> > > >>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 
>>> > > >>chars.
>>> > > >>  v4 -> v5:
>>> > > >>   * Do not change callbacks order in transport structures.
>>> > > >>
>>> > > >>  drivers/vhost/vsock.c   |  1 +
>>> > > >>  include/linux/virtio_vsock.h    |  1 +
>>> > > >>  net/vmw_vsock/virtio_transport.c    |  1 +
>>> > > >>  net/vmw_vsock/virtio_transport_common.c | 27 
>>> > > >>+
>>> > > >>  net/vmw_vsock/vsock_loopback.c  |  1 +
>>> > > >>  5 files changed, 31 insertions(+)
>>> > > >>
>>> > > >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> > > >> index f75731396b7e..4146f80db8ac 100644
>>> > > >> --- a/drivers/vhost/vsock.c
>>> > > >> +++ b/drivers/vhost/vsock.c
>>> > > >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = 
>>> > > >> {
>>> > > >>  .notify_buffer_size   = 
>>> > > >>virtio_transport_notify_buffer_size,
>>> > > >>
>>> > > >>  .read_skb = virtio_transport_read_skb,
>>> > > >> +    .notify_set_rcvlowat  = 
>>> > > >> virtio_transport_notify_set_rcvlowat
>>> > > >>  },
>>> > > >>
>>> > > >>  .send_pkt = vhost_transport_send_pkt,
>>> > > >> diff --git a/include/linux/virtio_vsock.h 
>>> > > >> b/include/linux/virtio_vsock.h
>>> > > >> index ebb3ce63d64d..c82089dee0c8 100644
>>> > > >> --- a/include/linux/virtio_vsock.h
>>> > > >> +++ b/include/linux/virtio_vsock.h
>>> > > >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>>> > > >> virtio_vsock_sock *vvs, u32 credit);
>>> > > >>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>> > > >>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head 
>>> > > >>*list);
>>> > > >>  int virtio_transport_read_skb(struct vsock_sock *vsk, 
>>> > > >>skb_read_actor_t read_actor);
>>> > > >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, 
>>> > > >> int val);
>>> > > >>  #endif /* _LINUX_VIRTIO_VSOCK_H */
>>> > > >> diff --git a/net/vmw_vsock/virtio_transport.c 
>>> > > >> b/net/vmw_vsock/virtio_transport.c
>>> > > >> index af5bab1acee1..8007593a3a93 100644
>>> > > >> --- a/net/vmw_vsock/virtio_transport.c
>>> > > >> +++ b/net/vmw_vsock/virtio_transport.c
&

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov



On 30.11.2023 20:37, Michael S. Tsirkin wrote:
> On Thu, Nov 30, 2023 at 06:41:56PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 30.11.2023 17:11, Stefano Garzarella wrote:
>>> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
>>>> On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>>>>
>>>>>
>>>>> On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>>>>> On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>>>>>> Send credit update message when SO_RCVLOWAT is updated and it is bigger
>>>>>>> than number of bytes in rx queue. It is needed, because 'poll()' will
>>>>>>> wait until number of bytes in rx queue will be not smaller than
>>>>>>> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
>>>>>>> for tx/rx is possible: sender waits for free space and receiver is
>>>>>>> waiting data in 'poll()'.
>>>>>>>
>>>>>>> Signed-off-by: Arseniy Krasnov 
>>>>>>> ---
>>>>>>>   Changelog:
>>>>>>>   v1 -> v2:
>>>>>>>    * Update commit message by removing 'This patch adds XXX' manner.
>>>>>>>    * Do not initialize 'send_update' variable - set it directly during
>>>>>>>  first usage.
>>>>>>>   v3 -> v4:
>>>>>>>    * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 
>>>>>>> chars.
>>>>>>>   v4 -> v5:
>>>>>>>    * Do not change callbacks order in transport structures.
>>>>>>>
>>>>>>>   drivers/vhost/vsock.c   |  1 +
>>>>>>>   include/linux/virtio_vsock.h    |  1 +
>>>>>>>   net/vmw_vsock/virtio_transport.c    |  1 +
>>>>>>>   net/vmw_vsock/virtio_transport_common.c | 27 +
>>>>>>>   net/vmw_vsock/vsock_loopback.c  |  1 +
>>>>>>>   5 files changed, 31 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>>>>> index f75731396b7e..4146f80db8ac 100644
>>>>>>> --- a/drivers/vhost/vsock.c
>>>>>>> +++ b/drivers/vhost/vsock.c
>>>>>>> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
>>>>>>>   .notify_buffer_size   = 
>>>>>>> virtio_transport_notify_buffer_size,
>>>>>>>
>>>>>>>   .read_skb = virtio_transport_read_skb,
>>>>>>> +    .notify_set_rcvlowat  = 
>>>>>>> virtio_transport_notify_set_rcvlowat
>>>>>>>   },
>>>>>>>
>>>>>>>   .send_pkt = vhost_transport_send_pkt,
>>>>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>>>>> index ebb3ce63d64d..c82089dee0c8 100644
>>>>>>> --- a/include/linux/virtio_vsock.h
>>>>>>> +++ b/include/linux/virtio_vsock.h
>>>>>>> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>>>>>>> virtio_vsock_sock *vvs, u32 credit);
>>>>>>>   void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>>>>>>   int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>>>>>>>   int virtio_transport_read_skb(struct vsock_sock *vsk, 
>>>>>>> skb_read_actor_t read_actor);
>>>>>>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int 
>>>>>>> val);
>>>>>>>   #endif /* _LINUX_VIRTIO_VSOCK_H */
>>>>>>> diff --git a/net/vmw_vsock/virtio_transport.c 
>>>>>>> b/net/vmw_vsock/virtio_transport.c
>>>>>>> index af5bab1acee1..8007593a3a93 100644
>>>>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>>>>> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
>>>>>>>   .notify_buffer_size   = 
>>>>>>> virtio_transport_notify_buffer_size,
>>>>>>>
>>>>>>>   .read_skb = virtio_transpor

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov



On 30.11.2023 17:11, Stefano Garzarella wrote:
> On Thu, Nov 30, 2023 at 08:58:58AM -0500, Michael S. Tsirkin wrote:
>> On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>>
>>>
>>> On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>> > On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>> >> Send credit update message when SO_RCVLOWAT is updated and it is bigger
>>> >> than number of bytes in rx queue. It is needed, because 'poll()' will
>>> >> wait until number of bytes in rx queue will be not smaller than
>>> >> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
>>> >> for tx/rx is possible: sender waits for free space and receiver is
>>> >> waiting data in 'poll()'.
>>> >>
>>> >> Signed-off-by: Arseniy Krasnov 
>>> >> ---
>>> >>  Changelog:
>>> >>  v1 -> v2:
>>> >>   * Update commit message by removing 'This patch adds XXX' manner.
>>> >>   * Do not initialize 'send_update' variable - set it directly during
>>> >> first usage.
>>> >>  v3 -> v4:
>>> >>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
>>> >>  v4 -> v5:
>>> >>   * Do not change callbacks order in transport structures.
>>> >>
>>> >>  drivers/vhost/vsock.c   |  1 +
>>> >>  include/linux/virtio_vsock.h    |  1 +
>>> >>  net/vmw_vsock/virtio_transport.c    |  1 +
>>> >>  net/vmw_vsock/virtio_transport_common.c | 27 +
>>> >>  net/vmw_vsock/vsock_loopback.c  |  1 +
>>> >>  5 files changed, 31 insertions(+)
>>> >>
>>> >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> >> index f75731396b7e..4146f80db8ac 100644
>>> >> --- a/drivers/vhost/vsock.c
>>> >> +++ b/drivers/vhost/vsock.c
>>> >> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
>>> >>  .notify_buffer_size   = virtio_transport_notify_buffer_size,
>>> >>
>>> >>  .read_skb = virtio_transport_read_skb,
>>> >> +    .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
>>> >>  },
>>> >>
>>> >>  .send_pkt = vhost_transport_send_pkt,
>>> >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>> >> index ebb3ce63d64d..c82089dee0c8 100644
>>> >> --- a/include/linux/virtio_vsock.h
>>> >> +++ b/include/linux/virtio_vsock.h
>>> >> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>>> >> virtio_vsock_sock *vvs, u32 credit);
>>> >>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>> >>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>>> >>  int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
>>> >>read_actor);
>>> >> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int 
>>> >> val);
>>> >>  #endif /* _LINUX_VIRTIO_VSOCK_H */
>>> >> diff --git a/net/vmw_vsock/virtio_transport.c 
>>> >> b/net/vmw_vsock/virtio_transport.c
>>> >> index af5bab1acee1..8007593a3a93 100644
>>> >> --- a/net/vmw_vsock/virtio_transport.c
>>> >> +++ b/net/vmw_vsock/virtio_transport.c
>>> >> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
>>> >>  .notify_buffer_size   = virtio_transport_notify_buffer_size,
>>> >>
>>> >>  .read_skb = virtio_transport_read_skb,
>>> >> +    .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
>>> >>  },
>>> >>
>>> >>  .send_pkt = virtio_transport_send_pkt,
>>> >> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>> >> b/net/vmw_vsock/virtio_transport_common.c
>>> >> index f6dc896bf44c..1cb556ad4597 100644
>>> >> --- a/net/vmw_vsock/virtio_transport_common.c
>>> >> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> >> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock 
>>> >> *vsk, skb_read_actor_t recv_acto
>>> >>  }
>>> >>  EXPORT_SYMBOL_GPL(virtio_transport_rea

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov



On 30.11.2023 16:58, Michael S. Tsirkin wrote:
> On Thu, Nov 30, 2023 at 04:43:34PM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 30.11.2023 16:42, Michael S. Tsirkin wrote:
>>> On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>>>> Send credit update message when SO_RCVLOWAT is updated and it is bigger
>>>> than number of bytes in rx queue. It is needed, because 'poll()' will
>>>> wait until number of bytes in rx queue will be not smaller than
>>>> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
>>>> for tx/rx is possible: sender waits for free space and receiver is
>>>> waiting data in 'poll()'.
>>>>
>>>> Signed-off-by: Arseniy Krasnov 
>>>> ---
>>>>  Changelog:
>>>>  v1 -> v2:
>>>>   * Update commit message by removing 'This patch adds XXX' manner.
>>>>   * Do not initialize 'send_update' variable - set it directly during
>>>> first usage.
>>>>  v3 -> v4:
>>>>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
>>>>  v4 -> v5:
>>>>   * Do not change callbacks order in transport structures.
>>>>
>>>>  drivers/vhost/vsock.c   |  1 +
>>>>  include/linux/virtio_vsock.h|  1 +
>>>>  net/vmw_vsock/virtio_transport.c|  1 +
>>>>  net/vmw_vsock/virtio_transport_common.c | 27 +
>>>>  net/vmw_vsock/vsock_loopback.c  |  1 +
>>>>  5 files changed, 31 insertions(+)
>>>>
>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>> index f75731396b7e..4146f80db8ac 100644
>>>> --- a/drivers/vhost/vsock.c
>>>> +++ b/drivers/vhost/vsock.c
>>>> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
>>>>.notify_buffer_size   = virtio_transport_notify_buffer_size,
>>>>  
>>>>.read_skb = virtio_transport_read_skb,
>>>> +  .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
>>>>},
>>>>  
>>>>.send_pkt = vhost_transport_send_pkt,
>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>> index ebb3ce63d64d..c82089dee0c8 100644
>>>> --- a/include/linux/virtio_vsock.h
>>>> +++ b/include/linux/virtio_vsock.h
>>>> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>>>> virtio_vsock_sock *vvs, u32 credit);
>>>>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>>>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>>>>  int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
>>>> read_actor);
>>>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
>>>>  #endif /* _LINUX_VIRTIO_VSOCK_H */
>>>> diff --git a/net/vmw_vsock/virtio_transport.c 
>>>> b/net/vmw_vsock/virtio_transport.c
>>>> index af5bab1acee1..8007593a3a93 100644
>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
>>>>.notify_buffer_size   = virtio_transport_notify_buffer_size,
>>>>  
>>>>.read_skb = virtio_transport_read_skb,
>>>> +  .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
>>>>},
>>>>  
>>>>.send_pkt = virtio_transport_send_pkt,
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>>>> b/net/vmw_vsock/virtio_transport_common.c
>>>> index f6dc896bf44c..1cb556ad4597 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock 
>>>> *vsk, skb_read_actor_t recv_acto
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>>>>  
>>>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>>>> +{
>>>> +  struct virtio_vsock_sock *vvs = vsk->trans;
>>>> +  bool send_update;
>>>> +
>>>> +  spin_lock_bh(>rx_lock);
>>>> +
>>>> +  /* If number of available bytes is less than new SO_RCVLOWAT value,
>>>> +   * kick sender to se

Re: [PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov



On 30.11.2023 16:42, Michael S. Tsirkin wrote:
> On Thu, Nov 30, 2023 at 04:08:39PM +0300, Arseniy Krasnov wrote:
>> Send credit update message when SO_RCVLOWAT is updated and it is bigger
>> than number of bytes in rx queue. It is needed, because 'poll()' will
>> wait until number of bytes in rx queue will be not smaller than
>> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
>> for tx/rx is possible: sender waits for free space and receiver is
>> waiting data in 'poll()'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> ---
>>  Changelog:
>>  v1 -> v2:
>>   * Update commit message by removing 'This patch adds XXX' manner.
>>   * Do not initialize 'send_update' variable - set it directly during
>> first usage.
>>  v3 -> v4:
>>   * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
>>  v4 -> v5:
>>   * Do not change callbacks order in transport structures.
>>
>>  drivers/vhost/vsock.c   |  1 +
>>  include/linux/virtio_vsock.h|  1 +
>>  net/vmw_vsock/virtio_transport.c|  1 +
>>  net/vmw_vsock/virtio_transport_common.c | 27 +
>>  net/vmw_vsock/vsock_loopback.c  |  1 +
>>  5 files changed, 31 insertions(+)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index f75731396b7e..4146f80db8ac 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
>>  .notify_buffer_size   = virtio_transport_notify_buffer_size,
>>  
>>  .read_skb = virtio_transport_read_skb,
>> +.notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
>>  },
>>  
>>  .send_pkt = vhost_transport_send_pkt,
>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> index ebb3ce63d64d..c82089dee0c8 100644
>> --- a/include/linux/virtio_vsock.h
>> +++ b/include/linux/virtio_vsock.h
>> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>> virtio_vsock_sock *vvs, u32 credit);
>>  void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>>  int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>>  int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
>> read_actor);
>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
>>  #endif /* _LINUX_VIRTIO_VSOCK_H */
>> diff --git a/net/vmw_vsock/virtio_transport.c 
>> b/net/vmw_vsock/virtio_transport.c
>> index af5bab1acee1..8007593a3a93 100644
>> --- a/net/vmw_vsock/virtio_transport.c
>> +++ b/net/vmw_vsock/virtio_transport.c
>> @@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
>>  .notify_buffer_size   = virtio_transport_notify_buffer_size,
>>  
>>  .read_skb = virtio_transport_read_skb,
>> +.notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
>>  },
>>  
>>  .send_pkt = virtio_transport_send_pkt,
>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>> b/net/vmw_vsock/virtio_transport_common.c
>> index f6dc896bf44c..1cb556ad4597 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
>> skb_read_actor_t recv_acto
>>  }
>>  EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>>  
>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +{
>> +struct virtio_vsock_sock *vvs = vsk->trans;
>> +bool send_update;
>> +
>> +spin_lock_bh(>rx_lock);
>> +
>> +/* If number of available bytes is less than new SO_RCVLOWAT value,
>> + * kick sender to send more data, because sender may sleep in its
>> + * 'send()' syscall waiting for enough space at our side.
>> + */
>> +send_update = vvs->rx_bytes < val;
>> +
>> +spin_unlock_bh(>rx_lock);
>> +
>> +if (send_update) {
>> +int err;
>> +
>> +err = virtio_transport_send_credit_update(vsk);
>> +if (err < 0)
>> +return err;
>> +}
>> +
>> +return 0;
>> +}
> 
> 
> I find it strange that this will send a credit update
> even if nothing changed since this was called previously.
> I'm not sure whether this is a problem protocol-wise,
> but it certainly was not envisioned when 

[PATCH net-next v5 0/3] send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes an old problem with a hangup of both the rx and tx sides
and adds a test for it. The hangup happens due to a non-default SO_RCVLOWAT
value combined with deferred credit updates in virtio/vsock. Link to the previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) the receiver also goes to sleep, waiting for enough data or a
connection shutdown message from the sender. The idea of the fix is that the
rx side kicks the tx side to continue transmission (and maybe close the
connection) when the rx side changes the number of bytes required to wake it
up (e.g. SO_RCVLOWAT) and this value is bigger than the number of bytes
available to read.

I've added a small test for this, but I'm not sure about it, as it uses a
hardcoded value for the maximum packet length; this value is defined in a
kernel header and is used to control deferred credit updates. Since it is not
available to userspace, I can't control the test parameters correctly (if one
day this define is changed, the test may become useless).

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f1be1e04c76bb9c44789d3575bba4418cf0ea359

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/
Link to v4:
https://lore.kernel.org/netdev/20231129212519.2938875-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
v4 -> v5:
 * Change patchset tag 'RFC' -> 'net-next'.
 * See per-patch changelog after ---.

Arseniy Krasnov (3):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: SO_RCVLOWAT + deferred credit update test

 drivers/vhost/vsock.c   |   1 +
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   1 +
 net/vmw_vsock/virtio_transport_common.c |  27 +
 net/vmw_vsock/vsock_loopback.c  |   1 +
 tools/testing/vsock/vsock_test.c| 142 
 9 files changed, 183 insertions(+), 5 deletions(-)

-- 
2.25.1




[PATCH net-next v5 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov
Send a credit update message when SO_RCVLOWAT is updated and it is bigger
than the number of bytes in the rx queue. This is needed because 'poll()'
will wait until the number of bytes in the rx queue is not smaller than
SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual hangup
of tx/rx is possible: the sender waits for free space and the receiver is
waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
 v4 -> v5:
  * Do not change callbacks order in transport structures.

 drivers/vhost/vsock.c   |  1 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 27 +
 net/vmw_vsock/vsock_loopback.c  |  1 +
 5 files changed, 31 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..4146f80db8ac 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,7 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+   .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
},
 
.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..8007593a3a93 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,7 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+   .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
},
 
.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..1cb556ad4597 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side.
+*/
+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..9f4b814fbbc7 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,7 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+   .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat
},
 
.send_pkt = vsock_loopback_send_pkt,
-- 
2.25.1




[PATCH net-next v5 1/3] vsock: update SO_RCVLOWAT setting callback

2023-11-30 Thread Arseniy Krasnov
Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'. Also, if 'sk_rcvlowat'
is now set only in af_vsock.c, change callback name from 'set_rcvlowat'
to 'notify_set_rcvlowat'.

Signed-off-by: Arseniy Krasnov 
Reviewed-by: Stefano Garzarella 
---
 Changelog:
 v3 -> v4:
  * Rename 'set_rcvlowat' to 'notify_set_rcvlowat'.
  * Commit message updated.

 include/net/af_vsock.h   | 2 +-
 net/vmw_vsock/af_vsock.c | 9 +++--
 net/vmw_vsock/hyperv_transport.c | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index e302c0e804d0..535701efc1e5 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -137,7 +137,6 @@ struct vsock_transport {
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);
-   int (*set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
@@ -168,6 +167,7 @@ struct vsock_transport {
struct vsock_transport_send_notify_data *);
/* sk_lock held by the caller */
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
+   int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
 
/* Shutdown. */
int (*shutdown)(struct vsock_sock *, int);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..54ba7316f808 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->notify_set_rcvlowat) {
+   int err;
+
+   err = transport->notify_set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7cb1a9d2cdb4..e2157e387217 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -816,7 +816,7 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, 
ssize_t written,
 }
 
 static
-int hvs_set_rcvlowat(struct vsock_sock *vsk, int val)
+int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
 {
return -EOPNOTSUPP;
 }
@@ -856,7 +856,7 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-   .set_rcvlowat = hvs_set_rcvlowat
+   .notify_set_rcvlowat  = hvs_notify_set_rcvlowat
 };
 
 static bool hvs_check_transport(struct vsock_sock *vsk)
-- 
2.25.1




Re: [RFC PATCH v4 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-30 Thread Arseniy Krasnov



On 30.11.2023 11:38, Stefano Garzarella wrote:
> On Thu, Nov 30, 2023 at 12:25:18AM +0300, Arseniy Krasnov wrote:
>> Send credit update message when SO_RCVLOWAT is updated and it is bigger
>> than number of bytes in rx queue. It is needed, because 'poll()' will
>> wait until number of bytes in rx queue will be not smaller than
>> SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
>> for tx/rx is possible: sender waits for free space and receiver is
>> waiting data in 'poll()'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> ---
>> Changelog:
>> v1 -> v2:
>>  * Update commit message by removing 'This patch adds XXX' manner.
>>  * Do not initialize 'send_update' variable - set it directly during
>>    first usage.
>> v3 -> v4:
>>  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.
>>
>> drivers/vhost/vsock.c   |  3 ++-
>> include/linux/virtio_vsock.h    |  1 +
>> net/vmw_vsock/virtio_transport.c    |  3 ++-
>> net/vmw_vsock/virtio_transport_common.c | 27 +
>> net/vmw_vsock/vsock_loopback.c  |  3 ++-
>> 5 files changed, 34 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index f75731396b7e..c5e58a60a546 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -449,8 +449,9 @@ static struct virtio_transport vhost_transport = {
>>     .notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
>>     .notify_send_post_enqueue = 
>> virtio_transport_notify_send_post_enqueue,
>>     .notify_buffer_size   = virtio_transport_notify_buffer_size,
>> +    .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat,
>>
>> -    .read_skb = virtio_transport_read_skb,
>> +    .read_skb = virtio_transport_read_skb
> 
> I think it is better to avoid this change, so when we will need to add
> new callbacks, we don't need to edit this line again.
> 
> Please avoid it also in the other place in this patch.
> 
> The rest LGTM.

Yes, I see, I thought about that, but chose beauty instead of pragmatism :)
Ok, I'll fix it:)

Thanks, Arseniy

> 
> Thanks,
> Stefano
> 
>> },
>>
>> .send_pkt = vhost_transport_send_pkt,
>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> index ebb3ce63d64d..c82089dee0c8 100644
>> --- a/include/linux/virtio_vsock.h
>> +++ b/include/linux/virtio_vsock.h
>> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>> virtio_vsock_sock *vvs, u32 credit);
>> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>> int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
>> read_actor);
>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
>> #endif /* _LINUX_VIRTIO_VSOCK_H */
>> diff --git a/net/vmw_vsock/virtio_transport.c 
>> b/net/vmw_vsock/virtio_transport.c
>> index af5bab1acee1..8b7bb7ca8ea5 100644
>> --- a/net/vmw_vsock/virtio_transport.c
>> +++ b/net/vmw_vsock/virtio_transport.c
>> @@ -537,8 +537,9 @@ static struct virtio_transport virtio_transport = {
>>     .notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
>>     .notify_send_post_enqueue = 
>> virtio_transport_notify_send_post_enqueue,
>>     .notify_buffer_size   = virtio_transport_notify_buffer_size,
>> +    .notify_set_rcvlowat  = virtio_transport_notify_set_rcvlowat,
>>
>> -    .read_skb = virtio_transport_read_skb,
>> +    .read_skb = virtio_transport_read_skb
>> },
>>
>> .send_pkt = virtio_transport_send_pkt,
>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>> b/net/vmw_vsock/virtio_transport_common.c
>> index f6dc896bf44c..1cb556ad4597 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
>> skb_read_actor_t recv_acto
>> }
>> EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>>
>> +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +{
>> +    struct virtio_vsock_sock *vvs = vsk->trans;
>> +    bool send_update;
>> +
>> +    spin_lock_bh(&vvs->rx_lock);
>> +
>> +    /* If number of available bytes is less than new SO_RCVLOWAT value,
>> + * kick sender to send more data, because sender may sleep 

[RFC PATCH v4 0/3] send credit update during setting SO_RCVLOWAT

2023-11-29 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f1be1e04c76bb9c44789d3575bba4418cf0ea359

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/
Link to v3:
https://lore.kernel.org/netdev/20231122180510.2297075-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).
v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.

Arseniy Krasnov (3):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: SO_RCVLOWAT + deferred credit update test

 drivers/vhost/vsock.c   |   3 +-
 include/linux/virtio_vsock.h|   1 +
 include/net/af_vsock.h  |   2 +-
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/hyperv_transport.c|   4 +-
 net/vmw_vsock/virtio_transport.c|   3 +-
 net/vmw_vsock/virtio_transport_common.c |  27 +
 net/vmw_vsock/vsock_loopback.c  |   3 +-
 tools/testing/vsock/vsock_test.c| 149 
 9 files changed, 193 insertions(+), 8 deletions(-)

-- 
2.25.1




[RFC PATCH v4 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-29 Thread Arseniy Krasnov
Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.
 v3 -> v4:
  * Fit comment in 'virtio_transport_notify_set_rcvlowat()' to 80 chars.

 drivers/vhost/vsock.c   |  3 ++-
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  3 ++-
 net/vmw_vsock/virtio_transport_common.c | 27 +
 net/vmw_vsock/vsock_loopback.c  |  3 ++-
 5 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..c5e58a60a546 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -449,8 +449,9 @@ static struct virtio_transport vhost_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
-   .read_skb = virtio_transport_read_skb,
+   .read_skb = virtio_transport_read_skb
},
 
.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..c82089dee0c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..8b7bb7ca8ea5 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -537,8 +537,9 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set_rcvlowat  = 
virtio_transport_notify_set_rcvlowat,
 
-   .read_skb = virtio_transport_read_skb,
+   .read_skb = virtio_transport_read_skb
},
 
.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..1cb556ad4597 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,33 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new SO_RCVLOWAT value,
+* kick sender to send more data, because sender may sleep in its
+* 'send()' syscall waiting for enough space at our side.
+*/
+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..454f69838c2a 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -96,8 +96,9 @@ static struct virtio_transport loopback_transport = {
.notify_send_pre_enqueue  = 
virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = 
virtio_transport_notify_send_post_enqueue,
.notify_buffer_size   = virtio_transport_notify_buffer_size,
+   .notify_set

[RFC PATCH v3 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-22 Thread Arseniy Krasnov
Test which checks, that updating SO_RCVLOWAT value also sends credit
update message. Otherwise mutual hungup may happen when receiver didn't
send credit update and then calls 'poll()' with non default SO_RCVLOWAT
value (e.g. waiting enough bytes to read), while sender waits for free
space at receiver's side. Important thing is that this test relies on
kernel's define for maximum packet size for virtio transport and this
value is not exported to user: VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this
define is used to control moment when to send credit update message).
If this value or its usage will be changed in kernel - this test may
become useless/broken.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.
 v2 -> v3:
  * Replace synchronization based on control TCP socket with vsock
data socket - this is needed to allow sender transmit data only
when new buffer size of receiver is visible to sender. Otherwise
there is race and test fails sometimes.

 tools/testing/vsock/vsock_test.c | 142 +++
 1 file changed, 142 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 5b0e93f9996c..773a71260fba 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1225,6 +1225,143 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   recv_byte(fd, 1, 0);
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  &buf_size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send one dummy byte here, because 'setsockopt()' above also
+* sends special packet which tells sender to update our buffer
+* size. This 'send_byte()' will serialize such packet with data
+* reads in a loop below. Sender starts transmission only when
+* it receives this single byte.
+*/
+   send_byte(fd, 1, 0);
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   recv_buf_size++;
+
+   /* Updating SO_RCVLOWAT will send credit update. */
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  &recv_buf_size, sizeof(recv_buf_size))

[RFC PATCH v3 1/3] vsock: update SO_RCVLOWAT setting callback

2023-11-22 Thread Arseniy Krasnov
Do not return if transport callback for SO_RCVLOWAT is set (only in
error case). In this case we don't need to set 'sk_rcvlowat' field in
each transport - only in 'vsock_set_rcvlowat()'.

Signed-off-by: Arseniy Krasnov 
---
 net/vmw_vsock/af_vsock.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..af0058037f72 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->set_rcvlowat) {
+   int err;
+
+   err = transport->set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
-- 
2.25.1




[RFC PATCH v3 0/3] send credit update during setting SO_RCVLOWAT

2023-11-22 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes old problem with hungup of both rx/tx sides and adds
test for it. This happens due to non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=18de1e517ed37ebaf33e771e46faf052e966e163

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/
Link to v2:
https://lore.kernel.org/netdev/20231119204922.2251912-1-avkras...@salutedevices.com/

Changelog:
v1 -> v2:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.
v2 -> v3:
 * See changelog after --- in 0003 only (0001 and 0002 still same).

Arseniy Krasnov (3):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: SO_RCVLOWAT + deferred credit update test

 drivers/vhost/vsock.c   |   2 +
 include/linux/virtio_vsock.h|   1 +
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/virtio_transport.c|   2 +
 net/vmw_vsock/virtio_transport_common.c |  28 +
 net/vmw_vsock/vsock_loopback.c  |   2 +
 tools/testing/vsock/vsock_test.c| 142 
 7 files changed, 184 insertions(+), 2 deletions(-)

-- 
2.25.1




[RFC PATCH v3 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-22 Thread Arseniy Krasnov
Send credit update message when SO_RCVLOWAT is updated and it is bigger
than number of bytes in rx queue. It is needed, because 'poll()' will
wait until number of bytes in rx queue will be not smaller than
SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup
for tx/rx is possible: sender waits for free space and receiver is
waiting data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.

 drivers/vhost/vsock.c   |  2 ++
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  2 ++
 net/vmw_vsock/virtio_transport_common.c | 28 +
 net/vmw_vsock/vsock_loopback.c  |  2 ++
 5 files changed, 35 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ecfa5c11f5ee 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..97dc1bebc69c 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..cf3431189d0c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..4acee21b4350 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,34 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   /* If number of available bytes is less than new
+* SO_RCVLOWAT value, kick sender to send more
+* data, because sender may sleep in its 'send()'
+* syscall waiting for enough space at our side.
+*/
+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..388c157f6633 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = vsock_loopback_send_pkt,
-- 
2.25.1




[PATCH net v1] vsock/test: fix SEQPACKET message bounds test

2023-11-21 Thread Arseniy Krasnov
Tune message length calculation to make this test work on machines
where 'getpagesize()' returns >32KB. Now maximum message length is not
hardcoded (on machines above it was smaller than 'getpagesize()' return
value, thus we get negative value and test fails), but calculated at
runtime and always bigger than 'getpagesize()' result. Reproduced on
aarch64 with 64KB page size.

Fixes: 5c338112e48a ("test/vsock: rework message bounds test")
Signed-off-by: Arseniy Krasnov 
---
 tools/testing/vsock/vsock_test.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index f5623b8d76b7..691e44c746bf 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -353,11 +353,12 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
 }
 
 #define SOCK_BUF_SIZE (2 * 1024 * 1024)
-#define MAX_MSG_SIZE (32 * 1024)
+#define MAX_MSG_PAGES 4
 
 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 {
unsigned long curr_hash;
+   size_t max_msg_size;
int page_size;
int msg_count;
int fd;
@@ -373,7 +374,8 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)
 
curr_hash = 0;
page_size = getpagesize();
-   msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE;
+   max_msg_size = MAX_MSG_PAGES * page_size;
+   msg_count = SOCK_BUF_SIZE / max_msg_size;
 
for (int i = 0; i < msg_count; i++) {
size_t buf_size;
@@ -383,7 +385,7 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)
/* Use "small" buffers and "big" buffers. */
if (i & 1)
buf_size = page_size +
-   (rand() % (MAX_MSG_SIZE - page_size));
+   (rand() % (max_msg_size - page_size));
else
buf_size = 1 + (rand() % page_size);
 
@@ -429,7 +431,6 @@ static void test_seqpacket_msg_bounds_server(const struct 
test_opts *opts)
unsigned long remote_hash;
unsigned long curr_hash;
int fd;
-   char buf[MAX_MSG_SIZE];
struct msghdr msg = {0};
struct iovec iov = {0};
 
@@ -457,8 +458,13 @@ static void test_seqpacket_msg_bounds_server(const struct 
test_opts *opts)
control_writeln("SRVREADY");
/* Wait, until peer sends whole data. */
control_expectln("SENDDONE");
-   iov.iov_base = buf;
-   iov.iov_len = sizeof(buf);
+   iov.iov_len = MAX_MSG_PAGES * getpagesize();
+   iov.iov_base = malloc(iov.iov_len);
+   if (!iov.iov_base) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
 
@@ -483,6 +489,7 @@ static void test_seqpacket_msg_bounds_server(const struct 
test_opts *opts)
curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size);
}
 
+   free(iov.iov_base);
close(fd);
remote_hash = control_readulong();
 
-- 
2.25.1




[RFC PATCH v2 2/3] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-19 Thread Arseniy Krasnov
Send a credit update message when SO_RCVLOWAT is updated and it is bigger
than the number of bytes in the rx queue. It is needed because 'poll()'
will wait until the number of bytes in the rx queue is not smaller than
SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual
hang-up of tx/rx is possible: the sender waits for free space and the
receiver is waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Do not initialize 'send_update' variable - set it directly during
first usage.

 drivers/vhost/vsock.c   |  2 ++
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  2 ++
 net/vmw_vsock/virtio_transport_common.c | 28 +
 net/vmw_vsock/vsock_loopback.c  |  2 ++
 5 files changed, 35 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ecfa5c11f5ee 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..97dc1bebc69c 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..cf3431189d0c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index f6dc896bf44c..4acee21b4350 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1684,6 +1684,34 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update;
+
+   spin_lock_bh(>rx_lock);
+
+   /* If number of available bytes is less than new
+* SO_RCVLOWAT value, kick sender to send more
+* data, because sender may sleep in its 'send()'
+* syscall waiting for enough space at our side.
+*/
+   send_update = vvs->rx_bytes < val;
+
+   spin_unlock_bh(>rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..388c157f6633 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = vsock_loopback_send_pkt,
-- 
2.25.1




[RFC PATCH v2 1/3] vsock: update SO_RCVLOWAT setting callback

2023-11-19 Thread Arseniy Krasnov
Do not return if the transport callback for SO_RCVLOWAT is set (return only
in the error case). With this change we don't need to set the 'sk_rcvlowat'
field in each transport - only in 'vsock_set_rcvlowat()'.

Signed-off-by: Arseniy Krasnov 
---
 net/vmw_vsock/af_vsock.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 816725af281f..af0058037f72 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -2264,8 +2264,13 @@ static int vsock_set_rcvlowat(struct sock *sk, int val)
 
transport = vsk->transport;
 
-   if (transport && transport->set_rcvlowat)
-   return transport->set_rcvlowat(vsk, val);
+   if (transport && transport->set_rcvlowat) {
+   int err;
+
+   err = transport->set_rcvlowat(vsk, val);
+   if (err)
+   return err;
+   }
 
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
return 0;
-- 
2.25.1




[RFC PATCH v2 0/3] send credit update during setting SO_RCVLOWAT

2023-11-19 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes an old problem with a hang-up of both rx/tx sides and
adds a test for it. This happens due to a non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to the previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) the receiver also goes to sleep, waiting for enough data or a
connection shutdown message from the sender. The idea of the fix is that rx
kicks the tx side to continue transmission (and maybe close the connection)
when rx changes the number of bytes to be woken up at (e.g. SO_RCVLOWAT) and
this value is bigger than the number of bytes available to read.

I've added a small test for this, but I'm not sure about it, as it uses a
hardcoded value for the maximum packet length; this value is defined in a
kernel header and used to control the deferred credit update. And as this is
not available to userspace, I can't control the test parameters correctly
(if one day this define is changed - the test may become useless).

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=18de1e517ed37ebaf33e771e46faf052e966e163

Link to v1:
https://lore.kernel.org/netdev/20231108072004.1045669-1-avkras...@salutedevices.com/

Changelog:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * New patch is added as 0001 - it removes return from SO_RCVLOWAT set
   callback in 'af_vsock.c' when transport callback is set - with that
   we can set 'sk_rcvlowat' only once in 'af_vsock.c' and in future do
   not copy-paste it to every transport. It was discussed in v1.
 * See per-patch changelog after ---.

Arseniy Krasnov (3):
  vsock: update SO_RCVLOWAT setting callback
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: SO_RCVLOWAT + deferred credit update test

 drivers/vhost/vsock.c   |   2 +
 include/linux/virtio_vsock.h|   1 +
 net/vmw_vsock/af_vsock.c|   9 +-
 net/vmw_vsock/virtio_transport.c|   2 +
 net/vmw_vsock/virtio_transport_common.c |  28 +
 net/vmw_vsock/vsock_loopback.c  |   2 +
 tools/testing/vsock/vsock_test.c| 136 
 7 files changed, 178 insertions(+), 2 deletions(-)

-- 
2.25.1




[RFC PATCH v2 3/3] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-19 Thread Arseniy Krasnov
Test which checks that updating the SO_RCVLOWAT value also sends a credit
update message. Otherwise a mutual hang-up may happen when the receiver
didn't send a credit update and then calls 'poll()' with a non-default
SO_RCVLOWAT value (e.g. waiting for enough bytes to read), while the sender
waits for free space at the receiver's side. An important thing is that this
test relies on the kernel's define for the maximum packet size for the
virtio transport, and this value is not exported to user space:
VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (this define is used to control the moment
when to send a credit update message). If this value or its usage is changed
in the kernel - this test may become useless/broken.

Signed-off-by: Arseniy Krasnov 
---
 Changelog:
 v1 -> v2:
  * Update commit message by removing 'This patch adds XXX' manner.
  * Update commit message by adding details about dependency for this
test from kernel internal define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
  * Add comment for this dependency in 'vsock_test.c' where this define
is duplicated.

 tools/testing/vsock/vsock_test.c | 136 +++
 1 file changed, 136 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 5b0e93f9996c..f5623b8d76b7 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1225,6 +1225,137 @@ static void test_double_bind_connect_client(const 
struct test_opts *opts)
}
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+/* This define is the same as in 'include/linux/virtio_vsock.h':
+ * it is used to decide when to send credit update message during
+ * reading from rx queue of a socket. Value and its usage in
+ * kernel is important for this test.
+ */
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   control_expectln("SRVREADY");
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln("SRVREADY");
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   recv_buf_size++;
+
+   /* Updating SO_RCVLOWAT will send credit update. */
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+
+   memset(, 0, sizeof(fds));
+   fds.fd = fd;
+   fds.events = POLLIN | POLLRDNORM | POLLERR |
+POLLRDHUP | POLLHUP;
+
+   /* This 'poll()' will return once we receive last byte
+* sent by client.
+*/
+   if (poll(, 1, -1) < 0) {
+   perror("poll");
+   exit(EXIT_FAILURE);
+   }
+
+   if (fds.revents & POLLERR) {
+   fprintf(stderr, "'poll(

Re: [RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-17 Thread Arseniy Krasnov



On 17.11.2023 11:30, Stefano Garzarella wrote:
> On Fri, Nov 17, 2023 at 10:12:38AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 15.11.2023 14:11, Stefano Garzarella wrote:
>>> On Wed, Nov 08, 2023 at 10:20:04AM +0300, Arseniy Krasnov wrote:
>>>> This adds test which checks, that updating SO_RCVLOWAT value also sends
>>>
>>> You can avoid "This adds", and write just "Add test ...".
>>>
>>> See 
>>> https://docs.kernel.org/process/submitting-patches.html#describe-your-changes
>>>
>>>     Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
>>>     instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
>>>     to do frotz", as if you are giving orders to the codebase to change
>>>     its behaviour.
>>>
>>> Also in the other patch.
>>>
>>>> credit update message. Otherwise mutual hungup may happen when receiver
>>>> didn't send credit update and then calls 'poll()' with non default
>>>> SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender
>>>> waits for free space at receiver's side.
>>>>
>>>> Signed-off-by: Arseniy Krasnov 
>>>> ---
>>>> tools/testing/vsock/vsock_test.c | 131 +++
>>>> 1 file changed, 131 insertions(+)
>>>>
>>>> diff --git a/tools/testing/vsock/vsock_test.c 
>>>> b/tools/testing/vsock/vsock_test.c
>>>> index c1f7bc9abd22..c71b3875fd16 100644
>>>> --- a/tools/testing/vsock/vsock_test.c
>>>> +++ b/tools/testing/vsock/vsock_test.c
>>>> @@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct 
>>>> test_opts *opts)
>>>> close(fd);
>>>> }
>>>>
>>>> +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE    (1024 * 128)
>>>> +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE    (1024 * 64)
>>>
>>> What about adding a comment like the one in the cover letter about
>>> dependency with kernel values?
>>>
>>> Please add it also in the commit description.
>>>
>>> I'm thinking if we should move all the defines that depends on the
>>> kernel in some special header.
>>
>> IIUC it will be new header file in tools/testing/vsock, which includes such 
>> defines. At
>> this moment in will contain only VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. Idea is that 
>> such defines
> 
> So this only works on the virtio transport though, not the other
> transports, right? (but maybe the others don't have this problem, so
> it's fine).

Yes, this case is only relevant in virtio, as this logic exists in virtio
only (the same situation as with skb merging some time ago).

> 
>> are not supposed to use by user (so do not move it to uapi headers), but 
>> needed by tests
>> to check kernel behaviour. Please correct me if i'm wrong.
> 
> Right!
> Maybe if it's just one, we can leave it there for now, but with a
> comment on top explaining where it comes.

Ok, got it, I'll add comment

Thanks, Arseniy

> 
> Thanks,
> Stefano
> 



Re: [RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-16 Thread Arseniy Krasnov



On 15.11.2023 14:11, Stefano Garzarella wrote:
> On Wed, Nov 08, 2023 at 10:20:04AM +0300, Arseniy Krasnov wrote:
>> This adds test which checks, that updating SO_RCVLOWAT value also sends
> 
> You can avoid "This adds", and write just "Add test ...".
> 
> See 
> https://docs.kernel.org/process/submitting-patches.html#describe-your-changes
> 
>     Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
>     instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
>     to do frotz", as if you are giving orders to the codebase to change
>     its behaviour.
> 
> Also in the other patch.
> 
>> credit update message. Otherwise mutual hungup may happen when receiver
>> didn't send credit update and then calls 'poll()' with non default
>> SO_RCVLOWAT value (e.g. waiting enough bytes to read), while sender
>> waits for free space at receiver's side.
>>
>> Signed-off-by: Arseniy Krasnov 
>> ---
>> tools/testing/vsock/vsock_test.c | 131 +++
>> 1 file changed, 131 insertions(+)
>>
>> diff --git a/tools/testing/vsock/vsock_test.c 
>> b/tools/testing/vsock/vsock_test.c
>> index c1f7bc9abd22..c71b3875fd16 100644
>> --- a/tools/testing/vsock/vsock_test.c
>> +++ b/tools/testing/vsock/vsock_test.c
>> @@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct 
>> test_opts *opts)
>> close(fd);
>> }
>>
>> +#define RCVLOWAT_CREDIT_UPD_BUF_SIZE    (1024 * 128)
>> +#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE    (1024 * 64)
> 
> What about adding a comment like the one in the cover letter about
> dependency with kernel values?
> 
> Please add it also in the commit description.
> 
> I'm thinking if we should move all the defines that depends on the
> kernel in some special header.

IIUC it will be a new header file in tools/testing/vsock, which includes
such defines. At this moment it will contain only
VIRTIO_VSOCK_MAX_PKT_BUF_SIZE. The idea is that such defines are not
supposed to be used by users (so do not move them to uapi headers), but are
needed by tests to check kernel behaviour. Please correct me if I'm wrong.

Thanks, Arseniy

> 
>> +
>> +static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
>> *opts)
>> +{
>> +    size_t buf_size;
>> +    void *buf;
>> +    int fd;
>> +
>> +    fd = vsock_stream_connect(opts->peer_cid, 1234);
>> +    if (fd < 0) {
>> +    perror("connect");
>> +    exit(EXIT_FAILURE);
>> +    }
>> +
>> +    /* Send 1 byte more than peer's buffer size. */
>> +    buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
>> +
>> +    buf = malloc(buf_size);
>> +    if (!buf) {
>> +    perror("malloc");
>> +    exit(EXIT_FAILURE);
>> +    }
>> +
>> +    /* Wait until peer sets needed buffer size. */
>> +    control_expectln("SRVREADY");
>> +
>> +    if (send(fd, buf, buf_size, 0) != buf_size) {
>> +    perror("send failed");
>> +    exit(EXIT_FAILURE);
>> +    }
>> +
>> +    free(buf);
>> +    close(fd);
>> +}
>> +
>> +static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
>> *opts)
>> +{
>> +    size_t recv_buf_size;
>> +    struct pollfd fds;
>> +    size_t buf_size;
>> +    void *buf;
>> +    int fd;
>> +
>> +    fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
>> +    if (fd < 0) {
>> +    perror("accept");
>> +    exit(EXIT_FAILURE);
>> +    }
>> +
>> +    buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
>> +
>> +    if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
>> +   _size, sizeof(buf_size))) {
>> +    perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
>> +    exit(EXIT_FAILURE);
>> +    }
>> +
>> +    buf = malloc(buf_size);
>> +    if (!buf) {
>> +    perror("malloc");
>> +    exit(EXIT_FAILURE);
>> +    }
>> +
>> +    control_writeln("SRVREADY");
>> +
>> +    /* Wait until there will be 128KB of data in rx queue. */
>> +    while (1) {
>> +    ssize_t res;
>> +
>> +    res = recv(fd, buf, buf_size, MSG_PEEK);
>> +    if (res == buf_size)
>> +    break;
>> +
>> +    if (res <= 0) {
>> +    fprintf(stderr, "unexpected 'recv()' return: %zi\n", res);
>> +    exit(EXIT_FAILURE);
>> +    }
&g

Re: [RFC PATCH v1 1/2] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-16 Thread Arseniy Krasnov



On 15.11.2023 14:08, Stefano Garzarella wrote:
> On Wed, Nov 08, 2023 at 10:20:03AM +0300, Arseniy Krasnov wrote:
>> This adds sending credit update message when SO_RCVLOWAT is updated and
>> it is bigger than number of bytes in rx queue. It is needed, because
>> 'poll()' will wait until number of bytes in rx queue will be not smaller
>> than SO_RCVLOWAT, so kick sender to send more data. Otherwise mutual
>> hungup for tx/rx is possible: sender waits for free space and receiver
>> is waiting data in 'poll()'.
>>
>> Signed-off-by: Arseniy Krasnov 
>> ---
>> drivers/vhost/vsock.c   |  2 ++
>> include/linux/virtio_vsock.h    |  1 +
>> net/vmw_vsock/virtio_transport.c    |  2 ++
>> net/vmw_vsock/virtio_transport_common.c | 31 +
>> net/vmw_vsock/vsock_loopback.c  |  2 ++
>> 5 files changed, 38 insertions(+)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index f75731396b7e..ecfa5c11f5ee 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
>>     .notify_buffer_size   = virtio_transport_notify_buffer_size,
>>
>>     .read_skb = virtio_transport_read_skb,
>> +
>> +    .set_rcvlowat = virtio_transport_set_rcvlowat
>> },
>>
>> .send_pkt = vhost_transport_send_pkt,
>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>> index ebb3ce63d64d..97dc1bebc69c 100644
>> --- a/include/linux/virtio_vsock.h
>> +++ b/include/linux/virtio_vsock.h
>> @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct 
>> virtio_vsock_sock *vvs, u32 credit);
>> void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
>> int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
>> int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
>> read_actor);
>> +int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val);
>> #endif /* _LINUX_VIRTIO_VSOCK_H */
>> diff --git a/net/vmw_vsock/virtio_transport.c 
>> b/net/vmw_vsock/virtio_transport.c
>> index af5bab1acee1..cf3431189d0c 100644
>> --- a/net/vmw_vsock/virtio_transport.c
>> +++ b/net/vmw_vsock/virtio_transport.c
>> @@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = {
>>     .notify_buffer_size   = virtio_transport_notify_buffer_size,
>>
>>     .read_skb = virtio_transport_read_skb,
>> +
>> +    .set_rcvlowat = virtio_transport_set_rcvlowat
>> },
>>
>> .send_pkt = virtio_transport_send_pkt,
>> diff --git a/net/vmw_vsock/virtio_transport_common.c 
>> b/net/vmw_vsock/virtio_transport_common.c
>> index e22c81435ef7..88a58163046e 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -1676,6 +1676,37 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
>> skb_read_actor_t recv_acto
>> }
>> EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
>>
>> +int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val)
>> +{
>> +    struct virtio_vsock_sock *vvs = vsk->trans;
>> +    bool send_update = false;
> 
> I'd declare this not initialized.
> 
>> +
>> +    spin_lock_bh(>rx_lock);
>> +
>> +    /* If number of available bytes is less than new
>> + * SO_RCVLOWAT value, kick sender to send more
>> + * data, because sender may sleep in its 'send()'
>> + * syscall waiting for enough space at our side.
>> + */
>> +    if (vvs->rx_bytes < val)
>> +    send_update = true;
> 
> Then here just:
> send_update = vvs->rx_bytes < val;
> 
>> +
>> +    spin_unlock_bh(>rx_lock);
>> +
>> +    if (send_update) {
>> +    int err;
>> +
>> +    err = virtio_transport_send_credit_update(vsk);
>> +    if (err < 0)
>> +    return err;
>> +    }
>> +
>> +    WRITE_ONCE(sk_vsock(vsk)->sk_rcvlowat, val ? : 1);
> 
> Not in this patch, but what about doing this in vsock_set_rcvlowat() in 
> af_vsock.c?
> 
> I mean avoid to return if `transport->set_rcvlowat(vsk, val)` is
> successfully, so set sk_rcvlowat in a single point.

Yes, we can do it. I'll include a new patch as 0001 in v2; I don't remember
why it wasn't implemented in this way before.

Thanks, Arseniy

> 
> The rest LGTM!
> 
> Stefano
> 
>> +
>> +    return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(virti

[RFC PATCH v1 1/2] virtio/vsock: send credit update during setting SO_RCVLOWAT

2023-11-07 Thread Arseniy Krasnov
This adds sending a credit update message when SO_RCVLOWAT is updated and
it is bigger than the number of bytes in the rx queue. It is needed because
'poll()' will wait until the number of bytes in the rx queue is not smaller
than SO_RCVLOWAT, so kick the sender to send more data. Otherwise a mutual
hang-up of tx/rx is possible: the sender waits for free space and the
receiver is waiting for data in 'poll()'.

Signed-off-by: Arseniy Krasnov 
---
 drivers/vhost/vsock.c   |  2 ++
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  2 ++
 net/vmw_vsock/virtio_transport_common.c | 31 +
 net/vmw_vsock/vsock_loopback.c  |  2 ++
 5 files changed, 38 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index f75731396b7e..ecfa5c11f5ee 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -451,6 +451,8 @@ static struct virtio_transport vhost_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index ebb3ce63d64d..97dc1bebc69c 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock 
*vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
 int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t 
read_actor);
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index af5bab1acee1..cf3431189d0c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -539,6 +539,8 @@ static struct virtio_transport virtio_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index e22c81435ef7..88a58163046e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1676,6 +1676,37 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, 
skb_read_actor_t recv_acto
 }
 EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
 
+int virtio_transport_set_rcvlowat(struct vsock_sock *vsk, int val)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool send_update = false;
+
+   spin_lock_bh(>rx_lock);
+
+   /* If number of available bytes is less than new
+* SO_RCVLOWAT value, kick sender to send more
+* data, because sender may sleep in its 'send()'
+* syscall waiting for enough space at our side.
+*/
+   if (vvs->rx_bytes < val)
+   send_update = true;
+
+   spin_unlock_bh(>rx_lock);
+
+   if (send_update) {
+   int err;
+
+   err = virtio_transport_send_credit_update(vsk);
+   if (err < 0)
+   return err;
+   }
+
+   WRITE_ONCE(sk_vsock(vsk)->sk_rcvlowat, val ? : 1);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_rcvlowat);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 048640167411..388c157f6633 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
.notify_buffer_size   = virtio_transport_notify_buffer_size,
 
.read_skb = virtio_transport_read_skb,
+
+   .set_rcvlowat = virtio_transport_set_rcvlowat
},
 
.send_pkt = vsock_loopback_send_pkt,
-- 
2.25.1




[RFC PATCH v1 2/2] vsock/test: SO_RCVLOWAT + deferred credit update test

2023-11-07 Thread Arseniy Krasnov
This adds a test which checks that updating the SO_RCVLOWAT value also
sends a credit update message. Otherwise a mutual hang-up may happen when
the receiver didn't send a credit update and then calls 'poll()' with a
non-default SO_RCVLOWAT value (e.g. waiting for enough bytes to read),
while the sender waits for free space at the receiver's side.

Signed-off-by: Arseniy Krasnov 
---
 tools/testing/vsock/vsock_test.c | 131 +++
 1 file changed, 131 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index c1f7bc9abd22..c71b3875fd16 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1180,6 +1180,132 @@ static void test_stream_shutrd_server(const struct 
test_opts *opts)
close(fd);
 }
 
+#define RCVLOWAT_CREDIT_UPD_BUF_SIZE   (1024 * 128)
+#define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE  (1024 * 64)
+
+static void test_stream_rcvlowat_def_cred_upd_client(const struct test_opts 
*opts)
+{
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send 1 byte more than peer's buffer size. */
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE + 1;
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Wait until peer sets needed buffer size. */
+   control_expectln("SRVREADY");
+
+   if (send(fd, buf, buf_size, 0) != buf_size) {
+   perror("send failed");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
+static void test_stream_rcvlowat_def_cred_upd_server(const struct test_opts 
*opts)
+{
+   size_t recv_buf_size;
+   struct pollfd fds;
+   size_t buf_size;
+   void *buf;
+   int fd;
+
+   fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
+   if (fd < 0) {
+   perror("accept");
+   exit(EXIT_FAILURE);
+   }
+
+   buf_size = RCVLOWAT_CREDIT_UPD_BUF_SIZE;
+
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+  _size, sizeof(buf_size))) {
+   perror("setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+   exit(EXIT_FAILURE);
+   }
+
+   buf = malloc(buf_size);
+   if (!buf) {
+   perror("malloc");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln("SRVREADY");
+
+   /* Wait until there will be 128KB of data in rx queue. */
+   while (1) {
+   ssize_t res;
+
+   res = recv(fd, buf, buf_size, MSG_PEEK);
+   if (res == buf_size)
+   break;
+
+   if (res <= 0) {
+   fprintf(stderr, "unexpected 'recv()' return: %zi\n", 
res);
+   exit(EXIT_FAILURE);
+   }
+   }
+
+   /* There is 128KB of data in the socket's rx queue,
+* dequeue first 64KB, credit update is not sent.
+*/
+   recv_buf_size = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   recv_buf_size++;
+
+   /* Updating SO_RCVLOWAT will send credit update. */
+   if (setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT,
+  _buf_size, sizeof(recv_buf_size))) {
+   perror("setsockopt(SO_RCVLOWAT)");
+   exit(EXIT_FAILURE);
+   }
+
+   memset(, 0, sizeof(fds));
+   fds.fd = fd;
+   fds.events = POLLIN | POLLRDNORM | POLLERR |
+POLLRDHUP | POLLHUP;
+
+   /* This 'poll()' will return once we receive last byte
+* sent by client.
+*/
+   if (poll(, 1, -1) < 0) {
+   perror("poll");
+   exit(EXIT_FAILURE);
+   }
+
+   if (fds.revents & POLLERR) {
+   fprintf(stderr, "'poll()' error\n");
+   exit(EXIT_FAILURE);
+   }
+
+   if (fds.revents & (POLLIN | POLLRDNORM)) {
+   recv_buf(fd, buf, recv_buf_size, 0, recv_buf_size);
+   } else {
+   /* These flags must be set, as there is at
+* least 64KB of data ready to read.
+*/
+   fprintf(stderr, "POLLIN | POLLRDNORM expected\n");
+   exit(EXIT_FAILURE);
+   }
+
+   free(buf);
+   close(fd);
+}
+
 static struct test_case test_cases[] = {
{
.name = "SOCK_STREAM connection reset",
@@ -1285,6 +1411,11 @@ static struct test_case test_cases[] = {
.run_client = test_stream_msgzcopy_empty_errq_client,
.run_server = test_stream_msgzcopy_empty_errq_server,
},
+   {
+   .

[RFC PATCH v1 0/2] send credit update during setting SO_RCVLOWAT

2023-11-07 Thread Arseniy Krasnov
Hello,

   DESCRIPTION

This patchset fixes an old problem with a hang of both rx/tx sides and adds
a test for it. This happens due to a non-default SO_RCVLOWAT value and
deferred credit update in virtio/vsock. Link to previous old patchset:
https://lore.kernel.org/netdev/39b2e9fd-601b-189d-39a9-914e55745...@sberdevices.ru/

Here is what happens step by step:

  TEST

INITIAL CONDITIONS

1) Vsock buffer size is 128KB.
2) Maximum packet size is also 64KB as defined in header (yes it is
   hardcoded, just to remind about that value).
3) SO_RCVLOWAT is default, e.g. 1 byte.


 STEPS

SENDER  RECEIVER
1) sends 128KB + 1 byte in a
   single buffer. 128KB will
   be sent, but for 1 byte
   sender will wait for free
   space at peer. Sender goes
   to sleep.


2) reads 64KB, credit update not sent
3) sets SO_RCVLOWAT to 64KB + 1
4) poll() -> wait forever, there is
   only 64KB available to read.

So in step 4) receiver also goes to sleep, waiting for enough data or
connection shutdown message from the sender. Idea to fix it is that rx
kicks tx side to continue transmission (and may be close connection)
when rx changes number of bytes to be woken up (e.g. SO_RCVLOWAT) and
this value is bigger than number of available bytes to read.

I've added small test for this, but not sure as it uses hardcoded value
for maximum packet length, this value is defined in kernel header and
used to control deferred credit update. And as this is not available to
userspace, I can't control test parameters correctly (if one day this
define will be changed - test may become useless). 

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=ff269e2cd5adce4ae14f883fc9c8803bc43ee1e9

Arseniy Krasnov (2):
  virtio/vsock: send credit update during setting SO_RCVLOWAT
  vsock/test: SO_RCVLOWAT + deferred credit update test

 drivers/vhost/vsock.c   |   2 +
 include/linux/virtio_vsock.h|   1 +
 net/vmw_vsock/virtio_transport.c|   2 +
 net/vmw_vsock/virtio_transport_common.c |  31 ++
 net/vmw_vsock/vsock_loopback.c  |   2 +
 tools/testing/vsock/vsock_test.c| 131 
 6 files changed, 169 insertions(+)

-- 
2.25.1




[PATCH 02/13] hperf_hmp: introduce new domain flag.

2015-11-06 Thread Arseniy Krasnov
New scheduler domain type: HMP. Each big.LITTLE cluster is detected by
the scheduler as an HMP domain. The HPERF_HMP logic works between the two HMP
domains; the default CFS logic, in turn, works inside an HMP domain.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 arch/arm/kernel/topology.c | 6 +-
 include/linux/sched.h  | 4 
 kernel/sched/core.c| 9 -
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 08b7847..7fcc5fe 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -285,7 +285,11 @@ static struct sched_domain_topology_level arm_topology[] = 
{
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { cpu_cpu_mask,
+#ifdef CONFIG_HPERF_HMP
+.flags = SD_HMP_BALANCE,
+#endif
+SD_INIT_NAME(DIE)},
{ NULL, },
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7b9501..eb084df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -990,6 +990,10 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_OVERLAP 0x2000  /* sched_domains of this level overlap 
*/
 #define SD_NUMA0x4000  /* cross-node balancing */
 
+#ifdef CONFIG_HPERF_HMP
+#define SD_HMP_BALANCE 0x8000  /* Use HMP load balancing algorithm */
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..16092e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6410,6 +6410,9 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
+#ifdef CONFIG_HPERF_HMP
+   | (tl->flags & SD_HMP_BALANCE)
+#endif
,
 
.last_balance   = jiffies,
@@ -6472,7 +6475,11 @@ static struct sched_domain_topology_level 
default_topology[] = {
 #ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { cpu_cpu_mask,
+#ifdef CONFIG_HPERF_HMP
+.flags = SD_HMP_BALANCE,
+#endif
+SD_INIT_NAME(DIE)},
{ NULL, },
 };
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 04/13] hperf_hmp: scheduler initialization routines.

2015-11-06 Thread Arseniy Krasnov
Adds new fields to the 'rq' structure and a routine called during fair
class setup, which initializes some HMP scheduler variables: the big and little
cluster masks. They are read from the kernel config (if set), otherwise default
values are used.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/core.c  |  4 
 kernel/sched/fair.c  | 46 ++
 kernel/sched/sched.h | 15 +++
 3 files changed, 65 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3a632f..8747e06 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7488,6 +7488,10 @@ void __init sched_init(void)
 #endif
init_rq_hrtick(rq);
atomic_set(>nr_iowait, 0);
+#ifdef CONFIG_HPERF_HMP
+   rq->druntime_sum = 0;
+   rq->nr_hmp_tasks = 0;
+#endif
}
 
set_load_weight(_task);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..c57007f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -100,6 +100,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 
50UL;
  */
 unsigned int __read_mostly sysctl_sched_shares_window = 1000UL;
 
+#ifdef CONFIG_HPERF_HMP
+extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
+static unsigned int freq_scale_cpu_power[CONFIG_NR_CPUS];
+#endif /* CONFIG_HPERF_HMP */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -8305,8 +8310,38 @@ void show_numa_stats(struct task_struct *p, struct 
seq_file *m)
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 
+#ifdef CONFIG_HPERF_HMP
+static unsigned long default_fast_mask = 0x0F;
+static unsigned long default_slow_mask = 0xF0;
+
+void hmp_set_cpu_masks(struct cpumask *fast_mask, struct cpumask *slow_mask)
+{
+   cpumask_clear(fast_mask);
+   cpumask_clear(slow_mask);
+
+   /* try to parse CPU masks from config */
+   if (strlen(CONFIG_HMP_FAST_CPU_MASK) &&
+   strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+   if (cpumask_parse(CONFIG_HMP_FAST_CPU_MASK, fast_mask) ||
+   cpumask_parse(CONFIG_HMP_SLOW_CPU_MASK, slow_mask))
+   pr_err("hperf_hmp: Failed to get CPU masks from 
config!\n");
+   else
+   return;
+   }
+
+   pr_err("hperf_hmp: Fast mask will be: %08lX, slow mask: %08lX\n",
+  default_fast_mask, default_slow_mask);
+
+   fast_mask->bits[0] = default_fast_mask;
+   slow_mask->bits[0] = default_slow_mask;
+}
+#endif
+
 __init void init_sched_fair_class(void)
 {
+#ifdef CONFIG_HPERF_HMP
+   int cpu;
+#endif
 #ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
@@ -8315,6 +8350,17 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var(_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
 #endif
+
+#ifdef CONFIG_HPERF_HMP
+   for_each_possible_cpu(cpu)
+   freq_scale_cpu_power[cpu] = SCHED_CAPACITY_SCALE;
+   hmp_set_cpu_masks(cpu_fastest_mask, cpu_slowest_mask);
+   pr_info("hperf_hmp: fast CPUs mask: %08X\n",
+   (unsigned int)cpumask_bits(cpu_fastest_mask)[0]);
+   pr_info("hperf_hmp: slow CPUs mask: %08X\n",
+   (unsigned int)cpumask_bits(cpu_slowest_mask)[0]);
+#endif
+
 #endif /* SMP */
 
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119..94828dc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -597,6 +597,11 @@ struct rq {
 */
unsigned long nr_uninterruptible;
 
+#ifdef CONFIG_HPERF_HMP
+   /* shows the amount of accumulated unfairness by tasks of this rq */
+   long druntime_sum;
+   unsigned int nr_hmp_tasks;
+#endif
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -892,6 +897,16 @@ static inline unsigned int group_first_cpu(struct 
sched_group *group)
 
 extern int group_balance_cpu(struct sched_group *sg);
 
+#ifdef CONFIG_HPERF_HMP
+extern struct cpumask *cpu_fastest_mask;
+extern struct cpumask *cpu_slowest_mask;
+
+static inline bool cpu_is_fastest(int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_fastest_mask);
+}
+#endif
+
 #else
 
 static inline void sched_ttwu_pending(void) { }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 05/13] hperf_hmp: introduce druntime metric.

2015-11-06 Thread Arseniy Krasnov
This patch adds a special per-task metric used to look for a candidate for
migration between HMP domains (clusters). 'druntime' grows when a task runs on
the A7 cluster, and goes down on the A15 cluster. Also, druntime is scaled
according to the load on the little cluster in order to align its value with
the big cluster's total druntime. For migration from the big/little to the
little/big cluster, the task with the lowest/highest 'druntime' is chosen.
'druntime' is used to execute each task on each cluster for approximately the
same amount of time. 'druntime' is recalculated on each call of the default
'update_curr' function.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 include/linux/sched.h |   3 ++
 kernel/sched/core.c   |   3 ++
 kernel/sched/fair.c   | 115 ++
 3 files changed, 121 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa72125..89c1bf3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1257,6 +1257,9 @@ struct sched_entity {
struct list_headgroup_node;
unsigned inton_rq;
 
+#ifdef CONFIG_HPERF_HMP
+   longdruntime;
+#endif
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8747e06..6883a00 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2085,6 +2085,9 @@ static void __sched_fork(unsigned long clone_flags, 
struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime  = 0;
+#ifdef CONFIG_HPERF_HMP
+   p->se.druntime  = 0;
+#endif
INIT_LIST_HEAD(>se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c57007f..e94fab4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -102,6 +102,10 @@ unsigned int __read_mostly sysctl_sched_shares_window = 
1000UL;
 
 #ifdef CONFIG_HPERF_HMP
 extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
+static atomic_t a15_nr_hmp_busy = ATOMIC_INIT(0);
+static atomic_t a7_nr_hmp_busy = ATOMIC_INIT(0);
+static atomic_t hmp_imbalance = ATOMIC_INIT(0);
+
 static unsigned int freq_scale_cpu_power[CONFIG_NR_CPUS];
 #endif /* CONFIG_HPERF_HMP */
 
@@ -660,6 +664,115 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+#ifdef CONFIG_HPERF_HMP
+static bool
+is_task_hmp(struct task_struct *task, const struct cpumask *task_cpus)
+{
+   if (!task_cpus)
+   task_cpus = tsk_cpus_allowed(task);
+
+   /*
+* Check if a task has cpus_allowed only for one CPU domain (A15 or A7)
+*/
+   return !(cpumask_intersects(task_cpus, cpu_fastest_mask) ^
+cpumask_intersects(task_cpus, cpu_slowest_mask));
+}
+
+#ifdef CONFIG_HPERF_HMP_DEBUG
+static inline void check_druntime_sum(struct rq *rq, long druntime_sum)
+{
+   BUG_ON(rq->cfs.h_nr_running == 0 && druntime_sum != 0);
+
+   if (cpu_is_fastest(rq->cpu))
+   BUG_ON(druntime_sum > 0);
+   else
+   BUG_ON(druntime_sum < 0);
+}
+#else
+static inline void check_druntime_sum(struct rq *rq, long druntime_sum)
+{
+}
+#endif
+
+static inline void add_druntime_sum(struct rq *rq, long delta)
+{
+   rq->druntime_sum += delta;
+   check_druntime_sum(rq, rq->druntime_sum);
+}
+/* Updates druntime for a task */
+static inline void
+update_hmp_stat(struct cfs_rq *cfs_rq, struct sched_entity *curr,
+   unsigned long delta_exec)
+{
+   long to_add;
+   unsigned int hmp_fairness_threshold = 240;
+   struct rq *rq = rq_of(cfs_rq);
+   int a7_nr_hmp_busy_tmp;
+
+   if (atomic_read(_imbalance) == 0)
+   return;
+
+   if (!curr->on_rq)
+   return;
+
+   if (!entity_is_task(curr))
+   return;
+
+   if (!task_of(curr)->on_rq)
+   return;
+
+   if (!cfs_rq->h_nr_running)
+   return;
+
+   if (!is_task_hmp(task_of(curr), NULL))
+   return;
+
+   delta_exec = delta_exec >> 10;
+
+   if (cpu_is_fastest(rq->cpu))
+   to_add = -delta_exec;
+   else
+   to_add = delta_exec;
+
+   to_add -= curr->druntime;
+
+   /* Avoid values with the different sign */
+   if ((cpu_is_fastest(rq->cpu) && to_add >= 0) ||
+   (!cpu_is_fastest(rq->cpu) && to_add <= 0))
+   return;
+
+   to_add /= (long)(2 + 4 * hmp_fairness_threshold /
+   (cfs_rq->h_nr_running + 1));
+
+   a7_nr_hmp_busy_tmp = atomic_read(_nr_hmp_busy);
+   /* druntim

[PATCH 06/13] hperf_hmp: is_hmp_imbalance introduced.

2015-11-06 Thread Arseniy Krasnov
The 'is_hmp_imbalance' function calculates the imbalance between clusters;
four cases are possible: balancing from/to one of the clusters, task swap (when
clusters are balanced) or skipping rebalance. The function calculates the load
difference between the two clusters (cluster load / cluster power) and the
threshold at which balancing is needed.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c | 103 
 1 file changed, 103 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e94fab4..3ab39b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -104,9 +104,21 @@ unsigned int __read_mostly sysctl_sched_shares_window = 
1000UL;
 extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
 static atomic_t a15_nr_hmp_busy = ATOMIC_INIT(0);
 static atomic_t a7_nr_hmp_busy = ATOMIC_INIT(0);
+
+/* Total weight of all running tasks on A15 and A7 CPU domains */
+static atomic_long_t a15_total_weight = ATOMIC_LONG_INIT(0);
+static atomic_long_t a7_total_weight = ATOMIC_LONG_INIT(0);
+
 static atomic_t hmp_imbalance = ATOMIC_INIT(0);
 
 static unsigned int freq_scale_cpu_power[CONFIG_NR_CPUS];
+
+enum hmp_balance_actions {
+   SWAP_TASKS,
+   A15_TO_A7,
+   A7_TO_A15,
+   SKIP_REBALANCE,
+};
 #endif /* CONFIG_HPERF_HMP */
 
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -7016,6 +7028,97 @@ static int should_we_balance(struct lb_env *env)
 */
return balance_cpu == env->dst_cpu;
 }
+#ifdef CONFIG_HPERF_HMP
+/**
+ * is_hmp_imbalance(): Calculates imbalance between HMP domains.
+ * @sd: Current sched domain.
+ *
+ * Returns migration direction(see SWAP_TASKS, A15_TO_A7, A7_TO_A15,
+ * SKIP_REBALANCE).
+ *
+ * Imbalance depends on load of tasks on A15 cores and A7 cores,
+ * current CPU's frequencies, and A7 slowdown coefficient which is about 2.4.
+ */
+static int is_hmp_imbalance(struct sched_domain *sd)
+{
+   int imbalance, cpu;
+   int a15_group_power = 0, a7_group_power = 0,
+   hmp_imbalance_min_threshold;
+   int a15_group_load, a7_group_load, a15_a7_group_power;
+   unsigned int a7_balanced_num;
+   int reminder, divisor;
+   unsigned int a15_balanced_num;
+   long long int hmp_imbalance_threshold;
+
+   if (!sd->a15_group) {
+   return SKIP_REBALANCE;
+   }
+
+   if (!sd->a7_group) {
+   return SKIP_REBALANCE;
+   }
+   for_each_online_cpu(cpu) {
+   if (cpu_is_fastest(cpu))
+   a15_group_power += freq_scale_cpu_power[cpu];
+   else
+   a7_group_power += freq_scale_cpu_power[cpu];
+   }
+
+   if (a15_group_power == 0 || a7_group_power == 0) {
+   return SKIP_REBALANCE;
+   }
+
+   a15_balanced_num = 0;
+   a7_balanced_num = 0;
+
+   for_each_online_cpu(cpu) {
+   if (cpu_rq(cpu)->cfs.h_nr_running <= 1) {
+   if (cpu_is_fastest(cpu))
+   a15_balanced_num++;
+   else
+   a7_balanced_num++;
+   }
+   }
+
+   a7_group_load = atomic_long_read(_total_weight);
+
+   if (atomic_long_read(_total_weight) == 0 &&
+   (a15_balanced_num == sd->a15_group->group_weight)) {
+   return SKIP_REBALANCE;
+   }
+
+   a15_group_load = atomic_long_read(_total_weight);
+   a15_a7_group_power = a15_group_power + a7_group_power;
+
+   imbalance = (a15_group_load * 1024) / (a15_group_power) -
+   (a7_group_load * 1024) / (a7_group_power);
+   hmp_imbalance_threshold = ((long long int)NICE_0_LOAD *
+  1024 * a15_a7_group_power);
+   divisor = 2 * a15_group_power * a7_group_power;
+   hmp_imbalance_threshold = div_s64_rem(hmp_imbalance_threshold,
+   divisor, );
+   hmp_imbalance_min_threshold = hmp_imbalance_threshold >> 3;
+
+   if (imbalance < hmp_imbalance_min_threshold &&
+   imbalance > -hmp_imbalance_min_threshold) {
+   atomic_set(_imbalance, 0);
+   return SKIP_REBALANCE;
+   }
+
+   if (imbalance > hmp_imbalance_threshold) {
+   return A15_TO_A7;
+   } else {
+   if (imbalance < -hmp_imbalance_threshold) {
+   if (a7_balanced_num == sd->a7_group->group_weight)
+   return SWAP_TASKS;
+   else
+   return A7_TO_A15;
+   } else {
+   return SWAP_TASKS;
+   }
+   }
+}
+#endif /* CONFIG_HPERF_HMP */
 
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
-- 
1.9.1

[PATCH 10/13] hperf_hmp: idle pull function.

2015-11-06 Thread Arseniy Krasnov
HMP idle pull is triggered when a CPU becomes idle. It tries to pull a
task from another cluster when that cluster is overloaded. Also, an A7 core
can't pull a lone task from an A15 core, but an A15 core can do that with an
A7 core. The task for migration is chosen in the same way as in other HMP
migration cases - using the 'druntime' metric. The only difference is that the
migrating task doesn't need to run 5ms on its cluster before migration.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c | 66 +
 1 file changed, 66 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4fda1ec..fd16729 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7421,6 +7421,72 @@ static unsigned int try_to_move_task(struct task_struct 
*migrate_task,
 }
 
 /**
+ * hmp_idle_pull(): Pulls task from opposite domain of this_cpu to this_cpu.
+ * @sd: Current sched domain.
+ * @this_cpu: without NO_HZ same as smp_processor_id().
+ *
+ * Returns moved weight.
+ *
+ * Chooses task by its druntime. Ignores task's druntime and
+ * time of last HMP migration. Also A7 can't pulls task from A15
+ * if A15 become idle.
+ */
+static unsigned int hmp_idle_pull(struct sched_domain *sd, int this_cpu)
+{
+   unsigned int ld_moved = 0;
+   struct task_struct *task_to_pull;
+   unsigned long local_flags;
+   int idle_stopper = 0;
+   struct rq *local_rq;
+   struct rq *rq;
+
+   local_irq_save(local_flags);
+   local_rq = cpu_rq(this_cpu);
+   rq = get_unfair_rq(sd, this_cpu);
+
+   if (!rq) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+   double_lock_balance(rq, local_rq);
+
+   if (rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   /* Forbids secondary CPUs to pull alone task from primary CPUs */
+   if (!cpu_is_fastest(this_cpu) && rq->cfs.h_nr_running <= 1)
+   goto unlock;
+
+   /* Get task to pull from opposite domain to this_cpu */
+   task_to_pull = get_migration_candidate(sd, rq, 1, this_cpu);
+
+   if (!task_to_pull)
+   goto unlock;
+
+   ld_moved = try_to_move_task(task_to_pull, this_cpu, _stopper);
+
+   if (idle_stopper) {
+   rq->push_cpu = this_cpu;
+   rq->active_balance = 1;
+   rq->migrate_task = task_to_pull;
+   }
+
+unlock:
+   double_rq_unlock(local_rq, rq);
+   local_irq_restore(local_flags);
+
+   if (idle_stopper)
+   stop_one_cpu_nowait(rq->cpu, active_load_balance_cpu_stop,
+   rq, >active_balance_work);
+
+   return ld_moved;
+}
+
+
+/**
  * swap_tasks(): swaps two tasks from different HMP domains
  * @sd: Current sched domain
  * @this_cpu: without NO_HZ same as smp_processor_id().
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 03/13] hperf_hmp: add sched domains initialization.

2015-11-06 Thread Arseniy Krasnov
Attaching CPU clusters as 'sched_group' to HMP domains. Each HMP domain
has two pointers to the A15 and A7 scheduling groups (struct sched_group).

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 include/linux/sched.h |  4 
 kernel/sched/core.c   | 49 +
 2 files changed, 53 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eb084df..aa72125 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1057,6 +1057,10 @@ struct sched_domain {
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
 
+#ifdef CONFIG_HPERF_HMP
+   struct sched_group *a15_group;
+   struct sched_group *a7_group;
+#endif
 #ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 16092e0..e3a632f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,16 @@
 #define CREATE_TRACE_POINTS
 #include 
 
+#ifdef CONFIG_HPERF_HMP
+/* cpumask for A15 cpus */
+static DECLARE_BITMAP(cpu_fastest_bits, CONFIG_NR_CPUS);
+struct cpumask *cpu_fastest_mask = to_cpumask(cpu_fastest_bits);
+
+/* cpumask for A7 cpus */
+static DECLARE_BITMAP(cpu_slowest_bits, CONFIG_NR_CPUS);
+struct cpumask *cpu_slowest_mask = to_cpumask(cpu_slowest_bits);
+#endif
+
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -6971,6 +6981,45 @@ static int build_sched_domains(const struct cpumask 
*cpu_map,
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
+
+#ifdef CONFIG_HPERF_HMP
+   for (i = nr_cpumask_bits - 1; i >= 0; i--) {
+   if (!cpumask_test_cpu(i, cpu_map))
+   continue;
+
+   for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+   struct sched_group *sg;
+   sd->a7_group = NULL;
+   sd->a15_group = NULL;
+
+   /* Process only HMP domains */
+   if (!(sd->flags & SD_HMP_BALANCE))
+   continue;
+
+   /*
+* Process sched groups of this domain.
+* Attach sg to hmp domains.
+*/
+   sg = sd->groups;
+   do {
+   if (!sg->sgc)
+   goto next_sg;
+#ifdef CONFIG_SCHED_DEBUG
+   printk(KERN_EMERG "Attaching CPUs 0x%08lX to 
domain %s\n",
+  sched_group_cpus(sg)->bits[0], sd->name);
+#endif
+   if (cpumask_intersects(sched_group_cpus(sg),
+   cpu_fastest_mask))
+   sd->a15_group = sg;
+   else
+   sd->a7_group = sg;
+next_sg:
+   sg = sg->next;
+   } while (sg != sd->groups);
+   }
+   }
+#endif /* CONFIG_HPERF_HMP */
+
rcu_read_unlock();
 
ret = 0;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 08/13] hperf_hmp: swap tasks function.

2015-11-06 Thread Arseniy Krasnov
'swap_tasks' performs migration between the current CPU and a CPU from
another cluster. It scans the two runqueues looking for tasks using the
'druntime' metric. When both tasks are found, it pulls the task from the other
cluster and pushes the task from the current CPU.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c  | 100 +++
 kernel/sched/sched.h |   1 +
 2 files changed, 101 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff05364..028d329 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7419,6 +7419,106 @@ static unsigned int try_to_move_task(struct task_struct 
*migrate_task,
 
return migrate_runnable_task(migrate_task, destination_cpu);
 }
+
+/**
+ * swap_tasks(): swaps two tasks from different HMP domains
+ * @sd: Current sched domain
+ * @this_cpu: without NO_HZ same as smp_processor_id().
+ *
+ * Returns weight of migrated tasks.
+ */
+static unsigned int swap_tasks(struct sched_domain *sd, int this_cpu)
+{
+   unsigned int ld_moved = 0;
+   int local_stopper = 0;
+   int foreign_stopper = 0;
+   struct rq *local_rq = cpu_rq(this_cpu);
+   struct rq *foreign_rq = NULL;
+   struct task_struct *local_task = NULL;
+   struct task_struct *foreign_task = NULL;
+   unsigned long local_flags;
+
+   local_irq_save(local_flags);
+   foreign_rq = get_unfair_rq(sd, this_cpu);
+
+   if (!foreign_rq) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+
+   double_lock_balance(foreign_rq, local_rq);
+
+   /* rq's waiting for stopper execution, return */
+   if (foreign_rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   foreign_task = get_migration_candidate(sd, foreign_rq, 0, this_cpu);
+
+   if (!foreign_task)
+   goto unlock;
+
+   /* Get local task for migration */
+   local_task = get_migration_candidate(sd, local_rq, 0, foreign_rq->cpu);
+
+   if (!local_task) {
+   foreign_task->se.migrate_candidate = 0;
+   goto unlock;
+   }
+   /* First try to push local task */
+   ld_moved = try_to_move_task(local_task, foreign_rq->cpu,
+   _stopper);
+
+   /* If failed to move, then return, don't try to move foreign task */
+   if (!ld_moved) {
+   local_task->se.migrate_candidate = 0;
+   foreign_task->se.migrate_candidate = 0;
+   goto unlock;
+   }
+
+   /*
+* Migration is possible, but task is running,
+* so mark rq to run stopper.
+*/
+   if (local_stopper) {
+   local_rq->push_cpu = foreign_rq->cpu;
+   local_rq->migrate_task = local_task;
+   local_rq->active_balance = 1;
+   }
+
+   /* Now try to pull task from another cpu */
+   ld_moved = try_to_move_task(foreign_task, this_cpu,
+   _stopper);
+
+   /* Failed to move foreign_task */
+   if (!ld_moved)
+   foreign_task->se.migrate_candidate = 0;
+
+   /* Migration is possible, mark rq to run stopper */
+   if (foreign_stopper) {
+   foreign_rq->push_cpu = this_cpu;
+   foreign_rq->migrate_task = foreign_task;
+   foreign_rq->active_balance = 1;
+   }
+
+unlock:
+   double_rq_unlock(local_rq, foreign_rq);
+   local_irq_restore(local_flags);
+
+   if (local_stopper)
+   stop_one_cpu_nowait(local_rq->cpu,
+   active_load_balance_cpu_stop, local_rq,
+   _rq->active_balance_work);
+
+   if (foreign_stopper)
+   stop_one_cpu_nowait(foreign_rq->cpu,
+   active_load_balance_cpu_stop, foreign_rq,
+   _rq->active_balance_work);
+
+   return ld_moved;
+}
 #endif /* CONFIG_HPERF_HMP */
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 94828dc..47e9605 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -598,6 +598,7 @@ struct rq {
unsigned long nr_uninterruptible;
 
 #ifdef CONFIG_HPERF_HMP
+   struct task_struct *migrate_task; /* task from this rq for migration */
/* shows the amount of accumulated unfairness by tasks of this rq */
long druntime_sum;
unsigned int nr_hmp_tasks;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 07/13] hperf_hmp: migration auxiliary functions.

2015-11-06 Thread Arseniy Krasnov
Adds functions used for migration: scanning every runqueue of the other
cluster during the migration process, searching for a task to migrate from the
runqueue found above, and a function to move a task from one CPU to another.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 include/linux/sched.h |   6 +
 kernel/sched/fair.c   | 301 ++
 2 files changed, 307 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89c1bf3..dafda4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1259,6 +1259,12 @@ struct sched_entity {
 
 #ifdef CONFIG_HPERF_HMP
longdruntime;
+
+   /* Time of last migration between HMP domains (in jiffies)*/
+   unsigned long   last_migration;
+
+   /* If set, don't touch for migration */
+   int migrate_candidate;
 #endif
u64 exec_start;
u64 sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3ab39b6..ff05364 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7118,6 +7118,307 @@ static int is_hmp_imbalance(struct sched_domain *sd)
}
}
 }
+
+/**
+ * hmp_can_migrate_task(): Checks whether specified task could be migrated.
+ * @p: task to check.
+ * @env: migration parameters.
+ *
+ * Returns 1 if migration possible, else 0.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+   if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+   schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+   return 0;
+   }
+   env->flags &= ~LBF_ALL_PINNED;
+
+   if (task_running(env->src_rq, p)) {
+   schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+   return 0;
+   }
+   return 1;
+}
+
+/**
+ * detach_specified_task(): Detaches specified task.
+ * @pm: Task to move.
+ * @env: Migration parameters.
+ *
+ * Returns moved task.
+ */
+static struct task_struct *
+detach_specified_task(struct task_struct *p, struct lb_env *env)
+{
+   lockdep_assert_held(>src_rq->lock);
+
+   /* If task to move falls asleep, so don't scan runqueue and return */
+   if (p->se.migrate_candidate == 0)
+   return 0;
+
+   if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+   goto exit;
+
+   if (!hmp_can_migrate_task(p, env))
+   goto exit;
+
+   detach_task(p, env);
+   /*
+* Right now, this is only the third place move_task()
+* is called, so we can safely collect move_task()
+* stats here rather than inside move_task().
+*/
+   schedstat_inc(env->sd, lb_gained[env->idle]);
+   return p;
+exit:
+   p->se.migrate_candidate = 0;
+
+   return NULL;
+}
+
+/**
+ * migrate_runnable_task(): Moves task that isn't running to destination CPU.
+ * @migrate_task: Task to migrate.
+ * @destination_cpu: Destination CPU.
+ *
+ * Returns moved weight.
+ *
+ * Runqueue's of @migrate_task and @destination_cpu must be locked.
+ */
+static unsigned migrate_runnable_task(struct task_struct *migrate_task,
+ int destination_cpu)
+{
+   struct sched_domain *sd = NULL;
+   int src_cpu = task_cpu(migrate_task);
+   struct rq *src_rq = task_rq(migrate_task);
+   int dst_cpu = destination_cpu;
+   struct rq *dst_rq = cpu_rq(dst_cpu);
+   unsigned int ld_moved = 0;
+   struct task_struct *p = NULL;
+
+#ifdef CONFIG_HPERF_HMP_DEBUG
+   BUG_ON(src_rq == dst_rq);
+#else
+   if (WARN_ON(src_rq == dst_rq))
+   return 0;
+#endif
+
+   rcu_read_lock();
+   for_each_domain(dst_cpu, sd) {
+   if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
+   break;
+   }
+   if (likely(sd)) {
+   struct lb_env env = {
+   .sd = sd,
+   .dst_cpu= dst_cpu,
+   .dst_rq = dst_rq,
+   .src_cpu= src_cpu,
+   .src_rq = src_rq,
+   .idle   = CPU_NOT_IDLE,
+   };
+
+   schedstat_inc(sd, alb_count);
+   p = detach_specified_task(migrate_task, );
+   if (p) {
+   migrate_task->se.last_migration = jiffies;
+   schedstat_inc(sd, alb_pushed);
+   ld_moved = migrate_task->se.load.weight;
+   } else
+   schedstat_inc(sd, alb_failed);
+   }
+   rcu_read_unlock();
+
+   if (p)
+   attach_task(dst_rq, p);
+
+   if (migrate_task->s

[PATCH 12/13] hperf_hmp: rest of logic.

2015-11-06 Thread Arseniy Krasnov
Inserts a call to the main logic from 'load_balance', the calculation of
balance parameters during enqueue/dequeue of a task to/from a runqueue, and an
affinity-mask change callback for the fair scheduling class.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c | 204 +++-
 1 file changed, 202 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79be023..06f6518 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -677,6 +677,16 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 }
 
 #ifdef CONFIG_HPERF_HMP
+static void hmp_calculate_imbalance(void)
+{
+   if (atomic_long_read(_total_weight) == 0) {
+   atomic_set(&hmp_imbalance, 0);
+   return;
+   }
+
+   atomic_set(&hmp_imbalance, 1);
+}
+
 static bool
 is_task_hmp(struct task_struct *task, const struct cpumask *task_cpus)
 {
@@ -711,6 +721,13 @@ static inline void add_druntime_sum(struct rq *rq, long 
delta)
rq->druntime_sum += delta;
check_druntime_sum(rq, rq->druntime_sum);
 }
+
+static inline void sub_druntime_sum(struct rq *rq, long delta)
+{
+   rq->druntime_sum -= delta;
+   check_druntime_sum(rq, rq->druntime_sum);
+}
+
 /* Updates druntime for a task */
 static inline void
 update_hmp_stat(struct cfs_rq *cfs_rq, struct sched_entity *curr,
@@ -861,7 +878,9 @@ static void update_curr(struct cfs_rq *cfs_rq)
 
account_cfs_rq_runtime(cfs_rq, delta_exec);
 
+#ifdef CONFIG_HPERF_HMP
update_hmp_stat(cfs_rq, curr, delta_exec);
+#endif
 }
 
 static void update_curr_fair(struct rq *rq)
@@ -4200,6 +4219,66 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_HPERF_HMP
+#ifdef CONFIG_HPERF_HMP_DEBUG
+static void check_nr_hmp_tasks(struct rq *rq)
+{
+   if (rq->nr_hmp_tasks > rq->cfs.h_nr_running) {
+   pr_emerg("HMP BUG: rq->nr_hmp_tasks = %u, "
+"rq->cfs.h_nr_running = %u\n", rq->nr_hmp_tasks,
+rq->cfs.h_nr_running);
+   BUG();
+   }
+}
+#else
+static void check_nr_hmp_tasks(struct rq *rq) { }
+#endif
+
+static void nr_hmp_tasks_inc(struct rq *rq)
+{
+   if (!rq->nr_hmp_tasks) {
+   if (cpu_is_fastest(rq->cpu))
+   atomic_inc(&a15_nr_hmp_busy);
+   else
+   atomic_inc(&a7_nr_hmp_busy);
+   }
+
+   rq->nr_hmp_tasks++;
+   check_nr_hmp_tasks(rq);
+}
+
+static void nr_hmp_tasks_dec(struct rq *rq)
+{
+   rq->nr_hmp_tasks--;
+
+   if (!rq->nr_hmp_tasks) {
+   if (cpu_is_fastest(rq->cpu))
+   atomic_dec(&a15_nr_hmp_busy);
+   else
+   atomic_dec(&a7_nr_hmp_busy);
+   }
+   check_nr_hmp_tasks(rq);
+}
+
+static void
+set_cpus_allowed_hmp(struct task_struct *p, const struct cpumask *new_mask)
+{
+   bool is_hmp_before, is_hmp_after;
+
+   cpumask_copy(&p->cpus_allowed, new_mask);
+   p->nr_cpus_allowed = cpumask_weight(new_mask);
+   is_hmp_before = is_task_hmp(p, NULL);
+   is_hmp_after  = is_task_hmp(p, new_mask);
+
+   if (!p->on_cpu && p->se.on_rq && (is_hmp_before != is_hmp_after)) {
+   if (is_hmp_after)
+   nr_hmp_tasks_inc(rq_of(cfs_rq_of(&p->se)));
+   else
+   nr_hmp_tasks_dec(rq_of(cfs_rq_of(&p->se)));
+   }
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -4241,8 +4320,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, 
int flags)
update_cfs_shares(cfs_rq);
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, 1);
+#ifdef CONFIG_HPERF_HMP
+   if (is_task_hmp(p, NULL))
+   nr_hmp_tasks_inc(rq);
+
+   if (cpu_is_fastest(rq->cpu)) {
+   atomic_long_add(p->se.load.weight, _total_weight);
+   if (p->se.druntime < 0)
+   add_druntime_sum(rq, p->se.druntime);
+   } else {
+   atomic_long_add(p->se.load.weight, _total_weight);
+   if (p->se.druntime > 0)
+   add_druntime_sum(rq, p->se.druntime);
+   }
+   hmp_calculate_imbalance();
+#endif
+   }
 
hrtick_update(rq);
 }
@@ -4301,8 +4396,30 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
 
-   if (!se)
+   if (!se) {
sub_nr_running(rq, 1);
+#ifdef CONFIG_HPERF_HMP
+  

[PATCH 11/13] hperf_hmp: task CPU selection logic.

2015-11-06 Thread Arseniy Krasnov
Adds new runqueue selection logic. If task is newly woken(fork or exec)
or it is not WF_SYNC wakeup, idlest CPU from both clusters is selected. Else,
default wake up logic is used('want_affine'). If it fails, idlest CPU from both
clusters is selected.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c | 132 
 1 file changed, 101 insertions(+), 31 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd16729..79be023 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4798,6 +4798,62 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p, int sync)
return 1;
 }
 
+#ifdef CONFIG_HPERF_HMP
+/**
+ * hmp_select_task_rq_fair(): selects cpu for task.
+ * @p: task which needs cpu
+ *
+ * Returns cpu for task.
+ *
+ * Selects idlest cpu for task @p.
+ */
+static int
+hmp_select_task_rq_fair(struct task_struct *p)
+{
+   int cpu;
+   int new_cpu;
+   unsigned long load;
+   unsigned long scaled_load;
+
+   new_cpu = task_cpu(p);
+
+   load = ULONG_MAX;
+   /* First check primary cpus */
+   for_each_cpu_and(cpu, cpu_online_mask, cpu_fastest_mask) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   /* Select idle cpu if it exists */
+   if (idle_cpu(cpu))
+   return cpu;
+   /* Otherwise select the least loaded cpu */
+   scaled_load = (weighted_cpuload(cpu) *
+  SCHED_CAPACITY_SCALE) /
+  freq_scale_cpu_power[cpu];
+   if (scaled_load < load) {
+   new_cpu = cpu;
+   load = scaled_load;
+   }
+   }
+   }
+
+   /* Then check secondary cpus */
+   for_each_cpu_and(cpu, cpu_online_mask, cpu_slowest_mask) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   if (idle_cpu(cpu))
+   return cpu;
+   scaled_load = (weighted_cpuload(cpu) *
+  SCHED_CAPACITY_SCALE) /
+  freq_scale_cpu_power[cpu];
+   if (scaled_load < load) {
+   new_cpu = cpu;
+   load = scaled_load;
+   }
+   }
+   }
+
+   return new_cpu;
+}
+
+#else /* CONFIG_HPERF_HMP */
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -4905,6 +4961,7 @@ find_idlest_cpu(struct sched_group *group, struct 
task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : 
least_loaded_cpu;
 }
 
+#endif /* CONFIG_HPERF_HMP */
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
@@ -4998,6 +5055,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, 
int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
 
+#ifdef CONFIG_HPERF_HMP
+   if (!(sd_flag & SD_BALANCE_WAKE) || !sync)
+   return hmp_select_task_rq_fair(p);
+#endif
+
if (sd_flag & SD_BALANCE_WAKE)
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, 
tsk_cpus_allowed(p));
 
@@ -5030,41 +5092,49 @@ select_task_rq_fair(struct task_struct *p, int 
prev_cpu, int sd_flag, int wake_f
 
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-   new_cpu = select_idle_sibling(p, new_cpu);
-
-   } else while (sd) {
-   struct sched_group *group;
-   int weight;
+   if (IS_ENABLED(CONFIG_HPERF_HMP) && sync)
+   new_cpu = prev_cpu;
+   else
+   new_cpu = select_idle_sibling(p, prev_cpu);
+   } else {
+#ifdef CONFIG_HPERF_HMP
+   new_cpu = hmp_select_task_rq_fair(p);
+#else
+   while (sd) {
+   struct sched_group *group;
+   int weight;
 
-   if (!(sd->flags & sd_flag)) {
-   sd = sd->child;
-   continue;
-   }
+   if (!(sd->flags & sd_flag)) {
+   sd = sd->child;
+   continue;
+   }
 
-   group = find_idlest_group(sd, p, cpu, sd_flag);
-   if (!group) {
-   sd = sd->child;
-   continue;
-   }
+   group = find_idlest_group(sd, p, cpu, sd_flag);
+  

[PATCH 13/13] hperf_hmp: cpufreq routines.

2015-11-06 Thread Arseniy Krasnov
Adds CPU frequency change notifier in fair scheduling class. Every time
when governor changes frequency, it calls callback from this patch. Frequency of
each CPU is used for imbalance calculation.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c | 76 +
 1 file changed, 76 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 06f6518..87dc0db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -33,6 +33,10 @@
 
 #include 
 
+#ifdef CONFIG_HPERF_HMP
+#include 
+#endif
+
 #include "sched.h"
 
 /*
@@ -101,6 +105,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 
50UL;
 unsigned int __read_mostly sysctl_sched_shares_window = 1000UL;
 
 #ifdef CONFIG_HPERF_HMP
+/*
+ * Log level of hperf_hmp messages. Bigger means more messages.
+ * Maximum level is 3.
+ */
+unsigned int sysctl_sched_hperf_hmp_log_level;
 extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
 static atomic_t a15_nr_hmp_busy = ATOMIC_INIT(0);
 static atomic_t a7_nr_hmp_busy = ATOMIC_INIT(0);
@@ -7229,6 +7238,73 @@ static int should_we_balance(struct lb_env *env)
return balance_cpu == env->dst_cpu;
 }
 #ifdef CONFIG_HPERF_HMP
+static void hperf_hmp_vprint(unsigned int log_level, const char *format,
+ va_list ap)
+{
+   if (sysctl_sched_hperf_hmp_log_level < log_level)
+   return;
+   vprintk(format, ap);
+}
+
+static void hperf_hmp_print(unsigned int log_level, const char *format, ...)
+{
+   va_list ap;
+
+   va_start(ap, format);
+   hperf_hmp_vprint(log_level, format, ap);
+   va_end(ap);
+}
+
+/* Called when frequency is changed */
+static int hmp_cpufreq_callback(struct notifier_block *nb,
+   unsigned long event, void *data)
+{
+   struct cpufreq_freqs *new_freq = data;
+
+   /* recount power only after change of frequency */
+   if (event != CPUFREQ_POSTCHANGE)
+   return NOTIFY_DONE;
+
+   if (!new_freq)
+   return NOTIFY_DONE;
+
+   freq_scale_cpu_power[new_freq->cpu] = (new_freq->new >> 10);
+
+   /* Apply slowdown coefficient of 1.9 for A7 CPUs */
+   if (!cpu_is_fastest(new_freq->cpu)) {
+   freq_scale_cpu_power[new_freq->cpu] *= 10;
+   freq_scale_cpu_power[new_freq->cpu] /= 19;
+   }
+
+   hperf_hmp_print(2, KERN_INFO "hperf_hmp: CPU#%i new frequency is: %u 
MHz\n",
+new_freq->cpu, new_freq->new / 1000);
+
+   return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_notifier = {
+   .notifier_call = hmp_cpufreq_callback
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+   int err = 0;
+   int cpu;
+
+   for_each_online_cpu(cpu)
+   freq_scale_cpu_power[cpu] = capacity_of(cpu);
+
+   err = cpufreq_register_notifier(&cpufreq_notifier,
+   CPUFREQ_TRANSITION_NOTIFIER);
+   if (!err)
+   pr_info("hperf_hmp: registered cpufreq transition notifier\n");
+   else
+   pr_info("hperf_hmp: failed to register cpufreq notifier!\n");
+
+   return err;
+}
+core_initcall(register_sched_cpufreq_notifier);
+
 /**
  * is_hmp_imbalance(): Calculates imbalance between HMP domains.
  * @sd: Current sched domain.
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 09/13] hperf_hmp: one way balancing function.

2015-11-06 Thread Arseniy Krasnov
Almost identical functions which push/pull task from/to current CPU
to/from another cluster. Called when balancing between clusters is broken and we
need to fix it.

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 kernel/sched/fair.c | 254 
 1 file changed, 254 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 028d329..4fda1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7519,6 +7519,260 @@ unlock:
 
return ld_moved;
 }
+
+/* Get idlest cpu from opposite domain of this_cpu */
+static int get_idlest_cpu(struct sched_domain *sd, int this_cpu)
+{
+   struct sched_group *opposite_sg;
+   struct cpumask *opposite_mask;
+   unsigned long load = ULONG_MAX;
+   int idlest_cpu = -1;
+   int cpu;
+
+   opposite_sg = get_opposite_group(sd, cpu_is_fastest(this_cpu));
+   opposite_mask = sched_group_cpus(opposite_sg);
+
+   for_each_cpu_and(cpu, opposite_mask, cpu_online_mask) {
+   if (cpu_rq(cpu)->load.weight < load) {
+   load = cpu_rq(cpu)->load.weight;
+   idlest_cpu = cpu;
+   }
+   }
+   return idlest_cpu;
+}
+
+/**
+ * move_a15_to_a7(): Moves one task from A15 to A7.
+ * @sd: Current sched domain.
+ * @this_cpu: without NO_HZ same as smp_processor_id().
+ *
+ * Returns moved weight.
+ *
+ * Chooses task to migrate by druntime.
+ */
+static unsigned int move_a15_to_a7(struct sched_domain *sd, int this_cpu)
+{
+   struct task_struct *task_to_move;
+   struct rq *local_rq = NULL;
+   struct rq *foreign_rq = NULL;
+   int local_stopper_flag = 0;
+   int foreign_stopper_flag = 0;
+   unsigned long local_flags;
+   unsigned int ld_moved = 0;
+
+   local_rq = cpu_rq(this_cpu);
+   local_irq_save(local_flags);
+
+   if (!cpu_is_fastest(this_cpu)) {
+   /* this A7 pulls task from A15 */
+   foreign_rq = get_unfair_rq(sd, this_cpu);
+
+   if (!foreign_rq) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+
+   double_lock_balance(foreign_rq, local_rq);
+
+   if (foreign_rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   if (foreign_rq->cfs.h_nr_running <= 1)
+   goto unlock;
+
+   task_to_move = get_migration_candidate(sd, foreign_rq, 0,
+  this_cpu);
+
+   if (!task_to_move)
+   goto unlock;
+
+   ld_moved = try_to_move_task(task_to_move, this_cpu,
+   &foreign_stopper_flag);
+
+   if (!ld_moved) {
+   task_to_move->se.migrate_candidate = 0;
+   goto unlock;
+   }
+
+   if (foreign_stopper_flag) {
+   foreign_rq->active_balance = 1;
+   foreign_rq->push_cpu = this_cpu;
+   foreign_rq->migrate_task = task_to_move;
+   }
+   } else {
+   /* this A15 push task to A7 */
+   int dst_cpu = get_idlest_cpu(sd, this_cpu);
+
+   if (dst_cpu == -1) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+
+   foreign_rq = cpu_rq(dst_cpu);
+   raw_spin_lock(&foreign_rq->lock);
+   double_lock_balance(foreign_rq, local_rq);
+
+   if (local_rq->cfs.h_nr_running <= 1)
+   goto unlock;
+
+   if (foreign_rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   task_to_move = get_migration_candidate(sd, local_rq, 0,
+  foreign_rq->cpu);
+
+   if (!task_to_move)
+   goto unlock;
+
+   ld_moved = try_to_move_task(task_to_move, dst_cpu,
+   &local_stopper_flag);
+
+   if (!ld_moved) {
+   task_to_move->se.migrate_candidate = 0;
+   goto unlock;
+   }
+
+   if (local_stopper_flag) {
+   local_rq->active_balance = 1;
+   local_rq->push_cpu = dst_cpu;
+   local_rq->migrate_task = task_to_move;
+   }
+   }
+unlock:
+   double_rq_unlock(local_rq, foreign_rq);
+   local_irq_restore(local_flags);
+
+   if (foreign_st

[PATCH 01/13] hperf_hmp: add new config for arm and arm64.

2015-11-06 Thread Arseniy Krasnov
New config option which enables new scheduler logic: HPERF_HMP. Also
adds the following options:
'HPERF_HMP_DEBUG': enables extra runtime checks of balancing parameters.
'HMP_FAST_CPU_MASK': CPU mask of A15 cluster(in hex string).
'HMP_SLOW_CPU_MASK': CPU mask of A7 cluster(in hex string).

Signed-off-by: Tarek Dakhran 
Signed-off-by: Sergey Dyasly 
Signed-off-by: Dmitriy Safonov 
Signed-off-by: Arseniy Krasnov 
Signed-off-by: Ilya Maximets 
---
 arch/arm/Kconfig   | 21 +
 arch/arm64/Kconfig | 21 +
 2 files changed, 42 insertions(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 72ad724..0581914 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1387,6 +1387,27 @@ config SCHED_MC
  making when dealing with multi-core CPU chips at a cost of slightly
  increased overhead in some places. If unsure say N here.
 
+config HPERF_HMP
+   bool "HPERF_HMP load balancing enhancements for ARM big.LITTLE"
+   select SCHED_MC
+   help
+ Uses HPERF_HMP load balancing algorithm between A7 and A15 CPU 
domains.
+
+config HPERF_HMP_DEBUG
+   bool "Additional HPERF_HMP runtime debug checks"
+   depends on HPERF_HMP
+   default n
+
+config HMP_FAST_CPU_MASK
+   string "Fast (Cortex-A15) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
+config HMP_SLOW_CPU_MASK
+   string "Slow (Cortex-A7) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
 config SCHED_SMT
bool "SMT scheduler support"
depends on ARM_CPU_TOPOLOGY
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 07d1811..71a8983 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -412,6 +412,27 @@ config SCHED_MC
  making when dealing with multi-core CPU chips at a cost of slightly
  increased overhead in some places. If unsure say N here.
 
+config HPERF_HMP
+   bool "HPERF_HMP load balancing enhancements for ARM big.LITTLE"
+   select SCHED_MC
+   help
+ Uses HPERF_HMP load balancing algorithm between A7 and A15 CPU 
domains.
+
+config HPERF_HMP_DEBUG
+   bool "Additional HPERF_HMP runtime debug checks"
+   depends on HPERF_HMP
+   default n
+
+config HMP_FAST_CPU_MASK
+   string "Fast (Cortex-A15) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
+config HMP_SLOW_CPU_MASK
+   string "Slow (Cortex-A7) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
 config SCHED_SMT
bool "SMT scheduler support"
help
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 00/13] High performance balancing logic for big.LITTLE

2015-11-06 Thread Arseniy Krasnov
26  63,179  163,915
27  64,987  167,559
28  67,329  171,203
29  70,489  185,171
30  73,084  189,303
31  75,264  192,487
32  77,015  197,27
avg 40,373  87,543

Bodytrack:

This computer vision application is an Intel RMS workload which tracks a human
body with multiple cameras through an image sequence. This benchmark was
included due to the increasing significance of computer vision algorithms in
areas such as video surveillance, character animation and computer interfaces.

Threads HPERF_HMP   Linaro MP
1   15,884  16,632
2   8,536   9,42
3   6,037   7,257
4   4,846,076
5   8,835   5,739
6   4,437   5,513
7   4,119   5,474
8   3,992   5,115
9   3,854   5,164
10  3,924,911
11  3,854   4,932
12  3,834,816
13  3,839   5,643
14  3,861   4,816
15  3,889   4,896
16  3,845   4,854
17  3,872   4,837
18  3,852   4,876
19  4,304   4,868
20  3,915   4,928
21  3,874,841
22  3,858   4,995
23  3,881   4,97
24  3,876   4,899
25  3,854   4,96
26  3,869   4,902
27  3,874   4,979
28  3,884,928
29  3,914   5,008
30  3,889   5,216
31  3,898   5,242
32  3,894   5,199
avg 4,689   5,653

Blackscholes:

This application is an Intel RMS benchmark. It calculates the prices for a
portfolio of European options analytically with the Black-Scholes partial
differential equation. There is no closed-form expression for the blackscholes
equation and as such it must be computed numerically.

Threads HPERF_HMP   Linaro MP
1   7,293   6,807
2   3,886   4,044
3   2,906   2,911
4   2,429   2,427
5   2,582,985
6   2,401   2,672
7   2,205   2,411
8   2,132   2,293
9   2,074   2,41
10  2,067   2,264
11  2,054   2,205
12  2,091   2,222
13  2,042   2,28
14  2,035   2,222
15  2,026   2,25
16  2,024   2,177
17  2,021   2,173
18  2,033   2,09
19  2,032,05
20  2,024   2,158
21  2,002   2,175
22  2,026   2,179
23  2,017   2,134
24  2,012,156
25  2,009   2,155
26  2,013   2,179
27  2,017   2,177
28  2,019   2,189
29  2,013   2,158
30  2,002   2,162
31  2,016   2,16
32  2,012   2,159
avg 2,328   2,469

Also, well known Antutu benchmark was executed on Exynos 5433 board:

HPERF_HMP   Linaro MP
Integral benchmark result   42400   36860 
Result: hperf_hmp is 15% better.


Arseniy Krasnov (13):
  hperf_hmp: add new config for arm and arm64.
  hperf_hmp: introduce new domain flag.
  hperf_hmp: add sched domains initialization.
  hperf_hmp: scheduler initialization routines.
  hperf_hmp: introduce druntime metric.
  hperf_hmp: is_hmp_imbalance introduced.
  hperf_hmp: migration auxiliary functions.
  hperf_hmp: swap tasks function.
  hperf_hmp: one way balancing function.
  hperf_hmp: idle pull function.
  hperf_hmp: task CPU selection logic.
  hperf_hmp: rest of logic.
  hperf_hmp: cpufreq routines.

 arch/arm/Kconfig   |   21 +
 arch/arm/kernel/topology.c |6 +-
 arch/arm64/Kconfig

[PATCH 05/13] hperf_hmp: introduce druntime metric.

2015-11-06 Thread Arseniy Krasnov
This patch adds special per-task metric to look for candidate for
migration between HMP domains(clusters). 'druntime' grows up when task runs on
A7 cluster, and goes down on A15 cluster. Also druntime is scaled according load
on little cluster in order to align its value with big cluster's total druntime.
For migration from big/little to little/big cluster task with lowest/highest
'druntime' chosen. 'druntime' is used to execute each task on each cluster
approximately same amount of time. 'druntime' is calculated each call of default
'update_curr' function.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 include/linux/sched.h |   3 ++
 kernel/sched/core.c   |   3 ++
 kernel/sched/fair.c   | 115 ++
 3 files changed, 121 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa72125..89c1bf3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1257,6 +1257,9 @@ struct sched_entity {
struct list_headgroup_node;
unsigned inton_rq;
 
+#ifdef CONFIG_HPERF_HMP
+   longdruntime;
+#endif
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8747e06..6883a00 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2085,6 +2085,9 @@ static void __sched_fork(unsigned long clone_flags, 
struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime  = 0;
+#ifdef CONFIG_HPERF_HMP
+   p->se.druntime  = 0;
+#endif
	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c57007f..e94fab4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -102,6 +102,10 @@ unsigned int __read_mostly sysctl_sched_shares_window = 
1000UL;
 
 #ifdef CONFIG_HPERF_HMP
 extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
+static atomic_t a15_nr_hmp_busy = ATOMIC_INIT(0);
+static atomic_t a7_nr_hmp_busy = ATOMIC_INIT(0);
+static atomic_t hmp_imbalance = ATOMIC_INIT(0);
+
 static unsigned int freq_scale_cpu_power[CONFIG_NR_CPUS];
 #endif /* CONFIG_HPERF_HMP */
 
@@ -660,6 +664,115 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+#ifdef CONFIG_HPERF_HMP
+static bool
+is_task_hmp(struct task_struct *task, const struct cpumask *task_cpus)
+{
+   if (!task_cpus)
+   task_cpus = tsk_cpus_allowed(task);
+
+   /*
+* Check if a task has cpus_allowed only for one CPU domain (A15 or A7)
+*/
+   return !(cpumask_intersects(task_cpus, cpu_fastest_mask) ^
+cpumask_intersects(task_cpus, cpu_slowest_mask));
+}
+
+#ifdef CONFIG_HPERF_HMP_DEBUG
+static inline void check_druntime_sum(struct rq *rq, long druntime_sum)
+{
+   BUG_ON(rq->cfs.h_nr_running == 0 && druntime_sum != 0);
+
+   if (cpu_is_fastest(rq->cpu))
+   BUG_ON(druntime_sum > 0);
+   else
+   BUG_ON(druntime_sum < 0);
+}
+#else
+static inline void check_druntime_sum(struct rq *rq, long druntime_sum)
+{
+}
+#endif
+
+static inline void add_druntime_sum(struct rq *rq, long delta)
+{
+   rq->druntime_sum += delta;
+   check_druntime_sum(rq, rq->druntime_sum);
+}
+/* Updates druntime for a task */
+static inline void
+update_hmp_stat(struct cfs_rq *cfs_rq, struct sched_entity *curr,
+   unsigned long delta_exec)
+{
+   long to_add;
+   unsigned int hmp_fairness_threshold = 240;
+   struct rq *rq = rq_of(cfs_rq);
+   int a7_nr_hmp_busy_tmp;
+
+   if (atomic_read(&hmp_imbalance) == 0)
+   return;
+
+   if (!curr->on_rq)
+   return;
+
+   if (!entity_is_task(curr))
+   return;
+
+   if (!task_of(curr)->on_rq)
+   return;
+
+   if (!cfs_rq->h_nr_running)
+   return;
+
+   if (!is_task_hmp(task_of(curr), NULL))
+   return;
+
+   delta_exec = delta_exec >> 10;
+
+   if (cpu_is_fastest(rq->cpu))
+   to_add = -delta_exec;
+   else
+   to_add = delta_exec;
+
+   to_add -= curr->druntime;
+
+   /* Avoid values with the different sign */
+   if ((cpu_is_fastest(rq->cpu) && to_add >= 0) ||
+   (!cpu_is_fastest(rq->cpu) && to_add <= 0))
+   return;
+
+   to_add /= (long)(2 + 4 * 

[PATCH 04/13] hperf_hmp: scheduler initialization routines.

2015-11-06 Thread Arseniy Krasnov
Adds new fields to 'rq' structure and routine called during fair class
setup, which initializes some HMP scheduler variables: big and little cluster
masks. They are read from kernel config(if set), else default values are used.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/core.c  |  4 
 kernel/sched/fair.c  | 46 ++
 kernel/sched/sched.h | 15 +++
 3 files changed, 65 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3a632f..8747e06 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7488,6 +7488,10 @@ void __init sched_init(void)
 #endif
init_rq_hrtick(rq);
	atomic_set(&rq->nr_iowait, 0);
+#ifdef CONFIG_HPERF_HMP
+   rq->druntime_sum = 0;
+   rq->nr_hmp_tasks = 0;
+#endif
}
 
	set_load_weight(&init_task);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..c57007f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -100,6 +100,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 
50UL;
  */
 unsigned int __read_mostly sysctl_sched_shares_window = 1000UL;
 
+#ifdef CONFIG_HPERF_HMP
+extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
+static unsigned int freq_scale_cpu_power[CONFIG_NR_CPUS];
+#endif /* CONFIG_HPERF_HMP */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
@@ -8305,8 +8310,38 @@ void show_numa_stats(struct task_struct *p, struct 
seq_file *m)
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 
+#ifdef CONFIG_HPERF_HMP
+static unsigned long default_fast_mask = 0x0F;
+static unsigned long default_slow_mask = 0xF0;
+
+void hmp_set_cpu_masks(struct cpumask *fast_mask, struct cpumask *slow_mask)
+{
+   cpumask_clear(fast_mask);
+   cpumask_clear(slow_mask);
+
+   /* try to parse CPU masks from config */
+   if (strlen(CONFIG_HMP_FAST_CPU_MASK) &&
+   strlen(CONFIG_HMP_SLOW_CPU_MASK)) {
+   if (cpumask_parse(CONFIG_HMP_FAST_CPU_MASK, fast_mask) ||
+   cpumask_parse(CONFIG_HMP_SLOW_CPU_MASK, slow_mask))
+   pr_err("hperf_hmp: Failed to get CPU masks from 
config!\n");
+   else
+   return;
+   }
+
+   pr_err("hperf_hmp: Fast mask will be: %08lX, slow mask: %08lX\n",
+  default_fast_mask, default_slow_mask);
+
+   fast_mask->bits[0] = default_fast_mask;
+   slow_mask->bits[0] = default_slow_mask;
+}
+#endif
+
 __init void init_sched_fair_class(void)
 {
+#ifdef CONFIG_HPERF_HMP
+   int cpu;
+#endif
 #ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
@@ -8315,6 +8350,17 @@ __init void init_sched_fair_class(void)
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
 #endif
+
+#ifdef CONFIG_HPERF_HMP
+   for_each_possible_cpu(cpu)
+   freq_scale_cpu_power[cpu] = SCHED_CAPACITY_SCALE;
+   hmp_set_cpu_masks(cpu_fastest_mask, cpu_slowest_mask);
+   pr_info("hperf_hmp: fast CPUs mask: %08X\n",
+   (unsigned int)cpumask_bits(cpu_fastest_mask)[0]);
+   pr_info("hperf_hmp: slow CPUs mask: %08X\n",
+   (unsigned int)cpumask_bits(cpu_slowest_mask)[0]);
+#endif
+
 #endif /* SMP */
 
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119..94828dc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -597,6 +597,11 @@ struct rq {
 */
unsigned long nr_uninterruptible;
 
+#ifdef CONFIG_HPERF_HMP
+   /* shows the amount of accumulated unfairness by tasks of this rq */
+   long druntime_sum;
+   unsigned int nr_hmp_tasks;
+#endif
struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -892,6 +897,16 @@ static inline unsigned int group_first_cpu(struct 
sched_group *group)
 
 extern int group_balance_cpu(struct sched_group *sg);
 
+#ifdef CONFIG_HPERF_HMP
+extern struct cpumask *cpu_fastest_mask;
+extern struct cpumask *cpu_slowest_mask;
+
+static inline bool cpu_is_fastest(int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_fastest_mask);
+}
+#endif
+
 #else
 
 static inline void sched_ttwu_pending(void) { }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 02/13] hperf_hmp: introduce new domain flag.

2015-11-06 Thread Arseniy Krasnov
New scheduler domain type: HMP. Each big.LITTLE cluster is detected by
scheduler as HMP domain. HPERF_HMP logic works between two HMP domains, the
default CFS logic, in turn, works inside the HMP domain.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 arch/arm/kernel/topology.c | 6 +-
 include/linux/sched.h  | 4 
 kernel/sched/core.c| 9 -
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 08b7847..7fcc5fe 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -285,7 +285,11 @@ static struct sched_domain_topology_level arm_topology[] = 
{
{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { cpu_cpu_mask,
+#ifdef CONFIG_HPERF_HMP
+.flags = SD_HMP_BALANCE,
+#endif
+SD_INIT_NAME(DIE)},
{ NULL, },
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7b9501..eb084df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -990,6 +990,10 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_OVERLAP 0x2000  /* sched_domains of this level overlap 
*/
 #define SD_NUMA0x4000  /* cross-node balancing */
 
+#ifdef CONFIG_HPERF_HMP
+#define SD_HMP_BALANCE 0x8000  /* Use HMP load balancing algorithm */
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcd214e..16092e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6410,6 +6410,9 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
+#ifdef CONFIG_HPERF_HMP
+   | (tl->flags & SD_HMP_BALANCE)
+#endif
,
 
.last_balance   = jiffies,
@@ -6472,7 +6475,11 @@ static struct sched_domain_topology_level 
default_topology[] = {
 #ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { cpu_cpu_mask,
+#ifdef CONFIG_HPERF_HMP
+.flags = SD_HMP_BALANCE,
+#endif
+SD_INIT_NAME(DIE)},
{ NULL, },
 };
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 00/13] High performance balancing logic for big.LITTLE

2015-11-06 Thread Arseniy Krasnov
26  63,179  163,915
27  64,987  167,559
28  67,329  171,203
29  70,489  185,171
30  73,084  189,303
31  75,264  192,487
32  77,015  197,27
avg 40,373  87,543

Bodytrack:

This computer vision application is an Intel RMS workload which tracks a human
body with multiple cameras through an image sequence. This benchmark was
included due to the increasing significance of computer vision algorithms in
areas such as video surveillance, character animation and computer interfaces.

Threads HPERF_HMP   Linaro MP
1   15,884  16,632
2   8,536   9,42
3   6,037   7,257
4   4,84    6,076
5   8,835   5,739
6   4,437   5,513
7   4,119   5,474
8   3,992   5,115
9   3,854   5,164
10  3,92    4,911
11  3,854   4,932
12  3,83    4,816
13  3,839   5,643
14  3,861   4,816
15  3,889   4,896
16  3,845   4,854
17  3,872   4,837
18  3,852   4,876
19  4,304   4,868
20  3,915   4,928
21  3,87    4,841
22  3,858   4,995
23  3,881   4,97
24  3,876   4,899
25  3,854   4,96
26  3,869   4,902
27  3,874   4,979
28  3,88    4,928
29  3,914   5,008
30  3,889   5,216
31  3,898   5,242
32  3,894   5,199
avg 4,689   5,653

Blackscholes:

This application is an Intel RMS benchmark. It calculates the prices for a
portfolio of European options analytically with the Black-Scholes partial
differential equation. There is no closed-form expression for the blackscholes
equation and as such it must be computed numerically.

Threads HPERF_HMP   Linaro MP
1   7,293   6,807
2   3,886   4,044
3   2,906   2,911
4   2,429   2,427
5   2,58    2,985
6   2,401   2,672
7   2,205   2,411
8   2,132   2,293
9   2,074   2,41
10  2,067   2,264
11  2,054   2,205
12  2,091   2,222
13  2,042   2,28
14  2,035   2,222
15  2,026   2,25
16  2,024   2,177
17  2,021   2,173
18  2,033   2,09
19  2,03    2,05
20  2,024   2,158
21  2,002   2,175
22  2,026   2,179
23  2,017   2,134
24  2,01    2,156
25  2,009   2,155
26  2,013   2,179
27  2,017   2,177
28  2,019   2,189
29  2,013   2,158
30  2,002   2,162
31  2,016   2,16
32  2,012   2,159
avg 2,328   2,469

Also, well known Antutu benchmark was executed on Exynos 5433 board:

HPERF_HMP   Linaro MP
Integral benchmark result   42400   36860 
Result: hperf_hmp is 15% better.


Arseniy Krasnov (13):
  hperf_hmp: add new config for arm and arm64.
  hperf_hmp: introduce new domain flag.
  hperf_hmp: add sched domains initialization.
  hperf_hmp: scheduler initialization routines.
  hperf_hmp: introduce druntime metric.
  hperf_hmp: is_hmp_imbalance introduced.
  hperf_hmp: migration auxiliary functions.
  hperf_hmp: swap tasks function.
  hperf_hmp: one way balancing function.
  hperf_hmp: idle pull function.
  hperf_hmp: task CPU selection logic.
  hperf_hmp: rest of logic.
  hperf_hmp: cpufreq routines.

 arch/arm/Kconfig   |   21 +
 arch/arm/kernel/topology.c |6 +-
 arch/arm64/Kconfig

[PATCH 09/13] hperf_hmp: one way balancing function.

2015-11-06 Thread Arseniy Krasnov
Almost identical functions which push/pull a task from/to the current CPU
to/from the other cluster. They are called when balancing between the clusters
is broken and we need to fix it.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c | 254 
 1 file changed, 254 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 028d329..4fda1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7519,6 +7519,260 @@ unlock:
 
return ld_moved;
 }
+
+/* Get idlest cpu from opposite domain of this_cpu */
+static int get_idlest_cpu(struct sched_domain *sd, int this_cpu)
+{
+   struct sched_group *opposite_sg;
+   struct cpumask *opposite_mask;
+   unsigned long load = ULONG_MAX;
+   int idlest_cpu = -1;
+   int cpu;
+
+   opposite_sg = get_opposite_group(sd, cpu_is_fastest(this_cpu));
+   opposite_mask = sched_group_cpus(opposite_sg);
+
+   for_each_cpu_and(cpu, opposite_mask, cpu_online_mask) {
+   if (cpu_rq(cpu)->load.weight < load) {
+   load = cpu_rq(cpu)->load.weight;
+   idlest_cpu = cpu;
+   }
+   }
+   return idlest_cpu;
+}
+
+/**
+ * move_a15_to_a7(): Moves one task from A15 to A7.
+ * @sd: Current sched domain.
+ * @this_cpu: without NO_HZ same as smp_processor_id().
+ *
+ * Returns moved weight.
+ *
+ * Chooses task to migrate by druntime.
+ */
+static unsigned int move_a15_to_a7(struct sched_domain *sd, int this_cpu)
+{
+   struct task_struct *task_to_move;
+   struct rq *local_rq = NULL;
+   struct rq *foreign_rq = NULL;
+   int local_stopper_flag = 0;
+   int foreign_stopper_flag = 0;
+   unsigned long local_flags;
+   unsigned int ld_moved = 0;
+
+   local_rq = cpu_rq(this_cpu);
+   local_irq_save(local_flags);
+
+   if (!cpu_is_fastest(this_cpu)) {
+   /* this A7 pulls task from A15 */
+   foreign_rq = get_unfair_rq(sd, this_cpu);
+
+   if (!foreign_rq) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+
+   double_lock_balance(foreign_rq, local_rq);
+
+   if (foreign_rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   if (foreign_rq->cfs.h_nr_running <= 1)
+   goto unlock;
+
+   task_to_move = get_migration_candidate(sd, foreign_rq, 0,
+  this_cpu);
+
+   if (!task_to_move)
+   goto unlock;
+
+   ld_moved = try_to_move_task(task_to_move, this_cpu,
+   &foreign_stopper_flag);
+
+   if (!ld_moved) {
+   task_to_move->se.migrate_candidate = 0;
+   goto unlock;
+   }
+
+   if (foreign_stopper_flag) {
+   foreign_rq->active_balance = 1;
+   foreign_rq->push_cpu = this_cpu;
+   foreign_rq->migrate_task = task_to_move;
+   }
+   } else {
+   /* this A15 push task to A7 */
+   int dst_cpu = get_idlest_cpu(sd, this_cpu);
+
+   if (dst_cpu == -1) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+
+   foreign_rq = cpu_rq(dst_cpu);
+   raw_spin_lock(&foreign_rq->lock);
+   double_lock_balance(foreign_rq, local_rq);
+
+   if (local_rq->cfs.h_nr_running <= 1)
+   goto unlock;
+
+   if (foreign_rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   task_to_move = get_migration_candidate(sd, local_rq, 0,
+  foreign_rq->cpu);
+
+   if (!task_to_move)
+   goto unlock;
+
+   ld_moved = try_to_move_task(task_to_move, dst_cpu,
+   &local_stopper_flag);
+
+   if (!ld_moved) {
+   task_to_move->se.migrate_candidate = 0;
+   goto unlock;
+   }
+
+   if (local_stopper_flag) {
+   local_rq->active_balance = 1;
+   local_rq->push_cpu = dst_cpu;
+   local_rq->migrate_task = task_to_move;
+   }

[PATCH 01/13] hperf_hmp: add new config for arm and arm64.

2015-11-06 Thread Arseniy Krasnov
New config option which enables new scheduler logic: HPERF_HMP. Also
adds the following options:
'HPERF_HMP_DEBUG': enables extra runtime checks of balancing parameters.
'HMP_FAST_CPU_MASK': CPU mask of A15 cluster(in hex string).
'HMP_SLOW_CPU_MASK': CPU mask of A7 cluster(in hex string).

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 arch/arm/Kconfig   | 21 +
 arch/arm64/Kconfig | 21 +
 2 files changed, 42 insertions(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 72ad724..0581914 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1387,6 +1387,27 @@ config SCHED_MC
  making when dealing with multi-core CPU chips at a cost of slightly
  increased overhead in some places. If unsure say N here.
 
+config HPERF_HMP
+   bool "HPERF_HMP load balancing enhancements for ARM big.LITTLE"
+   select SCHED_MC
+   help
+ Uses HPERF_HMP load balancing algorithm between A7 and A15 CPU 
domains.
+
+config HPERF_HMP_DEBUG
+   bool "Additional HPERF_HMP runtime debug checks"
+   depends on HPERF_HMP
+   default n
+
+config HMP_FAST_CPU_MASK
+   string "Fast (Cortex-A15) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
+config HMP_SLOW_CPU_MASK
+   string "Slow (Cortex-A7) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
 config SCHED_SMT
bool "SMT scheduler support"
depends on ARM_CPU_TOPOLOGY
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 07d1811..71a8983 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -412,6 +412,27 @@ config SCHED_MC
  making when dealing with multi-core CPU chips at a cost of slightly
  increased overhead in some places. If unsure say N here.
 
+config HPERF_HMP
+   bool "HPERF_HMP load balancing enhancements for ARM big.LITTLE"
+   select SCHED_MC
+   help
+ Uses HPERF_HMP load balancing algorithm between A7 and A15 CPU 
domains.
+
+config HPERF_HMP_DEBUG
+   bool "Additional HPERF_HMP runtime debug checks"
+   depends on HPERF_HMP
+   default n
+
+config HMP_FAST_CPU_MASK
+   string "Fast (Cortex-A15) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
+config HMP_SLOW_CPU_MASK
+   string "Slow (Cortex-A7) CPU mask for HPERF_HMP"
+   default ""
+   depends on HPERF_HMP
+
 config SCHED_SMT
bool "SMT scheduler support"
help
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 11/13] hperf_hmp: task CPU selection logic.

2015-11-06 Thread Arseniy Krasnov
Adds new runqueue selection logic. If task is newly woken(fork or exec)
or it is not WF_SYNC wakeup, idlest CPU from both clusters is selected. Else,
default wake up logic is used('want_affine'). If it fails, idlest CPU from both
clusters is selected.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c | 132 
 1 file changed, 101 insertions(+), 31 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fd16729..79be023 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4798,6 +4798,62 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p, int sync)
return 1;
 }
 
+#ifdef CONFIG_HPERF_HMP
+/**
+ * hmp_select_task_rq_fair(): selects cpu for task.
+ * @p: task which needs cpu
+ *
+ * Returns cpu for task.
+ *
+ * Selects idlest cpu for task @p.
+ */
+static int
+hmp_select_task_rq_fair(struct task_struct *p)
+{
+   int cpu;
+   int new_cpu;
+   unsigned long load;
+   unsigned long scaled_load;
+
+   new_cpu = task_cpu(p);
+
+   load = ULONG_MAX;
+   /* First check primary cpus */
+   for_each_cpu_and(cpu, cpu_online_mask, cpu_fastest_mask) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   /* Select idle cpu if it exists */
+   if (idle_cpu(cpu))
+   return cpu;
+   /* Otherwise select the least loaded cpu */
+   scaled_load = (weighted_cpuload(cpu) *
+  SCHED_CAPACITY_SCALE) /
+  freq_scale_cpu_power[cpu];
+   if (scaled_load < load) {
+   new_cpu = cpu;
+   load = scaled_load;
+   }
+   }
+   }
+
+   /* Then check secondary cpus */
+   for_each_cpu_and(cpu, cpu_online_mask, cpu_slowest_mask) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   if (idle_cpu(cpu))
+   return cpu;
+   scaled_load = (weighted_cpuload(cpu) *
+  SCHED_CAPACITY_SCALE) /
+  freq_scale_cpu_power[cpu];
+   if (scaled_load < load) {
+   new_cpu = cpu;
+   load = scaled_load;
+   }
+   }
+   }
+
+   return new_cpu;
+}
+
+#else /* CONFIG_HPERF_HMP */
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -4905,6 +4961,7 @@ find_idlest_cpu(struct sched_group *group, struct 
task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : 
least_loaded_cpu;
 }
 
+#endif /* CONFIG_HPERF_HMP */
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
@@ -4998,6 +5055,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, 
int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
 
+#ifdef CONFIG_HPERF_HMP
+   if (!(sd_flag & SD_BALANCE_WAKE) || !sync)
+   return hmp_select_task_rq_fair(p);
+#endif
+
if (sd_flag & SD_BALANCE_WAKE)
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, 
tsk_cpus_allowed(p));
 
@@ -5030,41 +5092,49 @@ select_task_rq_fair(struct task_struct *p, int 
prev_cpu, int sd_flag, int wake_f
 
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
-   new_cpu = select_idle_sibling(p, new_cpu);
-
-   } else while (sd) {
-   struct sched_group *group;
-   int weight;
+   if (IS_ENABLED(CONFIG_HPERF_HMP) && sync)
+   new_cpu = prev_cpu;
+   else
+   new_cpu = select_idle_sibling(p, prev_cpu);
+   } else {
+#ifdef CONFIG_HPERF_HMP
+   new_cpu = hmp_select_task_rq_fair(p);
+#else
+   while (sd) {
+   struct sched_group *group;
+   int weight;
 
-   if (!(sd->flags & sd_flag)) {
-   sd = sd->child;
-   continue;
-   }
+   if (!(sd->flags & sd_flag)) {
+   sd = sd->child;
+   continue;
+   }
 
-   group = find_idlest_group(sd, p, cpu, sd_flag);
- 

[PATCH 13/13] hperf_hmp: cpufreq routines.

2015-11-06 Thread Arseniy Krasnov
Adds CPU frequency change notifier in fair scheduling class. Every time
when governor changes frequency, it calls callback from this patch. Frequency of
each CPU is used for imbalance calculation.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c | 76 +
 1 file changed, 76 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 06f6518..87dc0db 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -33,6 +33,10 @@
 
 #include 
 
+#ifdef CONFIG_HPERF_HMP
+#include 
+#endif
+
 #include "sched.h"
 
 /*
@@ -101,6 +105,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 
50UL;
 unsigned int __read_mostly sysctl_sched_shares_window = 1000UL;
 
 #ifdef CONFIG_HPERF_HMP
+/*
+ * Log level of hperf_hmp messages. Bigger means more messages.
+ * Maximum level is 3.
+ */
+unsigned int sysctl_sched_hperf_hmp_log_level;
 extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
 static atomic_t a15_nr_hmp_busy = ATOMIC_INIT(0);
 static atomic_t a7_nr_hmp_busy = ATOMIC_INIT(0);
@@ -7229,6 +7238,73 @@ static int should_we_balance(struct lb_env *env)
return balance_cpu == env->dst_cpu;
 }
 #ifdef CONFIG_HPERF_HMP
+static void hperf_hmp_vprint(unsigned int log_level, const char *format,
+ va_list ap)
+{
+   if (sysctl_sched_hperf_hmp_log_level < log_level)
+   return;
+   vprintk(format, ap);
+}
+
+static void hperf_hmp_print(unsigned int log_level, const char *format, ...)
+{
+   va_list ap;
+
+   va_start(ap, format);
+   hperf_hmp_vprint(log_level, format, ap);
+   va_end(ap);
+}
+
+/* Called when frequency is changed */
+static int hmp_cpufreq_callback(struct notifier_block *nb,
+   unsigned long event, void *data)
+{
+   struct cpufreq_freqs *new_freq = data;
+
+   /* recount power only after change of frequency */
+   if (event != CPUFREQ_POSTCHANGE)
+   return NOTIFY_DONE;
+
+   if (!new_freq)
+   return NOTIFY_DONE;
+
+   freq_scale_cpu_power[new_freq->cpu] = (new_freq->new >> 10);
+
+   /* Apply slowdown coefficient of 1.9 for A7 CPUs */
+   if (!cpu_is_fastest(new_freq->cpu)) {
+   freq_scale_cpu_power[new_freq->cpu] *= 10;
+   freq_scale_cpu_power[new_freq->cpu] /= 19;
+   }
+
+   hperf_hmp_print(2, KERN_INFO "hperf_hmp: CPU#%i new frequency is: %u 
MHz\n",
+new_freq->cpu, new_freq->new / 1000);
+
+   return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_notifier = {
+   .notifier_call = hmp_cpufreq_callback
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+   int err = 0;
+   int cpu;
+
+   for_each_online_cpu(cpu)
+   freq_scale_cpu_power[cpu] = capacity_of(cpu);
+
+   err = cpufreq_register_notifier(&cpufreq_notifier,
+   CPUFREQ_TRANSITION_NOTIFIER);
+   if (!err)
+   pr_info("hperf_hmp: registered cpufreq transition notifier\n");
+   else
+   pr_info("hperf_hmp: failed to register cpufreq notifier!\n");
+
+   return err;
+}
+core_initcall(register_sched_cpufreq_notifier);
+
 /**
  * is_hmp_imbalance(): Calculates imbalance between HMP domains.
  * @sd: Current sched domain.
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 12/13] hperf_hmp: rest of logic.

2015-11-06 Thread Arseniy Krasnov
Inserts call to main logic from 'load_balance', balance parameters
calculation during enqueue/dequeue task from runqueue and affinity mask
change callback for fair scheduling class.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c | 204 +++-
 1 file changed, 202 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79be023..06f6518 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -677,6 +677,16 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 }
 
 #ifdef CONFIG_HPERF_HMP
+static void hmp_calculate_imbalance(void)
+{
+   if (atomic_long_read(_total_weight) == 0) {
+   atomic_set(_imbalance, 0);
+   return;
+   }
+
+   atomic_set(_imbalance, 1);
+}
+
 static bool
 is_task_hmp(struct task_struct *task, const struct cpumask *task_cpus)
 {
@@ -711,6 +721,13 @@ static inline void add_druntime_sum(struct rq *rq, long 
delta)
rq->druntime_sum += delta;
check_druntime_sum(rq, rq->druntime_sum);
 }
+
+static inline void sub_druntime_sum(struct rq *rq, long delta)
+{
+   rq->druntime_sum -= delta;
+   check_druntime_sum(rq, rq->druntime_sum);
+}
+
 /* Updates druntime for a task */
 static inline void
 update_hmp_stat(struct cfs_rq *cfs_rq, struct sched_entity *curr,
@@ -861,7 +878,9 @@ static void update_curr(struct cfs_rq *cfs_rq)
 
account_cfs_rq_runtime(cfs_rq, delta_exec);
 
+#ifdef CONFIG_HPERF_HMP
update_hmp_stat(cfs_rq, curr, delta_exec);
+#endif
 }
 
 static void update_curr_fair(struct rq *rq)
@@ -4200,6 +4219,66 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_HPERF_HMP
+#ifdef CONFIG_HPERF_HMP_DEBUG
+static void check_nr_hmp_tasks(struct rq *rq)
+{
+   if (rq->nr_hmp_tasks > rq->cfs.h_nr_running) {
+   pr_emerg("HMP BUG: rq->nr_hmp_tasks = %u, "
+"rq->cfs.h_nr_running = %u\n", rq->nr_hmp_tasks,
+rq->cfs.h_nr_running);
+   BUG();
+   }
+}
+#else
+static void check_nr_hmp_tasks(struct rq *rq) { }
+#endif
+
+static void nr_hmp_tasks_inc(struct rq *rq)
+{
+   if (!rq->nr_hmp_tasks) {
+   if (cpu_is_fastest(rq->cpu))
+   atomic_inc(&a15_nr_hmp_busy);
+   else
+   atomic_inc(&a7_nr_hmp_busy);
+   }
+
+   rq->nr_hmp_tasks++;
+   check_nr_hmp_tasks(rq);
+}
+
+static void nr_hmp_tasks_dec(struct rq *rq)
+{
+   rq->nr_hmp_tasks--;
+
+   if (!rq->nr_hmp_tasks) {
+   if (cpu_is_fastest(rq->cpu))
+   atomic_dec(&a15_nr_hmp_busy);
+   else
+   atomic_dec(&a7_nr_hmp_busy);
+   }
+   check_nr_hmp_tasks(rq);
+}
+
+static void
+set_cpus_allowed_hmp(struct task_struct *p, const struct cpumask *new_mask)
+{
+   bool is_hmp_before, is_hmp_after;
+
+   cpumask_copy(&p->cpus_allowed, new_mask);
+   p->nr_cpus_allowed = cpumask_weight(new_mask);
+   is_hmp_before = is_task_hmp(p, NULL);
+   is_hmp_after  = is_task_hmp(p, new_mask);
+
+   if (!p->on_cpu && p->se.on_rq && (is_hmp_before != is_hmp_after)) {
+   if (is_hmp_after)
+   nr_hmp_tasks_inc(rq_of(cfs_rq_of(&p->se)));
+   else
+   nr_hmp_tasks_dec(rq_of(cfs_rq_of(&p->se)));
+   }
+}
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -4241,8 +4320,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, 
int flags)
update_cfs_shares(cfs_rq);
}
 
-   if (!se)
+   if (!se) {
add_nr_running(rq, 1);
+#ifdef CONFIG_HPERF_HMP
+   if (is_task_hmp(p, NULL))
+   nr_hmp_tasks_inc(rq);
+
+   if (cpu_is_fastest(rq->cpu)) {
+   atomic_long_add(p->se.load.weight, _total_weight);
+   if (p->se.druntime < 0)
+   add_druntime_sum(rq, p->se.druntime);
+   } else {
+   atomic_long_add(p->se.load.weight, _total_weight);
+   if (p->se.druntime > 0)
+   add_druntime_sum(rq, p->se.druntime);
+   }
+   hmp_calculate_imbalance();
+#endif
+   }
 
hrtick_update(rq);
 }
@@ -4301,8 +4396,30 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
update_c

[PATCH 03/13] hperf_hmp: add sched domains initialization.

2015-11-06 Thread Arseniy Krasnov
Attaching CPU clusters as 'sched_group' to HMP domains. Each HMP domain
has two pointers to A15 and A7 scheduling groups(struct sched_group).

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 include/linux/sched.h |  4 
 kernel/sched/core.c   | 49 +
 2 files changed, 53 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eb084df..aa72125 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1057,6 +1057,10 @@ struct sched_domain {
u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost;
 
+#ifdef CONFIG_HPERF_HMP
+   struct sched_group *a15_group;
+   struct sched_group *a7_group;
+#endif
 #ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 16092e0..e3a632f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,16 @@
 #define CREATE_TRACE_POINTS
 #include 
 
+#ifdef CONFIG_HPERF_HMP
+/* cpumask for A15 cpus */
+static DECLARE_BITMAP(cpu_fastest_bits, CONFIG_NR_CPUS);
+struct cpumask *cpu_fastest_mask = to_cpumask(cpu_fastest_bits);
+
+/* cpumask for A7 cpus */
+static DECLARE_BITMAP(cpu_slowest_bits, CONFIG_NR_CPUS);
+struct cpumask *cpu_slowest_mask = to_cpumask(cpu_slowest_bits);
+#endif
+
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -6971,6 +6981,45 @@ static int build_sched_domains(const struct cpumask 
*cpu_map,
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
+
+#ifdef CONFIG_HPERF_HMP
+   for (i = nr_cpumask_bits - 1; i >= 0; i--) {
+   if (!cpumask_test_cpu(i, cpu_map))
+   continue;
+
+   for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+   struct sched_group *sg;
+   sd->a7_group = NULL;
+   sd->a15_group = NULL;
+
+   /* Process only HMP domains */
+   if (!(sd->flags & SD_HMP_BALANCE))
+   continue;
+
+   /*
+* Process sched groups of this domain.
+* Attach sg to hmp domains.
+*/
+   sg = sd->groups;
+   do {
+   if (!sg->sgc)
+   goto next_sg;
+#ifdef CONFIG_SCHED_DEBUG
+   printk(KERN_EMERG "Attaching CPUs 0x%08lX to 
domain %s\n",
+  sched_group_cpus(sg)->bits[0], sd->name);
+#endif
+   if (cpumask_intersects(sched_group_cpus(sg),
+   cpu_fastest_mask))
+   sd->a15_group = sg;
+   else
+   sd->a7_group = sg;
+next_sg:
+   sg = sg->next;
+   } while (sg != sd->groups);
+   }
+   }
+#endif /* CONFIG_HPERF_HMP */
+
rcu_read_unlock();
 
ret = 0;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 08/13] hperf_hmp: swap tasks function.

2015-11-06 Thread Arseniy Krasnov
'swap_tasks' performs migration between current CPU and CPU from another
cluster. It scans two runqueues looking for tasks using 'druntime' metric. When
both tasks are found it pulls task from another cluster, and push task from the
current CPU.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c  | 100 +++
 kernel/sched/sched.h |   1 +
 2 files changed, 101 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff05364..028d329 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7419,6 +7419,106 @@ static unsigned int try_to_move_task(struct task_struct 
*migrate_task,
 
return migrate_runnable_task(migrate_task, destination_cpu);
 }
+
+/**
+ * swap_tasks(): swaps two tasks from different HMP domains
+ * @sd: Current sched domain
+ * @this_cpu: without NO_HZ same as smp_processor_id().
+ *
+ * Returns weight of migrated tasks.
+ */
+static unsigned int swap_tasks(struct sched_domain *sd, int this_cpu)
+{
+   unsigned int ld_moved = 0;
+   int local_stopper = 0;
+   int foreign_stopper = 0;
+   struct rq *local_rq = cpu_rq(this_cpu);
+   struct rq *foreign_rq = NULL;
+   struct task_struct *local_task = NULL;
+   struct task_struct *foreign_task = NULL;
+   unsigned long local_flags;
+
+   local_irq_save(local_flags);
+   foreign_rq = get_unfair_rq(sd, this_cpu);
+
+   if (!foreign_rq) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+
+   double_lock_balance(foreign_rq, local_rq);
+
+   /* rq's waiting for stopper execution, return */
+   if (foreign_rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   foreign_task = get_migration_candidate(sd, foreign_rq, 0, this_cpu);
+
+   if (!foreign_task)
+   goto unlock;
+
+   /* Get local task for migration */
+   local_task = get_migration_candidate(sd, local_rq, 0, foreign_rq->cpu);
+
+   if (!local_task) {
+   foreign_task->se.migrate_candidate = 0;
+   goto unlock;
+   }
+   /* First try to push local task */
+   ld_moved = try_to_move_task(local_task, foreign_rq->cpu,
+   &local_stopper);
+
+   /* If failed to move, then return, don't try to move foreign task */
+   if (!ld_moved) {
+   local_task->se.migrate_candidate = 0;
+   foreign_task->se.migrate_candidate = 0;
+   goto unlock;
+   }
+
+   /*
+* Migration is possible, but task is running,
+* so mark rq to run stopper.
+*/
+   if (local_stopper) {
+   local_rq->push_cpu = foreign_rq->cpu;
+   local_rq->migrate_task = local_task;
+   local_rq->active_balance = 1;
+   }
+
+   /* Now try to pull task from another cpu */
+   ld_moved = try_to_move_task(foreign_task, this_cpu,
+   &foreign_stopper);
+
+   /* Failed to move foreign_task */
+   if (!ld_moved)
+   foreign_task->se.migrate_candidate = 0;
+
+   /* Migration is possible, mark rq to run stopper */
+   if (foreign_stopper) {
+   foreign_rq->push_cpu = this_cpu;
+   foreign_rq->migrate_task = foreign_task;
+   foreign_rq->active_balance = 1;
+   }
+
+unlock:
+   double_rq_unlock(local_rq, foreign_rq);
+   local_irq_restore(local_flags);
+
+   if (local_stopper)
+   stop_one_cpu_nowait(local_rq->cpu,
+   active_load_balance_cpu_stop, local_rq,
+   &local_rq->active_balance_work);
+
+   if (foreign_stopper)
+   stop_one_cpu_nowait(foreign_rq->cpu,
+   active_load_balance_cpu_stop, foreign_rq,
+   &foreign_rq->active_balance_work);
+
+   return ld_moved;
+}
 #endif /* CONFIG_HPERF_HMP */
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 94828dc..47e9605 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -598,6 +598,7 @@ struct rq {
unsigned long nr_uninterruptible;
 
 #ifdef CONFIG_HPERF_HMP
+   struct task_struct *migrate_task; /* task from this rq for migration */
/* shows the amount of accumulated unfairness by tasks of this rq */
long druntime_sum;
unsigned int nr_hmp_tasks;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.or

[PATCH 07/13] hperf_hmp: migration auxiliary functions.

2015-11-06 Thread Arseniy Krasnov
Adds functions used for migration: scanning every runqueue from another
cluster for migration process, searching task to migrate from runqueue mentioned
above and function to move task from one CPU to another.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 include/linux/sched.h |   6 +
 kernel/sched/fair.c   | 301 ++
 2 files changed, 307 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89c1bf3..dafda4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1259,6 +1259,12 @@ struct sched_entity {
 
 #ifdef CONFIG_HPERF_HMP
longdruntime;
+
+   /* Time of last migration between HMP domains (in jiffies)*/
+   unsigned long   last_migration;
+
+   /* If set, don't touch for migration */
+   int migrate_candidate;
 #endif
u64 exec_start;
u64 sum_exec_runtime;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3ab39b6..ff05364 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7118,6 +7118,307 @@ static int is_hmp_imbalance(struct sched_domain *sd)
}
}
 }
+
+/**
+ * hmp_can_migrate_task(): Checks whether specified task could be migrated.
+ * @p: task to check.
+ * @env: migration parameters.
+ *
+ * Returns 1 if migration possible, else 0.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+   if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+   schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+   return 0;
+   }
+   env->flags &= ~LBF_ALL_PINNED;
+
+   if (task_running(env->src_rq, p)) {
+   schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+   return 0;
+   }
+   return 1;
+}
+
+/**
+ * detach_specified_task(): Detaches specified task.
+ * @p: Task to move.
+ * @env: Migration parameters.
+ *
+ * Returns moved task.
+ */
+static struct task_struct *
+detach_specified_task(struct task_struct *p, struct lb_env *env)
+{
+   lockdep_assert_held(>src_rq->lock);
+
+   /* If the task to move has fallen asleep, don't scan the runqueue; return */
+   if (p->se.migrate_candidate == 0)
+   return 0;
+
+   if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
+   goto exit;
+
+   if (!hmp_can_migrate_task(p, env))
+   goto exit;
+
+   detach_task(p, env);
+   /*
+* Right now, this is only the third place move_task()
+* is called, so we can safely collect move_task()
+* stats here rather than inside move_task().
+*/
+   schedstat_inc(env->sd, lb_gained[env->idle]);
+   return p;
+exit:
+   p->se.migrate_candidate = 0;
+
+   return NULL;
+}
+
+/**
+ * migrate_runnable_task(): Moves task that isn't running to destination CPU.
+ * @migrate_task: Task to migrate.
+ * @destination_cpu: Destination CPU.
+ *
+ * Returns moved weight.
+ *
+ * Runqueue's of @migrate_task and @destination_cpu must be locked.
+ */
+static unsigned migrate_runnable_task(struct task_struct *migrate_task,
+ int destination_cpu)
+{
+   struct sched_domain *sd = NULL;
+   int src_cpu = task_cpu(migrate_task);
+   struct rq *src_rq = task_rq(migrate_task);
+   int dst_cpu = destination_cpu;
+   struct rq *dst_rq = cpu_rq(dst_cpu);
+   unsigned int ld_moved = 0;
+   struct task_struct *p = NULL;
+
+#ifdef CONFIG_HPERF_HMP_DEBUG
+   BUG_ON(src_rq == dst_rq);
+#else
+   if (WARN_ON(src_rq == dst_rq))
+   return 0;
+#endif
+
+   rcu_read_lock();
+   for_each_domain(dst_cpu, sd) {
+   if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
+   break;
+   }
+   if (likely(sd)) {
+   struct lb_env env = {
+   .sd = sd,
+   .dst_cpu= dst_cpu,
+   .dst_rq = dst_rq,
+   .src_cpu= src_cpu,
+   .src_rq = src_rq,
+   .idle   = CPU_NOT_IDLE,
+   };
+
+   schedstat_inc(sd, alb_count);
+   p = detach_specified_task(migrate_task, );
+   if (p) {
+   migrate_task->se.last_migration = jiffies;
+   schedstat_inc(sd, alb_pushed);
+   ld_moved = migrate_task->se.load.weight;
+   } else
+   schedst

[PATCH 10/13] hperf_hmp: idle pull function.

2015-11-06 Thread Arseniy Krasnov
HMP idle pull is triggered when a CPU becomes idle. It tries to pull a task
from the other cluster when that cluster is overloaded. An A7 core can't pull
the only task from an A15 core, but an A15 core can do that from an A7 core.
The task for migration is chosen in the same way as in the other HMP migration
cases - using the 'druntime' metric. The only difference is that the migrating
task doesn't need to run 5ms on its cluster before migration.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c | 66 +
 1 file changed, 66 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4fda1ec..fd16729 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7421,6 +7421,72 @@ static unsigned int try_to_move_task(struct task_struct 
*migrate_task,
 }
 
 /**
+ * hmp_idle_pull(): Pulls task from opposite domain of this_cpu to this_cpu.
+ * @sd: Current sched domain.
+ * @this_cpu: without NO_HZ same as smp_processor_id().
+ *
+ * Returns moved weight.
+ *
+ * The candidate task is chosen by its 'druntime' metric; unlike the other
+ * HMP migration paths, the time of the task's last HMP migration is
+ * ignored (no residency requirement). An A7 (slow) CPU can't pull the
+ * only runnable task from an A15 (fast) CPU that becomes idle.
+ */
+static unsigned int hmp_idle_pull(struct sched_domain *sd, int this_cpu)
+{
+   unsigned int ld_moved = 0;
+   struct task_struct *task_to_pull;
+   unsigned long local_flags;
+   int idle_stopper = 0;
+   struct rq *local_rq;
+   struct rq *rq;
+
+   local_irq_save(local_flags);
+   local_rq = cpu_rq(this_cpu);
+   /* Pick a runqueue from the opposite HMP domain to pull from */
+   rq = get_unfair_rq(sd, this_cpu);
+
+   if (!rq) {
+   local_irq_restore(local_flags);
+   return 0;
+   }
+   double_lock_balance(rq, local_rq);
+
+   /* Give up if an active balance is already in flight on either rq */
+   if (rq->active_balance)
+   goto unlock;
+
+   if (local_rq->active_balance)
+   goto unlock;
+
+   /* Forbids secondary CPUs to pull alone task from primary CPUs */
+   if (!cpu_is_fastest(this_cpu) && rq->cfs.h_nr_running <= 1)
+   goto unlock;
+
+   /* Get task to pull from opposite domain to this_cpu */
+   task_to_pull = get_migration_candidate(sd, rq, 1, this_cpu);
+
+   if (!task_to_pull)
+   goto unlock;
+
+   /* NOTE: archive-mangled call; original was likely &idle_stopper */
+   ld_moved = try_to_move_task(task_to_pull, this_cpu, _stopper);
+
+   /*
+* Direct move failed and the stopper is needed (presumably the
+* candidate is currently running — see try_to_move_task): arm
+* active balance so the stopper thread pushes it to this_cpu.
+*/
+   if (idle_stopper) {
+   rq->push_cpu = this_cpu;
+   rq->active_balance = 1;
+   rq->migrate_task = task_to_pull;
+   }
+
+unlock:
+   double_rq_unlock(local_rq, rq);
+   local_irq_restore(local_flags);
+
+   /* Kick the stopper only after dropping both rq locks */
+   if (idle_stopper)
+   stop_one_cpu_nowait(rq->cpu, active_load_balance_cpu_stop,
+   rq, >active_balance_work);
+
+   return ld_moved;
+}
+
+
+/**
  * swap_tasks(): swaps two tasks from different HMP domains
  * @sd: Current sched domain
  * @this_cpu: without NO_HZ same as smp_processor_id().
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 06/13] hperf_hmp: is_hmp_imbalance introduced.

2015-11-06 Thread Arseniy Krasnov
The 'is_hmp_imbalance' function calculates the imbalance between clusters; four
cases are possible: balancing from/to one of the clusters, a task swap (when the
clusters are balanced), or skipping the rebalance. The function computes the load
difference between the two clusters (cluster load / cluster power) and the
threshold above which balancing is needed.

Signed-off-by: Tarek Dakhran <t.dakh...@samsung.com>
Signed-off-by: Sergey Dyasly <s.dya...@samsung.com>
Signed-off-by: Dmitriy Safonov <d.safo...@partner.samsung.com>
Signed-off-by: Arseniy Krasnov <a.kras...@samsung.com>
Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
---
 kernel/sched/fair.c | 103 
 1 file changed, 103 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e94fab4..3ab39b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -104,9 +104,21 @@ unsigned int __read_mostly sysctl_sched_shares_window = 
1000UL;
 extern void hmp_set_cpu_masks(struct cpumask *, struct cpumask *);
 static atomic_t a15_nr_hmp_busy = ATOMIC_INIT(0);
 static atomic_t a7_nr_hmp_busy = ATOMIC_INIT(0);
+
+/* Total weight of all running tasks on A15 and A7 CPU domains */
+static atomic_long_t a15_total_weight = ATOMIC_LONG_INIT(0);
+static atomic_long_t a7_total_weight = ATOMIC_LONG_INIT(0);
+
 static atomic_t hmp_imbalance = ATOMIC_INIT(0);
 
 static unsigned int freq_scale_cpu_power[CONFIG_NR_CPUS];
+
+/* Migration direction returned by is_hmp_imbalance() */
+enum hmp_balance_actions {
+   SWAP_TASKS,     /* clusters balanced: swap a pair of tasks */
+   A15_TO_A7,      /* move load from A15 (fast) to A7 (slow) cluster */
+   A7_TO_A15,      /* move load from A7 (slow) to A15 (fast) cluster */
+   SKIP_REBALANCE, /* imbalance below threshold: do nothing */
+};
 #endif /* CONFIG_HPERF_HMP */
 
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -7016,6 +7028,97 @@ static int should_we_balance(struct lb_env *env)
 */
return balance_cpu == env->dst_cpu;
 }
+#ifdef CONFIG_HPERF_HMP
+/**
+ * is_hmp_imbalance(): Calculates imbalance between HMP domains.
+ * @sd: Current sched domain.
+ *
+ * Returns migration direction (see SWAP_TASKS, A15_TO_A7, A7_TO_A15,
+ * SKIP_REBALANCE).
+ *
+ * Imbalance depends on the load of tasks on A15 cores and A7 cores,
+ * current CPU frequencies, and the A7 slowdown coefficient (about 2.4).
+ */
+static int is_hmp_imbalance(struct sched_domain *sd)
+{
+   int imbalance, cpu;
+   int a15_group_power = 0, a7_group_power = 0,
+   hmp_imbalance_min_threshold;
+   int a15_group_load, a7_group_load, a15_a7_group_power;
+   unsigned int a7_balanced_num;
+   int reminder, divisor;
+   unsigned int a15_balanced_num;
+   long long int hmp_imbalance_threshold;
+
+   /* Nothing to balance unless both clusters are present in @sd */
+   if (!sd->a15_group) {
+   return SKIP_REBALANCE;
+   }
+
+   if (!sd->a7_group) {
+   return SKIP_REBALANCE;
+   }
+   /* Sum frequency-scaled CPU power per cluster */
+   for_each_online_cpu(cpu) {
+   if (cpu_is_fastest(cpu))
+   a15_group_power += freq_scale_cpu_power[cpu];
+   else
+   a7_group_power += freq_scale_cpu_power[cpu];
+   }
+
+   /* Avoid division by zero when a whole cluster is offline/powerless */
+   if (a15_group_power == 0 || a7_group_power == 0) {
+   return SKIP_REBALANCE;
+   }
+
+   a15_balanced_num = 0;
+   a7_balanced_num = 0;
+
+   /* Count CPUs with at most one runnable CFS task per cluster */
+   for_each_online_cpu(cpu) {
+   if (cpu_rq(cpu)->cfs.h_nr_running <= 1) {
+   if (cpu_is_fastest(cpu))
+   a15_balanced_num++;
+   else
+   a7_balanced_num++;
+   }
+   }
+
+   a7_group_load = atomic_long_read(_total_weight);
+
+   /* A7 cluster idle and every A15 CPU balanced: nothing to move */
+   if (atomic_long_read(_total_weight) == 0 &&
+   (a15_balanced_num == sd->a15_group->group_weight)) {
+   return SKIP_REBALANCE;
+   }
+
+   a15_group_load = atomic_long_read(_total_weight);
+   a15_a7_group_power = a15_group_power + a7_group_power;
+
+   /* Per-cluster load normalized by cluster power (scaled by 1024) */
+   imbalance = (a15_group_load * 1024) / (a15_group_power) -
+   (a7_group_load * 1024) / (a7_group_power);
+   hmp_imbalance_threshold = ((long long int)NICE_0_LOAD *
+  1024 * a15_a7_group_power);
+   divisor = 2 * a15_group_power * a7_group_power;
+   hmp_imbalance_threshold = div_s64_rem(hmp_imbalance_threshold,
+   divisor, );
+   /* Dead zone: 1/8 of the threshold on either side of zero */
+   hmp_imbalance_min_threshold = hmp_imbalance_threshold >> 3;
+
+   if (imbalance < hmp_imbalance_min_threshold &&
+   imbalance > -hmp_imbalance_min_threshold) {
+   atomic_set(_imbalance, 0);
+   return SKIP_REBALANCE;
+   }
+
+   if (imbalance > hmp_imbalance_threshold) {
+   return A15_TO_A7;
+   } else {
+   if (imbalance < -hmp_imbalance_threshold) {
+   if (a7_balanced_num == sd->a7_group->group_weight)
+   return SWAP_TASKS;
+   else
+   return A7_TO_A15;
+   } else {
+   return SWAP_TA

  1   2   >