Re: [RFC PATCH v2 04/12] x86/mm/tlb: Flush remote and local TLBs concurrently

2019-05-31 Thread Nadav Amit via Virtualization
> On May 31, 2019, at 4:48 AM, Juergen Gross  wrote:
> 
> On 31/05/2019 08:36, Nadav Amit wrote:
>> 
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -211,6 +211,12 @@ struct pv_mmu_ops {
>>  void (*flush_tlb_user)(void);
>>  void (*flush_tlb_kernel)(void);
>>  void (*flush_tlb_one_user)(unsigned long addr);
>> +/*
>> + * flush_tlb_multi() is the preferred interface. When it is used,
>> + * flush_tlb_others() should return false.
> 
> Didn't you want to remove/change this comment?

Yes! Sorry for that. Fixed now.


[PATCH v3 5/5] vsock/virtio: change the maximum packet size allowed

2019-05-31 Thread Stefano Garzarella
Since we are now able to split packets, we can avoid limiting
their size to VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE.
Instead, we can use VIRTIO_VSOCK_MAX_PKT_BUF_SIZE as the max
packet size.

Signed-off-by: Stefano Garzarella 
---
 net/vmw_vsock/virtio_transport_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d192cb91cf25..b6ec6f81018b 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -182,8 +182,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
vvs = vsk->trans;
 
/* we can send less than pkt_len bytes */
-   if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
-   pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+   if (pkt_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+   pkt_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
 
/* virtio_transport_get_credit might return less than pkt_len credit */
pkt_len = virtio_transport_get_credit(vvs, pkt_len);
-- 
2.20.1



[PATCH v3 4/5] vhost/vsock: split packets to send using multiple buffers

2019-05-31 Thread Stefano Garzarella
If the packets to send to the guest are bigger than the available
buffer, we can split them across multiple buffers, fixing the
length in the packet header.
This is safe since virtio-vsock supports only stream sockets, so
the receiver does not rely on packet boundaries.
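
Distilled from the hunk below (same field names as the patch; error
handling omitted; requeue() is a hypothetical stand-in for putting the
packet back on the send list):

    payload_len = pkt->len - pkt->off;                /* bytes left to send */
    if (payload_len > iov_len - sizeof(pkt->hdr))
            payload_len = iov_len - sizeof(pkt->hdr); /* cap to this buffer */
    pkt->hdr.len = cpu_to_le32(payload_len);          /* header matches chunk */
    /* copy the header, then payload_len bytes from pkt->buf + pkt->off */
    pkt->off += payload_len;
    if (pkt->off < pkt->len)
            requeue(pkt);  /* remainder goes out with the next buffer */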

Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vsock.c   | 51 +++--
 net/vmw_vsock/virtio_transport_common.c | 15 ++--
 2 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 7964e2daee09..fb731d09f5f1 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -94,7 +94,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
struct iov_iter iov_iter;
unsigned out, in;
size_t nbytes;
-   size_t len;
+   size_t iov_len, payload_len;
int head;
 
spin_lock_bh(&vsock->send_pkt_list_lock);
@@ -139,8 +139,24 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break;
}
 
-   len = iov_length(&vq->iov[out], in);
-   iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
+   iov_len = iov_length(&vq->iov[out], in);
+   if (iov_len < sizeof(pkt->hdr)) {
+   virtio_transport_free_pkt(pkt);
+   vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
+   break;
+   }
+
+   iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len);
+   payload_len = pkt->len - pkt->off;
+
+   /* If the packet is greater than the space available in the
+* buffer, we split it using multiple buffers.
+*/
+   if (payload_len > iov_len - sizeof(pkt->hdr))
+   payload_len = iov_len - sizeof(pkt->hdr);
+
+   /* Set the correct length in the header */
+   pkt->hdr.len = cpu_to_le32(payload_len);
 
nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
if (nbytes != sizeof(pkt->hdr)) {
@@ -149,16 +165,34 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
break;
}
 
-   nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
-   if (nbytes != pkt->len) {
+   nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
+ &iov_iter);
+   if (nbytes != payload_len) {
virtio_transport_free_pkt(pkt);
vq_err(vq, "Faulted on copying pkt buf\n");
break;
}
 
-   vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+   vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
added = true;
 
+   /* Deliver to monitoring devices all correctly transmitted
+* packets.
+*/
+   virtio_transport_deliver_tap_pkt(pkt);
+
+   pkt->off += payload_len;
+
+   /* If we didn't send all the payload we can requeue the packet
+* to send it with the next available buffer.
+*/
+   if (pkt->off < pkt->len) {
+   spin_lock_bh(&vsock->send_pkt_list_lock);
+   list_add(&pkt->list, &vsock->send_pkt_list);
+   spin_unlock_bh(&vsock->send_pkt_list_lock);
+   continue;
+   }
+
if (pkt->reply) {
int val;
 
@@ -169,11 +203,6 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
restart_tx = true;
}
 
-   /* Deliver to monitoring devices all correctly transmitted
-* packets.
-*/
-   virtio_transport_deliver_tap_pkt(pkt);
-
virtio_transport_free_pkt(pkt);
}
if (added)
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index f7b0c4911308..d192cb91cf25 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -98,8 +98,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
struct virtio_vsock_pkt *pkt = opaque;
struct af_vsockmon_hdr *hdr;
struct sk_buff *skb;
+   size_t payload_len;
+   void *payload_buf;
 
-   skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
+   /* A packet could be split to fit the RX buffer, so we can retrieve
+* the payload length from the header and the buffer pointer taking
+* care of the offset in the original packet.
+*/
+   payload_len = le32_to_cpu(pkt->hdr.len);
+   payload_buf = pkt->buf + pkt->off;
+
+   skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
GFP_ATOMIC);
if (!skb)
return NULL;

[PATCH v3 3/5] vsock/virtio: reduce credit update messages

2019-05-31 Thread Stefano Garzarella
In order to reduce the number of credit update messages,
we send them only when the space available seen by the
transmitter is less than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE.
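
In other words, the receiver remembers the last fwd_cnt it advertised
(last_fwd_cnt below) and computes how much free space the transmitter
still believes it has. Distilled from the hunks below (same names as
the patch):

    free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
    if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
            virtio_transport_send_credit_update(vsk,
                                                VIRTIO_VSOCK_TYPE_STREAM,
                                                NULL);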

Signed-off-by: Stefano Garzarella 
---
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport_common.c | 16 +---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 53703fe03681..23396b996214 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -41,6 +41,7 @@ struct virtio_vsock_sock {
/* Protected by rx_lock */
u32 buf_alloc;
u32 fwd_cnt;
+   u32 last_fwd_cnt;
u32 rx_bytes;
struct list_head rx_queue;
 };
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 694d9805f989..f7b0c4911308 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -212,6 +212,7 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
 {
spin_lock_bh(&vvs->rx_lock);
+   vvs->last_fwd_cnt = vvs->fwd_cnt;
pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
spin_unlock_bh(&vvs->rx_lock);
@@ -262,6 +263,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
struct virtio_vsock_pkt *pkt;
size_t bytes, total = 0;
+   u32 free_space;
int err = -EFAULT;
 
spin_lock_bh(&vvs->rx_lock);
@@ -292,11 +294,19 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
virtio_transport_free_pkt(pkt);
}
}
+
+   free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+
spin_unlock_bh(&vvs->rx_lock);
 
-   /* Send a credit pkt to peer */
-   virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
-   NULL);
+   /* We send a credit update only when the space available seen
+* by the transmitter is less than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE
+*/
+   if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
+   virtio_transport_send_credit_update(vsk,
+   VIRTIO_VSOCK_TYPE_STREAM,
+   NULL);
+   }
 
return total;
 
-- 
2.20.1



[PATCH v3 1/5] vsock/virtio: limit the memory used per-socket

2019-05-31 Thread Stefano Garzarella
Since virtio-vsock was introduced, the buffers filled by the host
and pushed to the guest using the vring are directly queued in
a per-socket list. These buffers are preallocated by the guest
with a fixed size (4 KB).

The maximum amount of memory used by each socket should be
controlled by the credit mechanism.
The default credit available per socket is 256 KB, but if we use
only 1 byte per packet, the guest can queue up to 262144 4 KB
buffers, using up to 1 GB of memory per socket. In addition, the
guest will continue to fill the vring with new 4 KB free buffers
to avoid starvation of other sockets.
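
As a back-of-the-envelope check of those numbers (an illustrative
userspace snippet, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned long credit   = 256UL * 1024; /* default per-socket credit */
            unsigned long buf_size = 4UL * 1024;   /* preallocated RX buffer size */
            unsigned long pkts     = credit / 1;   /* 1-byte payload per packet */

            /* memory pinned per socket in the worst case */
            printf("%lu buffers, %lu MiB\n", pkts, (pkts * buf_size) >> 20);
            /* prints: 262144 buffers, 1024 MiB */
            return 0;
    }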

This patch mitigates the issue by copying the payload of small
packets (< 128 bytes) into the buffer of the last packet queued,
in order to avoid wasting memory.

Signed-off-by: Stefano Garzarella 
---
 drivers/vhost/vsock.c   |  2 +
 include/linux/virtio_vsock.h|  1 +
 net/vmw_vsock/virtio_transport.c|  1 +
 net/vmw_vsock/virtio_transport_common.c | 60 +
 4 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index bb5fc0e9fbc2..7964e2daee09 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -320,6 +320,8 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
return NULL;
}
 
+   pkt->buf_len = pkt->len;
+
nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
if (nbytes != pkt->len) {
vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index e223e2632edd..7d973903f52e 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -52,6 +52,7 @@ struct virtio_vsock_pkt {
/* socket refcnt not held, only use for cancellation */
struct vsock_sock *vsk;
void *buf;
+   u32 buf_len;
u32 len;
u32 off;
bool reply;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 96ab344f17bb..beabb8025907 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -280,6 +280,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
break;
}
 
+   pkt->buf_len = buf_len;
pkt->len = buf_len;
 
sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index f3f3d06cb6d8..4fd4987511a9 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -27,6 +27,9 @@
 /* How long to wait for graceful shutdown of a connection */
 #define VSOCK_CLOSE_TIMEOUT (8 * HZ)
 
+/* Threshold for detecting small packets to copy */
+#define GOOD_COPY_LEN  128
+
 static const struct virtio_transport *virtio_transport_get_ops(void)
 {
const struct vsock_transport *t = vsock_core_get_transport();
@@ -65,6 +68,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
pkt->buf = kmalloc(len, GFP_KERNEL);
if (!pkt->buf)
goto out_pkt;
+
+   pkt->buf_len = len;
+
err = memcpy_from_msg(pkt->buf, info->msg, len);
if (err)
goto out;
@@ -842,24 +848,60 @@ virtio_transport_recv_connecting(struct sock *sk,
return err;
 }
 
+static void
+virtio_transport_recv_enqueue(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   bool free_pkt = false;
+
+   pkt->len = le32_to_cpu(pkt->hdr.len);
+   pkt->off = 0;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   virtio_transport_inc_rx_pkt(vvs, pkt);
+
+   /* Try to copy small packets into the buffer of last packet queued,
+* to avoid wasting memory queueing the entire buffer with a small
+* payload.
+*/
+   if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) {
+   struct virtio_vsock_pkt *last_pkt;
+
+   last_pkt = list_last_entry(&vvs->rx_queue,
+  struct virtio_vsock_pkt, list);
+
+   /* If there is space in the last packet queued, we copy the
+* new packet in its buffer.
+*/
+   if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
+   memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
+  pkt->len);
+   last_pkt->len += pkt->len;
+   free_pkt = true;
+   goto out;
+   }
+   }
+
+   list_add_tail(&pkt->list, &vvs->rx_queue);
+
+out:
+   spin_unlock_bh(&vvs->rx_lock);
+   if (free_pkt)
+   virtio_transport_free_pkt(pkt);
+}
+
 static int
 virtio_transport_recv_connected(struct sock *sk,

[PATCH v3 2/5] vsock/virtio: fix locking for fwd_cnt and buf_alloc

2019-05-31 Thread Stefano Garzarella
fwd_cnt is written under rx_lock, so we should read it using
the same spinlock even when we are in the TX path.

Also move buf_alloc under rx_lock and add the missing locking
when we modify it.

Signed-off-by: Stefano Garzarella 
---
 include/linux/virtio_vsock.h| 2 +-
 net/vmw_vsock/virtio_transport_common.c | 6 --
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 7d973903f52e..53703fe03681 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -35,11 +35,11 @@ struct virtio_vsock_sock {
 
/* Protected by tx_lock */
u32 tx_cnt;
-   u32 buf_alloc;
u32 peer_fwd_cnt;
u32 peer_buf_alloc;
 
/* Protected by rx_lock */
+   u32 buf_alloc;
u32 fwd_cnt;
u32 rx_bytes;
struct list_head rx_queue;
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 4fd4987511a9..694d9805f989 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -211,10 +211,10 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
 
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
 {
-   spin_lock_bh(&vvs->tx_lock);
+   spin_lock_bh(&vvs->rx_lock);
pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
-   spin_unlock_bh(&vvs->tx_lock);
+   spin_unlock_bh(&vvs->rx_lock);
 }
 EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
 
@@ -434,7 +434,9 @@ void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
if (val > vvs->buf_size_max)
vvs->buf_size_max = val;
vvs->buf_size = val;
+   spin_lock_bh(&vvs->rx_lock);
vvs->buf_alloc = val;
+   spin_unlock_bh(&vvs->rx_lock);
 }
 EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
 
-- 
2.20.1



[PATCH v3 0/5] vsock/virtio: optimizations to increase the throughput

2019-05-31 Thread Stefano Garzarella
This series tries to increase the throughput of virtio-vsock with slight
changes.
While I was testing v2 of this series I discovered a huge use of memory,
so I added patch 1 to mitigate this issue. I put it in this series in order
to better track the performance trends.

v3:
- Patch 1: added a threshold to copy only small packets [Jason]

- Patch 1: replaced the allocation of a new buffer with a copy of small packets
  into the buffer of the last packet queued. [Jason, Stefan]

- Removed 3 patches from the series:
  - "[PATCH v2 2/8] vsock/virtio: free packets during the socket release"
Sent as a separate patch. It is already upstream.

  - "[PATCH v2 7/8] vsock/virtio: increase RX buffer size to 64 KiB" and
"[PATCH v2 8/8] vsock/virtio: make the RX buffer size tunable"
As Jason suggested, we can do the 64 KiB buffer in the future with the
conversion to using skb.

- Patches 2, 3, 4, 5 are the same as in v2 (only the index changed, since 2/8
  was sent as a separate patch)

v2: https://patchwork.kernel.org/cover/10938743

v1: https://patchwork.kernel.org/cover/10885431

Below are the benchmarks step by step. I used iperf3 [1] modified with VSOCK
support. As Michael suggested in v1, I booted host and guest with 'nosmap'.
I added two new rows with 32 and 128 bytes, to test small packets.

A brief description of patches:
- Patch 1: limits the memory usage with an extra copy for small packets
- Patches 2+3: fix locking and reduce the number of credit update messages
  sent to the transmitter
- Patches 4+5: allow the host to split packets over multiple buffers and use
  VIRTIO_VSOCK_MAX_PKT_BUF_SIZE as the max packet size allowed

host -> guest [Gbps]
pkt_size    before opt    p 1       p 2+3     p 4+5

32          0.035         0.032     0.049     0.051
64          0.069         0.061     0.123     0.126
128         0.138         0.116     0.256     0.252
256         0.247         0.254     0.434     0.444
512         0.498         0.482     0.940     0.931
1K          0.951         0.975     1.878     1.887
2K          1.882         1.872     3.735     3.720
4K          3.603         3.583     6.843     6.842
8K          5.881         5.761     11.841    12.057
16K         8.414         8.352     17.163    17.456
32K         14.020        13.926    19.156    20.883
64K         21.147        20.921    20.601    21.799
128K        21.098        21.027    21.145    23.596
256K        21.617        21.354    21.123    23.464
512K        20.967        21.056    21.236    23.775

guest -> host [Gbps]
pkt_size    before opt    p 1       p 2+3     p 4+5

32          0.044         0.043     0.059     0.058
64          0.090         0.086     0.110     0.109
128         0.176         0.166     0.217     0.219
256         0.342         0.335     0.431     0.435
512         0.674         0.664     0.843     0.883
1K          1.336         1.315     1.672     1.648
2K          2.613         2.615     3.163     3.173
4K          5.162         5.128     5.638     5.873
8K          8.479         8.913     10.787    10.236
16K         11.617        12.027    12.871    15.840
32K         11.234        11.074    11.643    24.385
64K         10.252        10.655    11.223    37.201
128K        10.101        10.219    10.346    39.340
256K        9.403         9.820     10.040    34.888
512K        8.822         9.911     10.410    34.562

As Stefan suggested in v1, I also measured the efficiency in this way:
efficiency = Mbps / (%CPU_Host + %CPU_Guest)

The '%CPU_Guest' is taken inside the VM. I know that it is not the best way,
but it's provided for free by iperf3 and could be an indication.
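
For example (hypothetical numbers, not taken from the runs below): a
21.1 Gbps result measured with 80% host CPU and 40% guest CPU gives
21100 / (80 + 40) ~= 176 Mbps per percentage point of CPU.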

host -> guest efficiency [Mbps / (%CPU_Host + %CPU_Guest)]
pkt_size    before opt    p 1       p 2+3     p 4+5

32          0.42          0.54      0.92      2.10
64          0.62          0.83      3.55      3.90
128         1.24          1.59      4.30      4.22
256         2.25          2.32      5.81      5.97
512         4.51          4.45      12.04     11.94
1K          8.70          8.93      23.10     23.33
2K          17.24         17.30     44.52     44.13
4K          33.01         32.97     82.45     81.65
8K          57.57         56.57     147.46    157.81
16K         82.36         81.97     214.60    253.35
32K         153.00        151.78    278.84    367.66
64K         236.54        246.02    301.18    454.15
128K        327.02        328.36    331.43    505.09
256K        336.23        331.09    330.05    515.09
512K        323.10        322.61    327.21    513.90

guest -> host efficiency [Mbps / (%CPU_Host + %CPU_Guest)]
pkt_size    before opt    p 1       p 2+3     p 4+5

32          0.42          0.42      0.60      0.606
64          0.86          0.84      1.20      1.187
128         1.69          1.64      2.38      2.393
256         3.31          3.31      4.67      4.723
512         6.50          6.56      9.19      9.454
1K          12.90         12.89     18.04     17.778
2K          25.45         25.92     34.31     34.028
4K          51.13         50.59     62.64     64.609
8K          95.81         100.98    112.95    113.105
16K         139.85        142.83    142.54    181.651
32K         147.43        146.66    146.86    286.545
64K         141.64        145.75    140.82

Re: [RFC PATCH v2 04/12] x86/mm/tlb: Flush remote and local TLBs concurrently

2019-05-31 Thread Juergen Gross
On 31/05/2019 08:36, Nadav Amit wrote:
> To improve TLB shootdown performance, flush the remote and local TLBs
> concurrently. Introduce flush_tlb_multi() that does so. The current
> flush_tlb_others() interface is kept, since paravirtual interfaces need
> to be adapted before it can be removed. This is left for future work.
> In such PV environments, TLB flushes are not yet performed concurrently.
> 
> Add a static key to tell whether this new interface is supported.
> 
> Cc: "K. Y. Srinivasan" 
> Cc: Haiyang Zhang 
> Cc: Stephen Hemminger 
> Cc: Sasha Levin 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: Borislav Petkov 
> Cc: x...@kernel.org
> Cc: Juergen Gross 
> Cc: Paolo Bonzini 
> Cc: Dave Hansen 
> Cc: Andy Lutomirski 
> Cc: Peter Zijlstra 
> Cc: Boris Ostrovsky 
> Cc: linux-hyp...@vger.kernel.org
> Cc: linux-ker...@vger.kernel.org
> Cc: virtualization@lists.linux-foundation.org
> Cc: k...@vger.kernel.org
> Cc: xen-de...@lists.xenproject.org
> Signed-off-by: Nadav Amit 
> ---
>  arch/x86/hyperv/mmu.c |  2 +
>  arch/x86/include/asm/paravirt.h   |  8 +++
>  arch/x86/include/asm/paravirt_types.h |  6 ++
>  arch/x86/include/asm/tlbflush.h   |  6 ++
>  arch/x86/kernel/kvm.c |  1 +
>  arch/x86/kernel/paravirt.c|  3 +
>  arch/x86/mm/tlb.c | 80 +++
>  arch/x86/xen/mmu_pv.c |  2 +
>  8 files changed, 96 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> index e65d7fe6489f..ca28b400c87c 100644
> --- a/arch/x86/hyperv/mmu.c
> +++ b/arch/x86/hyperv/mmu.c
> @@ -233,4 +233,6 @@ void hyperv_setup_mmu_ops(void)
>   pr_info("Using hypercall for remote TLB flush\n");
>   pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
>   pv_ops.mmu.tlb_remove_table = tlb_remove_table;
> +
> + static_key_disable(&flush_tlb_multi_enabled.key);
>  }
> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
> index c25c38a05c1c..192be7254457 100644
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -47,6 +47,8 @@ static inline void slow_down_io(void)
>  #endif
>  }
>  
> +DECLARE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
> +
>  static inline void __flush_tlb(void)
>  {
>   PVOP_VCALL0(mmu.flush_tlb_user);
> @@ -62,6 +64,12 @@ static inline void __flush_tlb_one_user(unsigned long addr)
>   PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
>  }
>  
> +static inline void flush_tlb_multi(const struct cpumask *cpumask,
> +const struct flush_tlb_info *info)
> +{
> + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
> +}
> +
>  static inline void flush_tlb_others(const struct cpumask *cpumask,
>   const struct flush_tlb_info *info)
>  {
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 946f8f1f1efc..3a156e63c57d 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -211,6 +211,12 @@ struct pv_mmu_ops {
>   void (*flush_tlb_user)(void);
>   void (*flush_tlb_kernel)(void);
>   void (*flush_tlb_one_user)(unsigned long addr);
> + /*
> +  * flush_tlb_multi() is the preferred interface. When it is used,
> +  * flush_tlb_others() should return false.

Didn't you want to remove/change this comment?


Juergen


Re: [virtio-dev] Re: [PATCH v8 2/7] dt-bindings: virtio: Add virtio-pci-iommu node

2019-05-31 Thread Jean-Philippe Brucker
On 30/05/2019 18:45, Michael S. Tsirkin wrote:
> On Thu, May 30, 2019 at 06:09:24PM +0100, Jean-Philippe Brucker wrote:
>> Some systems implement virtio-iommu as a PCI endpoint. The operating
>> system needs to discover the relationship between IOMMU and masters long
>> before the PCI endpoint gets probed. Add a PCI child node to describe the
>> virtio-iommu device.
>>
>> The virtio-pci-iommu is conceptually split between a PCI programming
>> interface and a translation component on the parent bus. The latter
>> doesn't have a node in the device tree. The virtio-pci-iommu node
>> describes both, by linking the PCI endpoint to "iommus" property of DMA
>> master nodes and to "iommu-map" properties of bus nodes.
>>
>> Reviewed-by: Rob Herring 
>> Reviewed-by: Eric Auger 
>> Signed-off-by: Jean-Philippe Brucker 
> 
> So this is just an example right?
> We are not defining any new properties or anything like that.

Yes it's just an example. The properties already exist but it's good to
describe how to put them together for this particular case, because
there isn't a precedent describing the topology for an IOMMU that
appears on the PCI bus.

> I think down the road for non dt platforms we want to put this
> info in the config space of the device. I do not think ACPI
> is the best option for this since not all systems have it.
> But that can wait.

There is the probe order problem - PCI needs this info before starting
to probe devices on the bus. Maybe we could store the info in a separate
memory region, that is referenced on the command-line and that the guest
can read early.

Thanks,
Jean


Re: [PATCH 3/4] vsock/virtio: fix flush of works during the .remove()

2019-05-31 Thread Jason Wang


On 2019/5/31 4:18 PM, Stefano Garzarella wrote:
> On Thu, May 30, 2019 at 07:59:14PM +0800, Jason Wang wrote:
> > On 2019/5/30 6:10 PM, Stefano Garzarella wrote:
> > > On Thu, May 30, 2019 at 05:46:18PM +0800, Jason Wang wrote:
> > > > On 2019/5/29 6:58 PM, Stefano Garzarella wrote:
> > > > > On Wed, May 29, 2019 at 11:22:40AM +0800, Jason Wang wrote:
> > > > > > On 2019/5/28 6:56 PM, Stefano Garzarella wrote:
> > > > > > > @@ -690,6 +693,9 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
> > > > > > > vsock->event_run = false;
> > > > > > > mutex_unlock(&vsock->event_lock);
> > > > > > > +   /* Flush all pending works */
> > > > > > > +   virtio_vsock_flush_works(vsock);
> > > > > > > +
> > > > > > > /* Flush all device writes and interrupts, device will not use any
> > > > > > >  * more buffers.
> > > > > > >  */
> > > > > > > @@ -726,6 +732,11 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
> > > > > > > /* Delete virtqueues and flush outstanding callbacks if any */
> > > > > > > vdev->config->del_vqs(vdev);
> > > > > > > +   /* Other works can be queued before 'config->del_vqs()', so we flush
> > > > > > > +* all works before to free the vsock object to avoid use after free.
> > > > > > > +*/
> > > > > > > +   virtio_vsock_flush_works(vsock);
> > > > > > 
> > > > > > Some questions after a quick glance:
> > > > > > 
> > > > > > 1) It looks to me that the work could be queued from the path of
> > > > > > vsock_transport_cancel_pkt(). Is that synchronized here?
> > > > > > 
> > > > > Both virtio_transport_send_pkt() and vsock_transport_cancel_pkt() can
> > > > > queue work from the upper layer (socket).
> > > > > 
> > > > > Setting the_virtio_vsock to NULL should synchronize, but after a careful look
> > > > > a rare issue could happen:
> > > > > we are setting the_virtio_vsock to NULL at the start of .remove() and we
> > > > > are freeing the object pointed by it at the end of .remove(), so
> > > > > virtio_transport_send_pkt() or vsock_transport_cancel_pkt() may still be
> > > > > running, accessing the object that we have freed.
> > > > 
> > > > Yes, that's my point.
> > > > 
> > > > > Should I use something like RCU to prevent this issue?
> > > > > 
> > > > >    virtio_transport_send_pkt() and vsock_transport_cancel_pkt()
> > > > >    {
> > > > >    rcu_read_lock();
> > > > >    vsock = rcu_dereference(the_virtio_vsock_mutex);
> > > > 
> > > > RCU is probably a way to go. (Like what vhost_transport_send_pkt() did).
> > > > 
> > > Okay, I'm going this way.
> > > 
> > > > >    ...
> > > > >    rcu_read_unlock();
> > > > >    }
> > > > > 
> > > > >    virtio_vsock_remove()
> > > > >    {
> > > > >    rcu_assign_pointer(the_virtio_vsock_mutex, NULL);
> > > > >    synchronize_rcu();
> > > > > 
> > > > >    ...
> > > > > 
> > > > >    free(vsock);
> > > > >    }
> > > > > 
> > > > > Could there be a better approach?
> > > > > 
> > > > > > 2) If we decide to flush after del_vqs(), is tx_run/rx_run/event_run still
> > > > > > needed? It looks to me we've already done that, except that we need to
> > > > > > flush rx_work in the end since send_pkt_work can requeue rx_work.
> > > > > 
> > > > > The main reason of tx_run/rx_run/event_run is to prevent a worker
> > > > > function from running while we are calling config->reset().
> > > > > 
> > > > > E.g. if an interrupt comes between virtio_vsock_flush_works() and
> > > > > config->reset(), it can queue new works that can access the device while
> > > > > we are in config->reset().
> > > > > 
> > > > > IMHO they are still needed.
> > > > > 
> > > > > What do you think?
> > > > 
> > > > I mean could we simply do flush after reset once and without the
> > > > tx_run/rx_run tricks?
> > > > 
> > > > reset();
> > > > 
> > > > virtio_vsock_flush_work();
> > > > 
> > > > virtio_vsock_free_buf();
> > > 
> > > My only doubt is:
> > > is it safe to call config->reset() while a worker function could access
> > > the device?
> > > 
> > > I had this doubt reading Michael's advice[1] and looking at
> > > virtnet_remove() where there are these lines before the config->reset():
> > > 
> > > /* Make sure no work handler is accessing the device. */
> > > flush_work(&vi->config_work);
> > > 
> > > Thanks,
> > > Stefano
> > > 
> > > [1] https://lore.kernel.org/netdev/20190521055650-mutt-send-email-...@kernel.org
> > 
> > Good point. Then I agree with you. But if we can use the RCU to detect the
> > detach of device from socket for these, it would be even better.
> > 
> What about checking 'the_virtio_vsock' in the worker functions in a RCU
> critical section?
> In this way, I can remove the rx_run/tx_run/event_run.
> 
> Do you think it's cleaner?

Yes, I think so.

Thanks

> Thank you very much,
> Stefano
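
A minimal sketch of the RCU scheme the thread converges on (illustrative
only; names follow the discussion above, not the final merged patch):

    /* Sketch, not the merged fix: 'the_virtio_vsock' is read under RCU by
     * the packet paths and cleared in .remove() before freeing the object.
     */
    static struct virtio_vsock *the_virtio_vsock;   /* RCU-protected */

    static int virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
    {
            struct virtio_vsock *vsock;
            int ret = -ENODEV;

            rcu_read_lock();
            vsock = rcu_dereference(the_virtio_vsock);
            if (vsock) {
                    /* queue pkt and kick the send worker */
                    ret = pkt->len;
            }
            rcu_read_unlock();
            return ret;
    }

    static void virtio_vsock_remove(struct virtio_device *vdev)
    {
            struct virtio_vsock *vsock = vdev->priv;

            rcu_assign_pointer(the_virtio_vsock, NULL);
            synchronize_rcu();      /* no reader can still hold the old pointer */

            /* ... reset the device, flush works, free queued buffers ... */
            kfree(vsock);
    }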


Re: [PATCH 3/4] vsock/virtio: fix flush of works during the .remove()

2019-05-31 Thread Stefano Garzarella
On Thu, May 30, 2019 at 07:59:14PM +0800, Jason Wang wrote:
> 
> On 2019/5/30 6:10 PM, Stefano Garzarella wrote:
> > On Thu, May 30, 2019 at 05:46:18PM +0800, Jason Wang wrote:
> > > On 2019/5/29 6:58 PM, Stefano Garzarella wrote:
> > > > On Wed, May 29, 2019 at 11:22:40AM +0800, Jason Wang wrote:
> > > > > On 2019/5/28 6:56 PM, Stefano Garzarella wrote:
> > > > > > @@ -690,6 +693,9 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
> > > > > > vsock->event_run = false;
> > > > > > mutex_unlock(&vsock->event_lock);
> > > > > > +   /* Flush all pending works */
> > > > > > +   virtio_vsock_flush_works(vsock);
> > > > > > +
> > > > > > /* Flush all device writes and interrupts, device will not use any
> > > > > >  * more buffers.
> > > > > >  */
> > > > > > @@ -726,6 +732,11 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
> > > > > > /* Delete virtqueues and flush outstanding callbacks if any */
> > > > > > vdev->config->del_vqs(vdev);
> > > > > > +   /* Other works can be queued before 'config->del_vqs()', so we flush
> > > > > > +* all works before to free the vsock object to avoid use after free.
> > > > > > +*/
> > > > > > +   virtio_vsock_flush_works(vsock);
> > > > > Some questions after a quick glance:
> > > > > 
> > > > > 1) It looks to me that the work could be queued from the path of
> > > > > vsock_transport_cancel_pkt(). Is that synchronized here?
> > > > > 
> > > > Both virtio_transport_send_pkt() and vsock_transport_cancel_pkt() can
> > > > queue work from the upper layer (socket).
> > > > 
> > > > Setting the_virtio_vsock to NULL should synchronize, but after a careful look
> > > > a rare issue could happen:
> > > > we are setting the_virtio_vsock to NULL at the start of .remove() and we
> > > > are freeing the object pointed by it at the end of .remove(), so
> > > > virtio_transport_send_pkt() or vsock_transport_cancel_pkt() may still be
> > > > running, accessing the object that we have freed.
> > > 
> > > Yes, that's my point.
> > > 
> > > 
> > > > Should I use something like RCU to prevent this issue?
> > > > 
> > > >   virtio_transport_send_pkt() and vsock_transport_cancel_pkt()
> > > >   {
> > > >   rcu_read_lock();
> > > >   vsock = rcu_dereference(the_virtio_vsock_mutex);
> > > 
> > > RCU is probably a way to go. (Like what vhost_transport_send_pkt() did).
> > > 
> > Okay, I'm going this way.
> > 
> > > >   ...
> > > >   rcu_read_unlock();
> > > >   }
> > > > 
> > > >   virtio_vsock_remove()
> > > >   {
> > > >   rcu_assign_pointer(the_virtio_vsock_mutex, NULL);
> > > >   synchronize_rcu();
> > > > 
> > > >   ...
> > > > 
> > > >   free(vsock);
> > > >   }
> > > > 
> > > > Could there be a better approach?
> > > > 
> > > > 
> > > > > 2) If we decide to flush after del_vqs(), is tx_run/rx_run/event_run still
> > > > > needed? It looks to me we've already done that, except that we need to
> > > > > flush rx_work in the end since send_pkt_work can requeue rx_work.
> > > > The main reason of tx_run/rx_run/event_run is to prevent a worker
> > > > function from running while we are calling config->reset().
> > > > 
> > > > E.g. if an interrupt comes between virtio_vsock_flush_works() and
> > > > config->reset(), it can queue new works that can access the device while
> > > > we are in config->reset().
> > > > 
> > > > IMHO they are still needed.
> > > > 
> > > > What do you think?
> > > 
> > > I mean could we simply do flush after reset once and without the
> > > tx_run/rx_run tricks?
> > > 
> > > reset();
> > > 
> > > virtio_vsock_flush_work();
> > > 
> > > virtio_vsock_free_buf();
> > My only doubt is:
> > is it safe to call config->reset() while a worker function could access
> > the device?
> > 
> > I had this doubt reading Michael's advice[1] and looking at
> > virtnet_remove() where there are these lines before the config->reset():
> > 
> > /* Make sure no work handler is accessing the device. */
> > flush_work(&vi->config_work);
> > 
> > Thanks,
> > Stefano
> > 
> > [1] https://lore.kernel.org/netdev/20190521055650-mutt-send-email-...@kernel.org
> 
> 
> Good point. Then I agree with you. But if we can use the RCU to detect the
> detach of device from socket for these, it would be even better.
> 

What about checking 'the_virtio_vsock' in the worker functions in a RCU
critical section?
In this way, I can remove the rx_run/tx_run/event_run.

Do you think it's cleaner?

Thank you very much,
Stefano

[RFC PATCH v2 04/12] x86/mm/tlb: Flush remote and local TLBs concurrently

2019-05-31 Thread Nadav Amit via Virtualization
To improve TLB shootdown performance, flush the remote and local TLBs
concurrently. Introduce flush_tlb_multi() that does so. The current
flush_tlb_others() interface is kept, since paravirtual interfaces need
to be adapted before it can be removed. This is left for future work.
In such PV environments, TLB flushes are not yet performed concurrently.

Add a static key to tell whether this new interface is supported.
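
A sketch of the intended dispatch (an assumption based on the static key
added below, not code taken from this patch):

    if (static_branch_likely(&flush_tlb_multi_enabled))
            flush_tlb_multi(cpumask, info);   /* remote and local, concurrently */
    else
            flush_tlb_others(cpumask, info);  /* PV fallback: remote CPUs only */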

Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Sasha Levin 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: Juergen Gross 
Cc: Paolo Bonzini 
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Boris Ostrovsky 
Cc: linux-hyp...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: k...@vger.kernel.org
Cc: xen-de...@lists.xenproject.org
Signed-off-by: Nadav Amit 
---
 arch/x86/hyperv/mmu.c |  2 +
 arch/x86/include/asm/paravirt.h   |  8 +++
 arch/x86/include/asm/paravirt_types.h |  6 ++
 arch/x86/include/asm/tlbflush.h   |  6 ++
 arch/x86/kernel/kvm.c |  1 +
 arch/x86/kernel/paravirt.c|  3 +
 arch/x86/mm/tlb.c | 80 +++
 arch/x86/xen/mmu_pv.c |  2 +
 8 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index e65d7fe6489f..ca28b400c87c 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -233,4 +233,6 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+
+   static_key_disable(&flush_tlb_multi_enabled.key);
 }
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c25c38a05c1c..192be7254457 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -47,6 +47,8 @@ static inline void slow_down_io(void)
 #endif
 }
 
+DECLARE_STATIC_KEY_TRUE(flush_tlb_multi_enabled);
+
 static inline void __flush_tlb(void)
 {
PVOP_VCALL0(mmu.flush_tlb_user);
@@ -62,6 +64,12 @@ static inline void __flush_tlb_one_user(unsigned long addr)
PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
 }
 
+static inline void flush_tlb_multi(const struct cpumask *cpumask,
+  const struct flush_tlb_info *info)
+{
+   PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
+}
+
 static inline void flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
 {
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 946f8f1f1efc..3a156e63c57d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -211,6 +211,12 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_one_user)(unsigned long addr);
+   /*
+* flush_tlb_multi() is the preferred interface. When it is used,
+* flush_tlb_others() should return false.
+*/
+   void (*flush_tlb_multi)(const struct cpumask *cpus,
+   const struct flush_tlb_info *info);
void (*flush_tlb_others)(const struct cpumask *cpus,
 const struct flush_tlb_info *info);
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index dee375831962..79272938cf79 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -569,6 +569,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
+void native_flush_tlb_multi(const struct cpumask *cpumask,
+const struct flush_tlb_info *info);
+
 void native_flush_tlb_others(const struct cpumask *cpumask,
 const struct flush_tlb_info *info);
 
@@ -593,6 +596,9 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 #ifndef CONFIG_PARAVIRT
+#define flush_tlb_multi(mask, info)\
+   native_flush_tlb_multi(mask, info)
+
 #define flush_tlb_others(mask, info)   \
native_flush_tlb_others(mask, info)
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 3f0cc828cc36..c1c2b88ea3f1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -643,6 +643,7 @@ static void __init kvm_guest_init(void)
kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+   static_key_disable(&flush_tlb_multi_enabled.key);
}
 
if