Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Hi Gleb.

On Wed, Dec 17, 2008 at 04:31:46PM +0200, Gleb Natapov (g...@redhat.com) wrote:
> Here it is. Sorry it is not in a patch format yet, but it gives a
> general idea of how it looks. The problem with connector is that
> we need a different IDX for each channel and there is no way
> to dynamically allocate them.

Looks very good. I especially liked how you used idx.val pairs to register
multiple users. Please add a comment in the connector header on how you use
it, and feel free to add my ack if needed.

-- 
	Evgeniy Polyakov
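For reference, the idx.val scheme being acked here hangs several consumers off one connector index by giving each channel its own cb_id value. A minimal sketch, assuming the 2.6.28-era connector API (callbacks take a void *); VMCHANNEL_CONNECTOR_IDX and the per-channel val are illustrative placeholders, not values taken from the posted code:

/* Illustrative only: one connector idx, one cb_id.val per channel. */
#include <linux/module.h>
#include <linux/connector.h>

#define VMCHANNEL_CONNECTOR_IDX 0x10    /* placeholder value */

static void chan_a_callback(void *data)
{
        struct cn_msg *msg = data;

        pr_debug("channel A: id %u.%u, %u bytes\n",
                 msg->id.idx, msg->id.val, msg->len);
}

static struct cb_id chan_a_id = { .idx = VMCHANNEL_CONNECTOR_IDX, .val = 1 };

static int __init chan_example_init(void)
{
        /* A second channel would register .val = 2 under the same idx. */
        return cn_add_callback(&chan_a_id, "vmchannel-a", chan_a_callback);
}

static void __exit chan_example_exit(void)
{
        cn_del_callback(&chan_a_id);
}

module_init(chan_example_init);
module_exit(chan_example_exit);
MODULE_LICENSE("GPL");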
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
On Wed, Dec 17, 2008 at 12:25:32AM +0300, Evgeniy Polyakov wrote:
> On Tue, Dec 16, 2008 at 08:57:27AM +0200, Gleb Natapov (g...@redhat.com) wrote:
> > > Another approach is to implement that virtio backend with netlink based
> > > userspace interface (like using connector or genetlink). This does not
> > > differ too much from what you have with special socket family, but at
> > > least it does not duplicate existing functionality of
> > > userspace-kernelspace communications.
> > >
> > I implemented vmchannel using connector initially (the downside is that
> > message can be dropped). Is this more expectable for upstream? The
> > implementation was 300 lines of code.
>
> Hard to tell, it depends on implementation. But if things are good, I
> have no objections as connector maintainer :)
>

Here it is. Sorry it is not in a patch format yet, but it gives a general
idea of how it looks. The problem with connector is that we need a
different IDX for each channel and there is no way to dynamically
allocate them.

-- 
	Gleb.

/*
 * Copyright (c) 2008 Red Hat, Inc.
 *
 * Author(s): Gleb Natapov
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include "vmchannel_connector.h"

static struct vmchannel_dev vmc_dev;

static int add_recq_buf(struct vmchannel_dev *vmc, struct vmchannel_hdr *hdr)
{
        struct scatterlist sg[2];

        sg_init_table(sg, 2);
        sg_init_one(&sg[0], hdr, sizeof(struct vmchannel_desc));
        sg_init_one(&sg[1], hdr->msg.data, MAX_PACKET_LEN);

        if (!vmc->rq->vq_ops->add_buf(vmc->rq, sg, 0, 2, hdr))
                return 1;

        kfree(hdr);
        return 0;
}

static int try_fill_recvq(struct vmchannel_dev *vmc)
{
        int num = 0;

        for (;;) {
                struct vmchannel_hdr *hdr;

                hdr = kmalloc(sizeof(*hdr) + MAX_PACKET_LEN, GFP_KERNEL);
                if (unlikely(!hdr))
                        break;

                if (!add_recq_buf(vmc, hdr))
                        break;
                num++;
        }

        if (num)
                vmc->rq->vq_ops->kick(vmc->rq);

        return num;
}

static void vmchannel_recv(unsigned long data)
{
        struct vmchannel_dev *vmc = (struct vmchannel_dev *)data;
        struct vmchannel_hdr *hdr;
        unsigned int len;
        int posted = 0;

        while ((hdr = vmc->rq->vq_ops->get_buf(vmc->rq, &len))) {
                hdr->msg.len = le32_to_cpu(hdr->desc.len);
                len -= sizeof(struct vmchannel_desc);
                if (hdr->msg.len == len) {
                        hdr->msg.id.idx = VMCHANNEL_CONNECTOR_IDX;
                        hdr->msg.id.val = le32_to_cpu(hdr->desc.id);
                        hdr->msg.seq = vmc->seq++;
                        hdr->msg.ack = random32();
                        cn_netlink_send(&hdr->msg, VMCHANNEL_CONNECTOR_IDX,
                                        GFP_ATOMIC);
                } else
                        dev_printk(KERN_ERR, &vmc->vdev->dev,
                                   "wrong length in received descriptor"
                                   " (%d instead of %d)\n",
                                   hdr->msg.len, len);

                posted += add_recq_buf(vmc, hdr);
        }

        if (posted)
                vmc->rq->vq_ops->kick(vmc->rq);
}

static void recvq_notify(struct virtqueue *recvq)
{
        struct vmchannel_dev *vmc = recvq->vdev->priv;

        tasklet_schedule(&vmc->tasklet);
}

static void cleanup_sendq(struct vmchannel_dev *vmc)
{
        char *buf;
        unsigned int len;

        spin_lock(&vmc->sq_lock);
        while ((buf = vmc->sq->vq_ops->get_buf(vmc->sq, &len)))
                kfree(buf);
        spin_unlock(&vmc->sq_lock);
}

static void sendq_notify(struct virtqueue *sendq)
{
        struct vmchannel_dev *vmc = sendq->vdev->priv;

        cleanup_sendq(vmc);
}

static void vmchannel_cn_callback(void *data)
{
        struct vmchannel_desc *desc;
        struct cn_msg *msg = data;
        struct scatterlist sg;
        char *buf;
        int err;
        unsigned long flags;

        desc = kmalloc(msg->len + sizeof(*desc), GFP_KERNEL);
        if (!desc)
                return;

        desc->id = cpu_to_le32(msg->id.val);
        desc->len = cpu_to_le32(msg->len);

        buf = (char *)(desc + 1);
        memcpy(buf, msg->data, msg->len);

        sg_init_one(&sg, desc, msg->len + sizeof(*desc));

        spin_lock_irqsave(&vmc_dev.sq_lock, flags);
        err = vmc_dev.sq->vq_ops->add_buf(vmc_dev.sq, &sg, 1, 0, desc);
        if (err)
                kfree(desc);
        else
                vmc_dev.sq->vq_ops->kick(vmc_dev.sq);
        spin_unlock_irqrestore(&vmc_dev.sq_lock, flags);
}

static int vmchannel_probe(struct virtio_device *vdev)
{
        struct vmchannel_dev *vmc = &vmc_dev;
        struct cb_id cn_id;
        int r, i;
        __le32 count;
        unsigned offset;

        cn_id.idx = VMCHANNEL_CONNECTOR_IDX;
        vdev->priv = vmc;
        vmc->vdev = vdev;

        vdev->config->get(vdev, 0, &count, sizeof(count));
        vmc->channel_count = le32_to_cpu(count);
        if (vmc->channel_count == 0) {
                dev_printk(KERN_ERR, &vdev->dev, "No channels present\n");
                return -ENODEV;
        }
        pr_debug("vmchannel: %d channel detected\n", vmc->channel_count);

        vmc->channels = kzalloc(vmc->channel_count * sizeof(struct vmchannel_info),
                                GFP_KERNEL);
        if (!vmc->channels)
                return -ENOMEM;

        offset = sizeof(count);
        for (i = 0; i < vmc->channel_count; i++) {
                __u32 len;
                __le32 tmp;

                vdev->config->get(vdev, offset, &tmp, 4);
                vmc->channels[i].id = le32_to_cpu(tmp);
                offset += 4;
                vdev->config->get(vdev, offset, &tmp, 4);
                len = le32_to_cpu(tmp);
                if (len > VMCHANNEL_NAME_MAX) {
                        dev_printk(KERN_ERR, &vdev->dev,
                                   "Wrong device configuration. "
                                   "Channel name is too long");
                        r = -ENOD
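The kernel side above hands each received buffer to userspace with cn_netlink_send(). For completeness, a guest agent could read those messages from a plain netlink socket; the following is a hedged userspace sketch, with VMCHANNEL_CONNECTOR_IDX as a placeholder since the real value would come from the (not posted) vmchannel_connector.h header:

/* Hypothetical guest agent: read vmchannel connector messages. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/connector.h>

#define VMCHANNEL_CONNECTOR_IDX 0x10            /* placeholder */

int main(void)
{
        struct sockaddr_nl sa = {
                .nl_family = AF_NETLINK,
                /* Works for small idx values; larger ones would need
                 * setsockopt(NETLINK_ADD_MEMBERSHIP) instead. */
                .nl_groups = VMCHANNEL_CONNECTOR_IDX,
                .nl_pid    = getpid(),
        };
        char buf[8192];
        int s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);

        if (s < 0 || bind(s, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                return 1;

        for (;;) {
                ssize_t n = recv(s, buf, sizeof(buf), 0);
                struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
                struct cn_msg *cn;

                if (n <= 0)
                        break;
                cn = NLMSG_DATA(nlh);   /* nlmsghdr, then cn_msg, then payload */
                printf("channel %u: %u bytes\n", cn->id.val, cn->len);
        }
        close(s);
        return 0;
}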
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Evgeniy Polyakov wrote: > On Tue, Dec 16, 2008 at 08:57:27AM +0200, Gleb Natapov (g...@redhat.com) > wrote: > >>> Another approach is to implement that virtio backend with netlink based >>> userspace interface (like using connector or genetlink). This does not >>> differ too much from what you have with special socket family, but at >>> least it does not duplicate existing functionality of >>> userspace-kernelspace communications. >>> >>> >> I implemented vmchannel using connector initially (the downside is that >> message can be dropped). Is this more expectable for upstream? The >> implementation was 300 lines of code. >> > > Hard to tell, it depends on implementation. But if things are good, I > have no objections as connector maintainer :) > > Messages in connector in particular and netlink in general are only > dropped, when receiving buffer is full (or when there is no memory), you > can tune buffer size to match virtual queue size or vice versa. > > Gleb was aware of that and it's not a problem since all of the anticipated usages may drop msgs (guest statistics, cut&paste, mouse movements, single sign on commands, etc). Service that would need reliability could use basic acks. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
On Tue, Dec 16, 2008 at 08:57:27AM +0200, Gleb Natapov (g...@redhat.com) wrote: > > Another approach is to implement that virtio backend with netlink based > > userspace interface (like using connector or genetlink). This does not > > differ too much from what you have with special socket family, but at > > least it does not duplicate existing functionality of > > userspace-kernelspace communications. > > > I implemented vmchannel using connector initially (the downside is that > message can be dropped). Is this more expectable for upstream? The > implementation was 300 lines of code. Hard to tell, it depends on implementation. But if things are good, I have no objections as connector maintainer :) Messages in connector in particular and netlink in general are only dropped, when receiving buffer is full (or when there is no memory), you can tune buffer size to match virtual queue size or vice versa. -- Evgeniy Polyakov ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
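Since the only drop condition Evgeniy mentions is a full receive buffer, the userspace end of such a channel can simply ask for a larger one. A small hedged sketch; the 1 MB figure is arbitrary and would in practice be sized against the virtqueue depth:

/* Sketch: enlarge the netlink receive buffer for a connector consumer. */
#include <sys/socket.h>
#include <linux/netlink.h>

static int open_connector_socket(void)
{
        int s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
        int rcvbuf = 1 << 20;   /* 1 MB, arbitrary */

        if (s < 0)
                return -1;
        /* SO_RCVBUFFORCE (needs CAP_NET_ADMIN) ignores rmem_max; fall back
         * to plain SO_RCVBUF, which is clamped to net.core.rmem_max. */
        if (setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, &rcvbuf, sizeof(rcvbuf)) < 0)
                setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
        return s;
}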
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
On Tue, Dec 16, 2008 at 02:45:11AM +0300, Evgeniy Polyakov wrote: > Hi Anthony. > > On Mon, Dec 15, 2008 at 05:01:14PM -0600, Anthony Liguori > (anth...@codemonkey.ws) wrote: > > Yes, and I went down the road of using a dedicated network device and > > using raw ethernet as the protocol. The thing that killed that was the > > fact that it's not reliable. You need something like TCP to add > > reliability. > > > > But that's a lot of work and a bit backwards. Use a unreliable > > transport but use TCP on top of it to get reliability. Our link > > (virtio) is inherently reliable so why not just expose a reliable > > interface to userspace? > > I removed original mail and did not check archive, but doesn't rx/tx > queues of the virtio device have limited size? I do hope they have, > which means that either your network drops packets or blocks. > It blocks. > Another approach is to implement that virtio backend with netlink based > userspace interface (like using connector or genetlink). This does not > differ too much from what you have with special socket family, but at > least it does not duplicate existing functionality of > userspace-kernelspace communications. > I implemented vmchannel using connector initially (the downside is that message can be dropped). Is this more expectable for upstream? The implementation was 300 lines of code. -- Gleb. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Anthony Liguori wrote:
>
> If we used TCP, we don't have a useful TCP/IP stack in QEMU, so we'd
> have to inject that traffic into the host Linux instance, and then
> receive the traffic in QEMU. Besides being indirect, it has some nasty
> security implications that I outlined in my response to Jeremy's last note.

When combined with namespaces I don't see why using the kernel TCP stack
would create any security problems that wouldn't otherwise exist.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Evgeniy Polyakov wrote: > On Mon, Dec 15, 2008 at 05:08:29PM -0600, Anthony Liguori > (anth...@codemonkey.ws) wrote: > >> The KVM model is that a guest is a process. Any IO operations original >> from the process (QEMU). The advantage to this is that you get very >> good security because you can use things like SELinux and simply treat >> the QEMU process as you would the guest. In fact, in general, I think >> we want to assume that QEMU is guest code from a security perspective. >> >> By passing up the network traffic to the host kernel, we now face a >> problem when we try to get the data back. We could setup a tun device >> to send traffic to the kernel but then the rest of the system can see >> that traffic too. If that traffic is sensitive, it's potentially unsafe. >> > > You can even use unix sockets in this case, and each socket will be > named as virtio channels names. IIRC tun/tap devices can be virtualizen > with recent kernels, which also solves all problems of shared access. > > There are plenty of ways to implement this kind of functionality instead > of developing some new protocol, which is effectively a duplication of > what already exists in the kernel. > > Well, it is kinda pv-unix-domain-socket. I did not understand how a standard unix domain in the guest can reach the host according to your solution. The initial implementation was some sort of pv-serial. Serial itself is low performing and there is no naming services what so every. Gleb did offer the netlink option as a beginning but we though a new address family would be more robust (you say too robust). So by suggestion new address family what can think of it as a pv-unix-domain-socket. Networking IS used since we think it is a good 'wheel'. Indeed, David is right that instead of adding a new chunk of code we can re-use the existing one. But we do have some 'new' (afraid to tell virtualization) problems that might prevent us of using a standard virtual nic: - Even if we can teach iptables to ignore this interface, other 3rd firewall might not obey: What if the VM is a Checkpoint firewall? What if the VM is windows? + using a non MS firewall? - Who will assign IPs for the vnic? How can I assure there is no ip clash? The standard dhcp for the other standard vnics might not be in our control. So I do understand the idea of using a standard network interface. It's just not that simple. So ideas to handle the above are welcomed. Otherwise we might need to go back to serial/pv-serial approach. btw: here are the usages/next usages of vmchannel: VMchannel is a host-guest interface and in the future guest-guest interface. Currently/soon it is used for - guest statistics - guest info - guest single sign own - guest log-in log-out - mouse channel for multiple monitors - cut&paste (guest-host, sometimes client-host-guest, company firewall blocks client-guest). - fencing (potentially) tw2: without virtualization we wouldn't have new passionate issues to discuss about! Cheers, Dor ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
On Mon, Dec 15, 2008 at 05:08:29PM -0600, Anthony Liguori (anth...@codemonkey.ws) wrote: > The KVM model is that a guest is a process. Any IO operations original > from the process (QEMU). The advantage to this is that you get very > good security because you can use things like SELinux and simply treat > the QEMU process as you would the guest. In fact, in general, I think > we want to assume that QEMU is guest code from a security perspective. > > By passing up the network traffic to the host kernel, we now face a > problem when we try to get the data back. We could setup a tun device > to send traffic to the kernel but then the rest of the system can see > that traffic too. If that traffic is sensitive, it's potentially unsafe. You can even use unix sockets in this case, and each socket will be named as virtio channels names. IIRC tun/tap devices can be virtualizen with recent kernels, which also solves all problems of shared access. There are plenty of ways to implement this kind of functionality instead of developing some new protocol, which is effectively a duplication of what already exists in the kernel. -- Evgeniy Polyakov ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
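One way to picture the unix-socket suggestion: the host-side backend (QEMU or a helper) listens on one AF_UNIX socket per channel name. A hedged sketch; the /var/run path layout is purely illustrative and not part of any proposed interface:

/* Hypothetical host-side listener for one named channel,
 * e.g. "/var/run/vmchannel/guest1/cut+paste". */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>

static int listen_on_channel(const char *path)
{
        struct sockaddr_un addr = { .sun_family = AF_UNIX };
        int s = socket(AF_UNIX, SOCK_STREAM, 0);

        if (s < 0)
                return -1;
        strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
        unlink(path);           /* drop a stale socket file, if any */
        if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(s, 1) < 0) {
                close(s);
                return -1;
        }
        return s;
}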
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Hi Anthony. On Mon, Dec 15, 2008 at 05:01:14PM -0600, Anthony Liguori (anth...@codemonkey.ws) wrote: > Yes, and I went down the road of using a dedicated network device and > using raw ethernet as the protocol. The thing that killed that was the > fact that it's not reliable. You need something like TCP to add > reliability. > > But that's a lot of work and a bit backwards. Use a unreliable > transport but use TCP on top of it to get reliability. Our link > (virtio) is inherently reliable so why not just expose a reliable > interface to userspace? I removed original mail and did not check archive, but doesn't rx/tx queues of the virtio device have limited size? I do hope they have, which means that either your network drops packets or blocks. Having dedicated preconfigured network device is essentially the same as having this special socket option: guests which do not have this (either network or vchannel socket) will not be able to communicate with the host, so there is no difference. Except that usual network will just work out of the box (and especially you will like it when there will be no need to hack on X to support new network media). Another approach is to implement that virtio backend with netlink based userspace interface (like using connector or genetlink). This does not differ too much from what you have with special socket family, but at least it does not duplicate existing functionality of userspace-kernelspace communications. But IMO having special network device or running your protocol over existing virtio network device is a cleaner solution both from technical and convenience points of view. -- Evgeniy Polyakov ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Anthony Liguori wrote: > Jeremy Fitzhardinge wrote: >> Anthony Liguori wrote: >>> >>> That seems unnecessarily complex. >>> >> >> Well, the simplest thing is to let the host TCP stack do TCP. Could >> you go into more detail about why you'd want to avoid that? > > The KVM model is that a guest is a process. Any IO operations > original from the process (QEMU). The advantage to this is that you > get very good security because you can use things like SELinux and > simply treat the QEMU process as you would the guest. In fact, in > general, I think we want to assume that QEMU is guest code from a > security perspective. > > By passing up the network traffic to the host kernel, we now face a > problem when we try to get the data back. We could setup a tun device > to send traffic to the kernel but then the rest of the system can see > that traffic too. If that traffic is sensitive, it's potentially unsafe. Well, one could come up with a mechanism to bind an interface to be only visible to a particular context/container/something. > You can use iptables to restrict who can receive traffic and possibly > use SELinux packet tagging or whatever. This gets extremely complex > though. Well, if you can just tag everything based on interface its relatively simple. > It's far easier to avoid the host kernel entirely and implement the > backends in QEMU. Then any actions the backend takes will be on > behalf of the guest. You never have to worry about transport data > leakage. Well, a stream-like protocol layered over a reliable packet transport would get you there without the complexity of tcp. Or just do a usermode tcp; its not that complex if you really think it simplifies the other aspects. > >>> This is why I've been pushing for the backends to be implemented in >>> QEMU. Then QEMU can marshal the backend-specific state and transfer >>> it during live migration. For something like copy/paste, this is >>> obvious (the clipboard state). A general command interface is >>> probably stateless so it's a nop. >>> >> >> Copy/paste seems like a particularly bogus example. Surely this >> isn't a sensible way to implement it? > > I think it's the most sensible way to implement it. Would you suggest > something different? Well, off the top of my head I'm assuming the requirements are: * the goal is to unify the user's actual desktop session with a virtual session within a vm * a given user may have multiple VMs running on their desktop * a VM may be serving multiple user sessions * the VMs are not necessarily hosted by the user's desktop machine * the VMs can migrate at any moment To me that looks like a daemon running within the context of each of the user's virtual sessions monitoring clipboard events, talking over a TCP connection to a corresponding daemon in their desktop session, which is responsible for reconciling cuts and pastes in all the various sessions. I guess you'd say that each VM would multiplex all its cut/paste events via its AF_VMCHANNEL/cut+paste channel to its qemu, which would then demultiplex them off to the user's real desktops. And that since the VM itself may have no networking, it needs to be a special magic connection. And my counter argument to this nicely placed straw man is that the VM<->qemu connection can still be TCP, even if its a private network with no outside access. > >>> I'm not a fan of having external backends to QEMU for the very >>> reasons you outline above. You cannot marshal the state of a >>> channel we know nothing about. 
We're really just talking about >>> extending virtio in a guest down to userspace so that we can >>> implement paravirtual device drivers in guest userspace. This may >>> be an X graphics driver, a mouse driver, copy/paste, remote >>> shutdown, etc. >>> A socket seems like a natural choice. If that's wrong, then we >>> can explore other options (like a char device, virtual fs, etc.). >> >> I think a socket is a pretty poor choice. It's too low level, and it >> only really makes sense for streaming data, not for data storage >> (name/value pairs). It means that everyone ends up making up their >> own serializations. A filesystem view with notifications seems to be >> a better match for the use-cases you mention (aside from cut/paste), >> with a single well-defined way to serialize onto any given channel. >> Each "file" may well have an application-specific content, but in >> general that's going to be something pretty simple. > > I had suggested a virtual file system at first and was thoroughly > ridiculed for it :-) There is a 9p virtio transport already so we > could even just use that. You mean 9p directly over a virtio ringbuffer rather than via the network stack? You could do that, but I'd still argue that using the network stack is a better approach. > The main issue with a virtual file system is that it does map well to > other gu
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
David Miller wrote: > From: Anthony Liguori > Date: Mon, 15 Dec 2008 17:01:14 -0600 > > >> No, TCP falls under the not simple category because it requires the >> backend to have access to a TCP/IP stack. >> > > I'm at a loss for words if you need TCP in the hypervisor, if that's > what you're implying here. > No. KVM is not a traditional "hypervisor". It's more of a userspace accelerator for emulators. QEMU, a system emulator, calls in to the Linux kernel whenever it needs to run guest code. Linux returns to QEMU whenever the guest has done an MMIO operation or something of that nature. In this way, all of our device emulation (including paravirtual backends) are implemented in the host userspace in the QEMU process. If we used TCP, we don't have a useful TCP/IP stack in QEMU, so we'd have to inject that traffic into the host Linux instance, and then receive the traffic in QEMU. Besides being indirect, it has some nasty security implications that I outlined in my response to Jeremy's last note. Regards, Anthony Liguori > You only need it in the guest and the host, which you already have, > in the Linux kernel. Just transport that over virtio or whatever > and be done with it. > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
On Mon, 15 Dec 2008 17:01:14 -0600 Anthony Liguori wrote: > David Miller wrote: > > From: Anthony Liguori > > Date: Mon, 15 Dec 2008 14:44:26 -0600 > > > > > >> We want this communication mechanism to be simple and reliable as we > >> want to implement the backends drivers in the host userspace with > >> minimum mess. > >> > > > > One implication of your statement here is that TCP is unreliable. > > That's absolutely not true. > > > > No, TCP falls under the not simple category because it requires the > backend to have access to a TCP/IP stack. > > >> Within the guest, we need the interface to be always available and > >> we need an addressing scheme that is hypervisor specific. Yes, we > >> can build this all on top of TCP/IP. We could even build it on top > >> of a serial port. Both have their down-sides wrt reliability and > >> complexity. > >> > > > > I don't know of any zero-copy through the hypervisor mechanisms for > > serial ports, but I know we do that with the various virtualization > > network devices. > > > > Yes, and I went down the road of using a dedicated network device and > using raw ethernet as the protocol. The thing that killed that was the > fact that it's not reliable. You need something like TCP to add > reliability. > > But that's a lot of work and a bit backwards. Use a unreliable > transport but use TCP on top of it to get reliability. Our link > (virtio) is inherently reliable so why not just expose a reliable > interface to userspace? > > >> Do you have another recommendation? > >> > > > > I don't have to make alternative recommendations until you can > > show that what we have can't solve the problem acceptably, and > > TCP emphatically can. > > > > It can solve the problem but I don't think it's the best way to solve > the problem mainly because the complexity it demands on the backend. "Those who don't understand TCP are doomed to reimplement it, badly." ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
From: Anthony Liguori
Date: Mon, 15 Dec 2008 17:01:14 -0600

> No, TCP falls under the not simple category because it requires the
> backend to have access to a TCP/IP stack.

I'm at a loss for words if you need TCP in the hypervisor, if that's
what you're implying here.

You only need it in the guest and the host, which you already have,
in the Linux kernel. Just transport that over virtio or whatever
and be done with it.
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Jeremy Fitzhardinge wrote: > Anthony Liguori wrote: >> >> That seems unnecessarily complex. >> > > Well, the simplest thing is to let the host TCP stack do TCP. Could > you go into more detail about why you'd want to avoid that? The KVM model is that a guest is a process. Any IO operations original from the process (QEMU). The advantage to this is that you get very good security because you can use things like SELinux and simply treat the QEMU process as you would the guest. In fact, in general, I think we want to assume that QEMU is guest code from a security perspective. By passing up the network traffic to the host kernel, we now face a problem when we try to get the data back. We could setup a tun device to send traffic to the kernel but then the rest of the system can see that traffic too. If that traffic is sensitive, it's potentially unsafe. You can use iptables to restrict who can receive traffic and possibly use SELinux packet tagging or whatever. This gets extremely complex though. It's far easier to avoid the host kernel entirely and implement the backends in QEMU. Then any actions the backend takes will be on behalf of the guest. You never have to worry about transport data leakage. >> This is why I've been pushing for the backends to be implemented in >> QEMU. Then QEMU can marshal the backend-specific state and transfer >> it during live migration. For something like copy/paste, this is >> obvious (the clipboard state). A general command interface is >> probably stateless so it's a nop. >> > > Copy/paste seems like a particularly bogus example. Surely this isn't > a sensible way to implement it? I think it's the most sensible way to implement it. Would you suggest something different? >> I'm not a fan of having external backends to QEMU for the very >> reasons you outline above. You cannot marshal the state of a channel >> we know nothing about. We're really just talking about extending >> virtio in a guest down to userspace so that we can implement >> paravirtual device drivers in guest userspace. This may be an X >> graphics driver, a mouse driver, copy/paste, remote shutdown, etc. >> A socket seems like a natural choice. If that's wrong, then we can >> explore other options (like a char device, virtual fs, etc.). > > I think a socket is a pretty poor choice. It's too low level, and it > only really makes sense for streaming data, not for data storage > (name/value pairs). It means that everyone ends up making up their > own serializations. A filesystem view with notifications seems to be > a better match for the use-cases you mention (aside from cut/paste), > with a single well-defined way to serialize onto any given channel. > Each "file" may well have an application-specific content, but in > general that's going to be something pretty simple. I had suggested a virtual file system at first and was thoroughly ridiculed for it :-) There is a 9p virtio transport already so we could even just use that. The main issue with a virtual file system is that it does map well to other guests. It's actually easier to implement a socket interface for Windows than it is to implement a new file system. But we could find ways around this with libraries. If we used 9p as a transport, we could just provide a char device in Windows that received it in userspace. >> This shouldn't be confused with networking though and all the talk >> of doing silly things like streaming fence traffic through it just >> encourages the confusion. > > I'm not sure what you're referring to here. 
I'm just ranting, it's not important.

Regards,

Anthony Liguori
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
David Miller wrote: > From: Anthony Liguori > Date: Mon, 15 Dec 2008 14:44:26 -0600 > > >> We want this communication mechanism to be simple and reliable as we >> want to implement the backends drivers in the host userspace with >> minimum mess. >> > > One implication of your statement here is that TCP is unreliable. > That's absolutely not true. > No, TCP falls under the not simple category because it requires the backend to have access to a TCP/IP stack. >> Within the guest, we need the interface to be always available and >> we need an addressing scheme that is hypervisor specific. Yes, we >> can build this all on top of TCP/IP. We could even build it on top >> of a serial port. Both have their down-sides wrt reliability and >> complexity. >> > > I don't know of any zero-copy through the hypervisor mechanisms for > serial ports, but I know we do that with the various virtualization > network devices. > Yes, and I went down the road of using a dedicated network device and using raw ethernet as the protocol. The thing that killed that was the fact that it's not reliable. You need something like TCP to add reliability. But that's a lot of work and a bit backwards. Use a unreliable transport but use TCP on top of it to get reliability. Our link (virtio) is inherently reliable so why not just expose a reliable interface to userspace? >> Do you have another recommendation? >> > > I don't have to make alternative recommendations until you can > show that what we have can't solve the problem acceptably, and > TCP emphatically can. > It can solve the problem but I don't think it's the best way to solve the problem mainly because the complexity it demands on the backend. Regards, Anthony Liguori ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Anthony Liguori wrote: > Jeremy Fitzhardinge wrote: > >>> Each of these sockets are going to be connected to a backend (to >>> implement guest<=>copy/paste for instance). We want to implement >>> those backends in userspace and preferably in QEMU. >>> >>> Using some raw protocol over ethernet means you don't have >>> reliability. If you use a protocol to get reliability (like TCP), >>> you now have to implement a full TCP/IP stack in userspace or get the >>> host kernel involved. I'd rather not get the host kernel involved >>> from a security perspective. >>> >>> >> There's nothing wrong with user-mode TCP, or you could run your TCP >> stack in a special-purpose guest if you're really paranoid. >> > > That seems unnecessarily complex. > Well, the simplest thing is to let the host TCP stack do TCP. Could you go into more detail about why you'd want to avoid that? > This is why I've been pushing for the backends to be implemented in > QEMU. Then QEMU can marshal the backend-specific state and transfer it > during live migration. For something like copy/paste, this is obvious > (the clipboard state). A general command interface is probably > stateless so it's a nop. > Copy/paste seems like a particularly bogus example. Surely this isn't a sensible way to implement it? > I'm not a fan of having external backends to QEMU for the very reasons > you outline above. You cannot marshal the state of a channel we know > nothing about. We're really just talking about extending virtio in a > guest down to userspace so that we can implement paravirtual device > drivers in guest userspace. This may be an X graphics driver, a mouse > driver, copy/paste, remote shutdown, etc. > > A socket seems like a natural choice. If that's wrong, then we can > explore other options (like a char device, virtual fs, etc.). I think a socket is a pretty poor choice. It's too low level, and it only really makes sense for streaming data, not for data storage (name/value pairs). It means that everyone ends up making up their own serializations. A filesystem view with notifications seems to be a better match for the use-cases you mention (aside from cut/paste), with a single well-defined way to serialize onto any given channel. Each "file" may well have an application-specific content, but in general that's going to be something pretty simple. > This > shouldn't be confused with networking though and all the talk of doing > silly things like streaming fence traffic through it just encourages the > confusion. I'm not sure what you're referring to here. J ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
From: Anthony Liguori
Date: Mon, 15 Dec 2008 14:44:26 -0600

> We want this communication mechanism to be simple and reliable as we
> want to implement the backends drivers in the host userspace with
> minimum mess.

One implication of your statement here is that TCP is unreliable.
That's absolutely not true.

> Within the guest, we need the interface to be always available and
> we need an addressing scheme that is hypervisor specific. Yes, we
> can build this all on top of TCP/IP. We could even build it on top
> of a serial port. Both have their down-sides wrt reliability and
> complexity.

I don't know of any zero-copy through the hypervisor mechanisms for
serial ports, but I know we do that with the various virtualization
network devices.

> Do you have another recommendation?

I don't have to make alternative recommendations until you can
show that what we have can't solve the problem acceptably, and
TCP emphatically can.
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
David Miller wrote: > From: Anthony Liguori > Date: Mon, 15 Dec 2008 09:02:23 -0600 > > >> There is already an AF_IUCV for s390. >> > > This is a scarecrow and irrelevant to this discussion. > > And this is exactly why I asked that any arguments in this thread > avoid talking about virtualization technology and why it's "special." > You cannot completely avoid talking about virtualization here. I agree that an argument based on, "we need it for virtualization", why?, "virtualization!" is not sufficient. You still didn't address my earlier question though. What we need is a mechanism for implementing paravirtual device drivers in userspace. On a modern Linux system, a lot of important things are done in userspace (mostly around X) so having some code in the guest's userspace is important. We want this communication mechanism to be simple and reliable as we want to implement the backends drivers in the host userspace with minimum mess. Within the guest, we need the interface to be always available and we need an addressing scheme that is hypervisor specific. Yes, we can build this all on top of TCP/IP. We could even build it on top of a serial port. Both have their down-sides wrt reliability and complexity. The most natural userspace interface that meets all of these requirements would appear to be a new socket family. We could also use another userspace interface (netlink was originally proposed, a chardev is possible, or a virtual file system). Do you have another recommendation? Regards, Anthony Liguori > This proposed patch here is asking to add new infrastructure for > hypervisor facilities that will be _ADDED_ and for which we have > complete control over. > > Whereas the S390 folks have to deal with existing infrastructure which > is largely outside of their control. So if they implement access > mechanisms for that, it's fine. > > I would be doing the same thing if I added a protocol socket layer for > accessing the Niagara hypervisor virtualization channels. > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
From: Anthony Liguori
Date: Mon, 15 Dec 2008 09:02:23 -0600

> There is already an AF_IUCV for s390.

This is a scarecrow and irrelevant to this discussion.

And this is exactly why I asked that any arguments in this thread
avoid talking about virtualization technology and why it's "special."

This proposed patch here is asking to add new infrastructure for
hypervisor facilities that will be _ADDED_ and which we have
complete control over.

Whereas the S390 folks have to deal with existing infrastructure which
is largely outside of their control. So if they implement access
mechanisms for that, it's fine.

I would be doing the same thing if I added a protocol socket layer for
accessing the Niagara hypervisor virtualization channels.
RE: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
> -----Original Message-----
> From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
> Behalf Of Jeremy Fitzhardinge
> The trouble is that it presumes that the host and guest (or whoever the
> endpoints are) are on the same physical machine and will remain that
> way. Given that live migration is a feature that people seem to like,
> then you'd end up needing to transport this protocol over a real network
> anyway - and at that point you may as well use proper TCP/IP. The
> alternative is to say either "if you use this feature you can't migrate,
> and you can only resume on the same host", or "you can use this feature,
> and we'll work out a global namespace and proxy it over TCP for you".
> Neither seems very satisfactory.

[IH] When migrating a guest to another host, migration takes care of
closing/reopening the VMChannel on the target host. The VMChannel is
local to the hypervisor and not accessible via the network, so migration
is not a reason for the VMChannel to use TCP/IP.
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Jeremy Fitzhardinge wrote: >> Each of these sockets are going to be connected to a backend (to >> implement guest<=>copy/paste for instance). We want to implement >> those backends in userspace and preferably in QEMU. >> >> Using some raw protocol over ethernet means you don't have >> reliability. If you use a protocol to get reliability (like TCP), >> you now have to implement a full TCP/IP stack in userspace or get the >> host kernel involved. I'd rather not get the host kernel involved >> from a security perspective. >> > > There's nothing wrong with user-mode TCP, or you could run your TCP > stack in a special-purpose guest if you're really paranoid. That seems unnecessarily complex. >> An inherently reliable socket transport solves the above problem >> while keeping things simple. Note, this is not a new concept. There >> is already an AF_IUCV for s390. VMware is also developing an AF_VMCI >> socket family. >> > > The trouble is that it presumes that the host and guest (or whoever > the endpoints are) are on the same physical machine and will remain > that way. Given that live migration is a feature that people seem to > like, then you'd end up needing to transport this protocol over a real > network anyway - and at that point you may as well use proper > TCP/IP. The alternative is to say either "if you use this feature > you can't migrate, and you can only resume on the same host", or "you > can use this feature, and we'll work out a global namespace and proxy > it over TCP for you". Neither seems very satisfactory. This is why I've been pushing for the backends to be implemented in QEMU. Then QEMU can marshal the backend-specific state and transfer it during live migration. For something like copy/paste, this is obvious (the clipboard state). A general command interface is probably stateless so it's a nop. I'm not a fan of having external backends to QEMU for the very reasons you outline above. You cannot marshal the state of a channel we know nothing about. We're really just talking about extending virtio in a guest down to userspace so that we can implement paravirtual device drivers in guest userspace. This may be an X graphics driver, a mouse driver, copy/paste, remote shutdown, etc. A socket seems like a natural choice. If that's wrong, then we can explore other options (like a char device, virtual fs, etc.). This shouldn't be confused with networking though and all the talk of doing silly things like streaming fence traffic through it just encourages the confusion. Regards, Anthony Liguori >J ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Anthony Liguori wrote: > David Miller wrote: > >> From: Gleb Natapov >> Date: Sun, 14 Dec 2008 13:50:55 +0200 >> >> >> >>> It is undesirable to use TCP/IP for this purpose since network >>> connectivity may not exist between host and guest and if it exists the >>> traffic can be not routable between host and guest for security reasons >>> or TCP/IP traffic can be firewalled (by mistake) by unsuspecting VM user. >>> >>> >> I don't really accept this argument, sorry. >> Yes. There's no reason why the management stack couldn't implement its own private idiot-proofing network for this kind of thing. > Each of these sockets are going to be connected to a backend (to > implement guest<=>copy/paste for instance). We want to implement those > backends in userspace and preferably in QEMU. > > Using some raw protocol over ethernet means you don't have reliability. > If you use a protocol to get reliability (like TCP), you now have to > implement a full TCP/IP stack in userspace or get the host kernel > involved. I'd rather not get the host kernel involved from a security > perspective. > There's nothing wrong with user-mode TCP, or you could run your TCP stack in a special-purpose guest if you're really paranoid. > An inherently reliable socket transport solves the above problem while > keeping things simple. Note, this is not a new concept. There is > already an AF_IUCV for s390. VMware is also developing an AF_VMCI > socket family. > The trouble is that it presumes that the host and guest (or whoever the endpoints are) are on the same physical machine and will remain that way. Given that live migration is a feature that people seem to like, then you'd end up needing to transport this protocol over a real network anyway - and at that point you may as well use proper TCP/IP. The alternative is to say either "if you use this feature you can't migrate, and you can only resume on the same host", or "you can use this feature, and we'll work out a global namespace and proxy it over TCP for you". Neither seems very satisfactory. J ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Hi Gleb. On Sun, Dec 14, 2008 at 01:50:55PM +0200, Gleb Natapov (g...@redhat.com) wrote: > There is a need for communication channel between host and various > agents that are running inside a VM guest. The channel will be used > for statistic gathering, logging, cut & paste, host screen resolution > changes notifications, guest configuration etc. > > It is undesirable to use TCP/IP for this purpose since network > connectivity may not exist between host and guest and if it exists the > traffic can be not routable between host and guest for security reasons > or TCP/IP traffic can be firewalled (by mistake) by unsuspecting VM user. > > This patch implement new address family AF_VMCHANNEL that is used > for communication between guest and host. Channels are created at VM > start time. Each channel has a name. Agent, that runs on a guest, can > send/receive data to/from a channel by creating AF_VMCHANNEL socket and > connecting to a channel using channels name as an address. > > Only stream sockets are supported by this implementation. Also only > connect, sendmsg and recvmsg socket ops are implemented which is enough > to allow application running in a guest to connect to a channel created > by a host and read/write from/to the channel. This can be extended to > allow channel creation from inside a guest by creating listen socket and > accepting on it if the need will arise and thus even allow guest<->guest > communication in the future (but TCP/IP may be preferable for this). Couple of comments on this. First, there is only single virtio device initialized at probe time, how this will work on the host system with multiple guests? Is it possible to have multiple virtual devices? Second, each virtual device has an array of names, and each socket can be bound to one of them, but it is not allowed to have multiple sockets bound to the same name, so it looks like there is no possibility to have several sockets communicating via signel channel, was this intentional? And third, tasklet callbacks do not use bh socket locking, and while it is not something bad, but rt folks want (dream) to replace it with process context, so this at least requires some note in comments. Except that about questions, this patch looks good. -- Evgeniy Polyakov ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
David Miller wrote: > From: Gleb Natapov > Date: Sun, 14 Dec 2008 13:50:55 +0200 > > >> It is undesirable to use TCP/IP for this purpose since network >> connectivity may not exist between host and guest and if it exists the >> traffic can be not routable between host and guest for security reasons >> or TCP/IP traffic can be firewalled (by mistake) by unsuspecting VM user. >> > > I don't really accept this argument, sorry. > I couldn't agree more. That doesn't mean I don't think this isn't valuable though. Each of these sockets are going to be connected to a backend (to implement guest<=>copy/paste for instance). We want to implement those backends in userspace and preferably in QEMU. Using some raw protocol over ethernet means you don't have reliability. If you use a protocol to get reliability (like TCP), you now have to implement a full TCP/IP stack in userspace or get the host kernel involved. I'd rather not get the host kernel involved from a security perspective. An inherently reliable socket transport solves the above problem while keeping things simple. Note, this is not a new concept. There is already an AF_IUCV for s390. VMware is also developing an AF_VMCI socket family. Regards, Anthony Liguori ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
From: Gleb Natapov Date: Mon, 15 Dec 2008 09:48:19 +0200 > On Sun, Dec 14, 2008 at 10:44:36PM -0800, David Miller wrote: > > You guys really need to rethink this. Either a stream protocol is a > > workable solution to your problem, or it isn't. > > Stream protocol is workable solution for us, but we need it out of band > in regard to networking and as much zero config as possible. If we will > use networking how can it be done without additional configuration (and > reconfiguration can be required after migration BTW) You miss the whole point and you also missed the part where I said (and the one part of my comments you conveniently did NOT quote): And don't bring up any "virtualization is special because..." arguments into your reply because virtualization has nothing to do with my objections stated above. What part of that do you not understand? Don't give me this junk about zero config, it's not a plausible argument against anything I said. You want to impose a new burdon onto the kernel in the form of a whole new socket layer. When existing ones can solve any communications problem. Performance is not a good argument because we have (repeatedly) made TCP/IP go fast in just about any environment. If you have a configuration problem, you can solve it in userspace in a number of different ways. Building on top of things we have and the user need not know anything about that. I would even be OK with features such as "permanent" links or special attributes for devices or IP addresses that by default prevent tampering and filtering by things like netfilter. But not this new thing that duplicates existing functionality, no way. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
On Sun, Dec 14, 2008 at 10:44:36PM -0800, David Miller wrote: > From: Gleb Natapov > Date: Sun, 14 Dec 2008 13:50:55 +0200 > > > It is undesirable to use TCP/IP for this purpose since network > > connectivity may not exist between host and guest and if it exists the > > traffic can be not routable between host and guest for security reasons > > or TCP/IP traffic can be firewalled (by mistake) by unsuspecting VM user. > > I don't really accept this argument, sorry. > > If you can't use TCP because it might be security protected or > misconfigured, adding this new stream protocol thing is not one > bit better. It doesn't make any sense at all. > It can be _accidentally_ misconfigured. Just think about sysadmin that has a remote access to a VM guest and he doesn't even know that it is a VM. (He can easily find this out but why should he care?). The sysadmin knows that the first rule of firewalling is deny everything and than allow what you want to be allowed, so that what he does and cut communication between host and guest. The problem with networking is that it is visible to VM user and perceived to be under full user control. > Also, if TCP could be "misconfigured" this new thing could just as > easily be screwed up too. And I wouldn't be surprised to see a whole > bunch of SELINUX and netfilter features proposed later for this and > then we're back to square one. > It not only can be missconfigured it may not exist between guest and host at all. IP connectivity between guest and host is not mandatory and we don't want to make it such. It is like saying "who needs serial console, just use ssh". And what subnet should be used for this purpose? Who will solve conflicts? I can see why SELINUX features may be proposed for vmchannel, but netfilter doesn't make sense for it. And vmchannel also has other advantages over TCP/IP: less overhead and better "naming". By better naming I mean that guest should not guess (or require configuration) what ip:port should be used for cut&paste, it just connects to address "cut&paste". > You guys really need to rethink this. Either a stream protocol is a > workable solution to your problem, or it isn't. > Stream protocol is workable solution for us, but we need it out of band in regard to networking and as much zero config as possible. If we will use networking how can it be done without additional configuration (and reconfiguration can be required after migration BTW) -- Gleb. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
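The naming argument is easiest to see from the guest side: with the posted patch, an agent just opens an AF_VMCHANNEL stream socket and connects to the channel's name. A hedged userspace sketch based on the sockaddr_vmchannel layout in the posted patch; the address-family number 36 is the value proposed there, and the definitions are inlined only because no userspace header exists yet:

/* Hypothetical guest agent connecting to the "cut&paste" channel. */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

#define AF_VMCHANNEL            36      /* value proposed in the patch */
#define VMCHANNEL_NAME_MAX      80

struct sockaddr_vmchannel {             /* mirrors include/linux/vmchannel.h from the patch */
        sa_family_t svmchannel_family;
        char svmchannel_name[VMCHANNEL_NAME_MAX];
};

int main(void)
{
        struct sockaddr_vmchannel addr = { .svmchannel_family = AF_VMCHANNEL };
        int s = socket(AF_VMCHANNEL, SOCK_STREAM, 0);

        if (s < 0)
                return 1;
        strncpy(addr.svmchannel_name, "cut&paste", VMCHANNEL_NAME_MAX - 1);
        if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                return 1;
        write(s, "hello", 5);   /* connect/sendmsg/recvmsg are the ops the patch implements */
        close(s);
        return 0;
}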
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
From: Gleb Natapov Date: Sun, 14 Dec 2008 13:50:55 +0200 > It is undesirable to use TCP/IP for this purpose since network > connectivity may not exist between host and guest and if it exists the > traffic can be not routable between host and guest for security reasons > or TCP/IP traffic can be firewalled (by mistake) by unsuspecting VM user. I don't really accept this argument, sorry. If you can't use TCP because it might be security protected or misconfigured, adding this new stream protocol thing is not one bit better. It doesn't make any sense at all. Also, if TCP could be "misconfigured" this new thing could just as easily be screwed up too. And I wouldn't be surprised to see a whole bunch of SELINUX and netfilter features proposed later for this and then we're back to square one. You guys really need to rethink this. Either a stream protocol is a workable solution to your problem, or it isn't. And don't bring up any "virtualization is special because..." arguments into your reply because virtualization has nothing to do with my objections stated above. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
Re: [PATCH] AF_VMCHANNEL address family for guest<->host communication.
Hi Evgeniy, On Sun, Dec 14, 2008 at 03:23:20PM +0300, Evgeniy Polyakov wrote: > On Sun, Dec 14, 2008 at 01:50:55PM +0200, Gleb Natapov (g...@redhat.com) > wrote: > > There is a need for communication channel between host and various > > agents that are running inside a VM guest. The channel will be used > > for statistic gathering, logging, cut & paste, host screen resolution > > changes notifications, guest configuration etc. > > > > It is undesirable to use TCP/IP for this purpose since network > > connectivity may not exist between host and guest and if it exists the > > traffic can be not routable between host and guest for security reasons > > or TCP/IP traffic can be firewalled (by mistake) by unsuspecting VM user. > > > > This patch implement new address family AF_VMCHANNEL that is used > > for communication between guest and host. Channels are created at VM > > start time. Each channel has a name. Agent, that runs on a guest, can > > send/receive data to/from a channel by creating AF_VMCHANNEL socket and > > connecting to a channel using channels name as an address. > > > > Only stream sockets are supported by this implementation. Also only > > connect, sendmsg and recvmsg socket ops are implemented which is enough > > to allow application running in a guest to connect to a channel created > > by a host and read/write from/to the channel. This can be extended to > > allow channel creation from inside a guest by creating listen socket and > > accepting on it if the need will arise and thus even allow guest<->guest > > communication in the future (but TCP/IP may be preferable for this). > > Couple of comments on this. > First, there is only single virtio device initialized at probe time, > how this will work on the host system with multiple guests? Is it > possible to have multiple virtual devices? The module is loaded only inside a guest not host and it manages all existing channels. What would be the value to have multiple vmchannel PCI devices in a single guest? > Second, each virtual device has an array of names, and each socket can > be bound to one of them, but it is not allowed to have multiple sockets > bound to the same name, so it looks like there is no possibility to have > several sockets communicating via signel channel, was this intentional? Yes, this is intentional as it matches our usage model. It is possible to change this in the future if needed. All sockets bound to the same channel will receive the same data. > And third, tasklet callbacks do not use bh socket locking, and while it > is not something bad, but rt folks want (dream) to replace it with > process context, so this at least requires some note in comments. > This is something I need to understand better. I though that socket lock guards socket state change. The patch only access socket state from bh context in the vmchannel_socket_recv() and even if state of the socket will change after function validates it nothing bad can happen. Is this the case? I it is I will add comment explaining this. > Except that about questions, this patch looks good. Thanks for the review. -- Gleb. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/virtualization
[PATCH] AF_VMCHANNEL address family for guest<->host communication.
There is a need for a communication channel between the host and various
agents that are running inside a VM guest. The channel will be used
for statistics gathering, logging, cut & paste, host screen resolution
change notifications, guest configuration, etc.

It is undesirable to use TCP/IP for this purpose since network
connectivity may not exist between host and guest; even where it does
exist, the traffic may not be routable between host and guest for
security reasons, or the TCP/IP traffic may be firewalled (by mistake)
by an unsuspecting VM user.

This patch implements a new address family, AF_VMCHANNEL, that is used
for communication between guest and host. Channels are created at VM
start time. Each channel has a name. An agent that runs in a guest can
send/receive data to/from a channel by creating an AF_VMCHANNEL socket
and connecting to a channel using the channel's name as an address.

Only stream sockets are supported by this implementation, and only the
connect, sendmsg and recvmsg socket ops are implemented, which is enough
to allow an application running in a guest to connect to a channel
created by the host and read/write from/to the channel. If the need
arises, this can be extended to allow channel creation from inside a
guest by creating a listen socket and accepting on it, and thus even
allow guest<->guest communication in the future (but TCP/IP may be
preferable for that).

Signed-off-by: Gleb Natapov
---
 include/linux/socket.h       |    4
 include/linux/vmchannel.h    |   54 +++
 net/Kconfig                  |    1
 net/Makefile                 |    1
 net/vmchannel/Kconfig        |   11 +
 net/vmchannel/Makefile       |    5
 net/vmchannel/af_vmchannel.c |  769 ++
 7 files changed, 844 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/vmchannel.h
 create mode 100644 net/vmchannel/Kconfig
 create mode 100644 net/vmchannel/Makefile
 create mode 100644 net/vmchannel/af_vmchannel.c

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 20fc4bb..e65834c 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -191,7 +191,8 @@ struct ucred {
 #define AF_RXRPC        33      /* RxRPC sockets                */
 #define AF_ISDN         34      /* mISDN sockets                */
 #define AF_PHONET       35      /* Phonet sockets               */
-#define AF_MAX          36      /* For now.. */
+#define AF_VMCHANNEL    36      /* Vmchannel sockets            */
+#define AF_MAX          37      /* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC       AF_UNSPEC
@@ -229,6 +230,7 @@ struct ucred {
 #define PF_RXRPC        AF_RXRPC
 #define PF_ISDN         AF_ISDN
 #define PF_PHONET       AF_PHONET
+#define PF_VMCHANNEL    AF_VMCHANNEL
 #define PF_MAX          AF_MAX
 
 /* Maximum queue length specifiable by listen. */
diff --git a/include/linux/vmchannel.h b/include/linux/vmchannel.h
new file mode 100644
index 000..27c1f94
--- /dev/null
+++ b/include/linux/vmchannel.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2008 Red Hat, Inc --- All Rights Reserved
+ *
+ * Author(s): Gleb Natapov
+ */
+
+#ifndef VMCHANNEL_H
+#define VMCHANNEL_H
+
+#define VMCHANNEL_NAME_MAX 80
+struct sockaddr_vmchannel {
+        sa_family_t svmchannel_family;
+        char svmchannel_name[VMCHANNEL_NAME_MAX];
+};
+
+#ifdef __KERNEL__
+
+#define VIRTIO_ID_VMCHANNEL 6
+#define VMCHANNEL_BAD_ID (~(__u32)0)
+
+#define vmchannel_sk(__sk) ((struct vmchannel_sock *) __sk)
+
+struct vmchannel_sock {
+        struct sock sk;
+        char name[VMCHANNEL_NAME_MAX];
+        __u32 id;
+        struct sk_buff_head backlog_skb_q;
+};
+
+struct vmchannel_info {
+        __u32 id;
+        char *name;
+};
+
+struct vmchannel_dev {
+        struct virtio_device *vdev;
+        struct virtqueue *rq;
+        struct virtqueue *sq;
+        struct tasklet_struct rx_tasklet;
+        struct tasklet_struct tx_tasklet;
+        __u32 channel_count;
+        struct vmchannel_info *channels;
+        struct sk_buff_head rx_skbuff_q;
+        struct sk_buff_head tx_skbuff_q;
+        atomic_t recv_posted;
+};
+
+struct vmchannel_desc {
+        __u32 id;
+        __le32 len;
+};
+
+#endif /* __KERNEL__ */
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index d789d79..d01f135 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -36,6 +36,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/vmchannel/Kconfig"
 
 config INET
         bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index 27d1f10..ddc89dc 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_IEEE80211) += ieee80211/
 obj-$(CONFIG_TIPC) += tipc/
 obj-$(CONFIG_NETLABEL) += netlabel/
 obj-$(CONFIG_IUCV) += iucv/
+obj-$(CONFIG_VMCHANNEL) += vmchannel/
 obj-$(CONFIG_RFKILL) += rfkill/
 obj-$(CONFIG_NET_9P) += 9p/
diff --git a/net/vmchannel/Kconfig b/n