Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-18 Thread Evgeniy Polyakov
Hi Gleb.

On Wed, Dec 17, 2008 at 04:31:46PM +0200, Gleb Natapov (g...@redhat.com) wrote:
 Here it is. Sorry it is not in patch format yet, but it gives a
 general idea of how it looks. The problem with connector is that
 we need a different IDX for different channels and there is no way
 to allocate them dynamically.

Looks very good. I especially liked how you used idx/val pairs to register
multiple users. Please add a comment in the connector header on how you
use it, and feel free to add my ack if needed.
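
In other words, every channel shares one connector idx and gets its own
val, so each (idx, val) pair acts as a separate connector user. A minimal
sketch of the registration, assuming the cb_id/cn_add_callback API and
the vmchannel structures posted below (this helper is not in the posted
code):

/* Sketch only (not from the posted code): one callback per channel,
 * all sharing VMCHANNEL_CONNECTOR_IDX, each with its own val. */
static int vmchannel_register_channels(struct vmchannel_dev *vmc)
{
	struct cb_id id = { .idx = VMCHANNEL_CONNECTOR_IDX };
	int i, err;

	for (i = 0; i < vmc->channel_count; i++) {
		id.val = vmc->channels[i].id;
		err = cn_add_callback(&id, vmc->channels[i].name,
				      vmchannel_cn_callback);
		if (err)
			goto unwind;
	}
	return 0;

unwind:
	while (--i >= 0) {
		id.val = vmc->channels[i].id;
		cn_del_callback(&id);
	}
	return err;
}

Removal would walk the same (idx, val) pairs with cn_del_callback().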

-- 
Evgeniy Polyakov


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-17 Thread Gleb Natapov
On Wed, Dec 17, 2008 at 12:25:32AM +0300, Evgeniy Polyakov wrote:
 On Tue, Dec 16, 2008 at 08:57:27AM +0200, Gleb Natapov (g...@redhat.com) 
 wrote:
   Another approach is to implement that virtio backend with netlink based
   userspace interface (like using connector or genetlink). This does not
   differ too much from what you have with special socket family, but at
   least it does not duplicate existing functionality of
   userspace-kernelspace communications.
   
  I implemented vmchannel using connector initially (the downside is that
  messages can be dropped). Is this more acceptable for upstream? The
  implementation was 300 lines of code.
 
 Hard to tell, it depends on implementation. But if things are good, I
 have no objections as connector maintainer :)
 
Here it is. Sorry it is not in patch format yet, but it gives a
general idea of how it looks. The problem with connector is that
we need a different IDX for different channels and there is no way
to allocate them dynamically.

--
Gleb.
/*
 *  Copyright (c) 2008 Red Hat, Inc.
 *
 *  Author(s): Gleb Natapov g...@redhat.com
 */

#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/connector.h>
#include <linux/virtio.h>
#include <linux/scatterlist.h>
#include <linux/virtio_config.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include "vmchannel_connector.h"

static struct vmchannel_dev vmc_dev;

static int add_recq_buf(struct vmchannel_dev *vmc, struct vmchannel_hdr *hdr)
{
	struct scatterlist sg[2];

	sg_init_table(sg, 2);
	sg_init_one(&sg[0], hdr, sizeof(struct vmchannel_desc));
	sg_init_one(&sg[1], hdr->msg.data, MAX_PACKET_LEN);

	if (!vmc->rq->vq_ops->add_buf(vmc->rq, sg, 0, 2, hdr))
		return 1;

	kfree(hdr);
	return 0;
}

static int try_fill_recvq(struct vmchannel_dev *vmc)
{
	int num = 0;

	for (;;) {
		struct vmchannel_hdr *hdr;

		hdr = kmalloc(sizeof(*hdr) + MAX_PACKET_LEN, GFP_KERNEL);

		if (unlikely(!hdr))
			break;

		if (!add_recq_buf(vmc, hdr))
			break;

		num++;
	}

	if (num)
		vmc->rq->vq_ops->kick(vmc->rq);

	return num;
}

static void vmchannel_recv(unsigned long data)
{
	struct vmchannel_dev *vmc = (struct vmchannel_dev *)data;
	struct vmchannel_hdr *hdr;
	unsigned int len;
	int posted = 0;

	while ((hdr = vmc->rq->vq_ops->get_buf(vmc->rq, &len))) {
		hdr->msg.len = le32_to_cpu(hdr->desc.len);
		len -= sizeof(struct vmchannel_desc);
		if (hdr->msg.len == len) {
			hdr->msg.id.idx = VMCHANNEL_CONNECTOR_IDX;
			hdr->msg.id.val = le32_to_cpu(hdr->desc.id);
			hdr->msg.seq = vmc->seq++;
			hdr->msg.ack = random32();

			cn_netlink_send(&hdr->msg, VMCHANNEL_CONNECTOR_IDX,
					GFP_ATOMIC);
		} else
			dev_printk(KERN_ERR, &vmc->vdev->dev,
				   "wrong length in received descriptor"
				   " (%d instead of %d)\n", hdr->msg.len,
				   len);

		posted += add_recq_buf(vmc, hdr);
	}

	if (posted)
		vmc->rq->vq_ops->kick(vmc->rq);
}

static void recvq_notify(struct virtqueue *recvq)
{
	struct vmchannel_dev *vmc = recvq->vdev->priv;

	tasklet_schedule(&vmc->tasklet);
}

static void cleanup_sendq(struct vmchannel_dev *vmc)
{
	char *buf;
	unsigned int len;

	spin_lock(&vmc->sq_lock);
	while ((buf = vmc->sq->vq_ops->get_buf(vmc->sq, &len)))
		kfree(buf);
	spin_unlock(&vmc->sq_lock);
}

static void sendq_notify(struct virtqueue *sendq)
{
	struct vmchannel_dev *vmc = sendq->vdev->priv;

	cleanup_sendq(vmc);
}

static void vmchannel_cn_callback(void *data)
{
	struct vmchannel_desc *desc;
	struct cn_msg *msg = data;
	struct scatterlist sg;
	char *buf;
	int err;
	unsigned long flags;

	desc = kmalloc(msg->len + sizeof(*desc), GFP_KERNEL);

	if (!desc)
		return;

	desc->id = cpu_to_le32(msg->id.val);
	desc->len = cpu_to_le32(msg->len);

	buf = (char *)(desc + 1);

	memcpy(buf, msg->data, msg->len);

	sg_init_one(&sg, desc, msg->len + sizeof(*desc));

	spin_lock_irqsave(&vmc_dev.sq_lock, flags);
	err = vmc_dev.sq->vq_ops->add_buf(vmc_dev.sq, &sg, 1, 0, desc);

	if (err)
		kfree(desc);
	else
		vmc_dev.sq->vq_ops->kick(vmc_dev.sq);
	spin_unlock_irqrestore(&vmc_dev.sq_lock, flags);
}

static int vmchannel_probe(struct virtio_device *vdev)
{
	struct vmchannel_dev *vmc = &vmc_dev;
	struct cb_id cn_id;
	int r, i;
	__le32 count;
	unsigned offset;

	cn_id.idx = VMCHANNEL_CONNECTOR_IDX;
	vdev->priv = vmc;
	vmc->vdev = vdev;

	vdev->config->get(vdev, 0, &count, sizeof(count));

	vmc->channel_count = le32_to_cpu(count);
	if (vmc->channel_count == 0) {
		dev_printk(KERN_ERR, &vdev->dev, "No channels present\n");
		return -ENODEV;
	}

	pr_debug("vmchannel: %d channel detected\n", vmc->channel_count);

	vmc->channels =
		kzalloc(vmc->channel_count * sizeof(struct vmchannel_info),
			GFP_KERNEL);
	if (!vmc->channels)
		return -ENOMEM;

	offset = sizeof(count);
	for (i = 0; i < vmc->channel_count; i++) {
		__u32 len;
		__le32 tmp;
		vdev->config->get(vdev, offset, &tmp, 4);
		vmc->channels[i].id = le32_to_cpu(tmp);
		offset += 4;
		vdev->config->get(vdev, offset, &tmp, 4);
		len = le32_to_cpu(tmp);
		if (len > VMCHANNEL_NAME_MAX) {
			dev_printk(KERN_ERR, &vdev->dev,
				   "Wrong device configuration. "
				   "Channel name is too long");

Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-16 Thread Evgeniy Polyakov
On Tue, Dec 16, 2008 at 08:57:27AM +0200, Gleb Natapov (g...@redhat.com) wrote:
  Another approach is to implement that virtio backend with netlink based
  userspace interface (like using connector or genetlink). This does not
  differ too much from what you have with special socket family, but at
  least it does not duplicate existing functionality of
  userspace-kernelspace communications.
  
 I implemented vmchannel using connector initially (the downside is that
 messages can be dropped). Is this more acceptable for upstream? The
 implementation was 300 lines of code.

Hard to tell, it depends on implementation. But if things are good, I
have no objections as connector maintainer :)

Messages in connector in particular and netlink in general are only
dropped when the receiving buffer is full (or when there is no memory);
you can tune the buffer size to match the virtual queue size or vice versa.
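
For example, on the userspace side the listener can simply ask for a
bigger netlink receive buffer when it opens the connector socket; a rough
sketch (the 1 MB size and the group value are arbitrary placeholders, not
something from the posted code):

#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/connector.h>

static int open_vmchannel_listener(void)
{
	int fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
	int rcvbuf = 1 << 20;	/* 1 MB, arbitrary example */
	struct sockaddr_nl nl = {
		.nl_family = AF_NETLINK,
		/* Example only: the group to listen on depends on the
		 * connector idx the vmchannel code registers. */
		.nl_groups = 1,
	};

	if (fd < 0)
		return -1;
	/* SO_RCVBUFFORCE needs CAP_NET_ADMIN; fall back to SO_RCVBUF. */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE,
		       &rcvbuf, sizeof(rcvbuf)) < 0)
		setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
	if (bind(fd, (struct sockaddr *)&nl, sizeof(nl)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

The virtqueue on the kernel side can then be sized so that a burst of
messages never exceeds what such a buffer can absorb.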

-- 
Evgeniy Polyakov


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-16 Thread Dor Laor
Evgeniy Polyakov wrote:
 On Tue, Dec 16, 2008 at 08:57:27AM +0200, Gleb Natapov (g...@redhat.com) 
 wrote:
   
 Another approach is to implement that virtio backend with netlink based
 userspace interface (like using connector or genetlink). This does not
 differ too much from what you have with special socket family, but at
 least it does not duplicate existing functionality of
 userspace-kernelspace communications.

   
  I implemented vmchannel using connector initially (the downside is that
  messages can be dropped). Is this more acceptable for upstream? The
  implementation was 300 lines of code.
 

 Hard to tell, it depends on implementation. But if things are good, I
 have no objections as connector maintainer :)

 Messages in connector in particular and netlink in general are only
 dropped, when receiving buffer is full (or when there is no memory), you
 can tune buffer size to match virtual queue size or vice versa.

   
Gleb was aware of that and it's not a problem, since all of the
anticipated usages may drop msgs (guest statistics, cut/paste, mouse
movements, single sign-on commands, etc.).
A service that would need reliability could use basic acks.
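
For example, a service could reuse the seq/ack fields the connector's
cn_msg header already carries; a sketch of the check on the receiving
side (the reply convention ack == seq + 1 comes from the connector
documentation, and the helper itself is only illustrative):

/* Illustrative only: treat "reply" as the ack for "orig" when it comes
 * from the same (idx, val) channel and carries ack == orig->seq + 1. */
static int vmchannel_is_ack(const struct cn_msg *reply,
			    const struct cn_msg *orig)
{
	return reply->id.idx == orig->id.idx &&
	       reply->id.val == orig->id.val &&
	       reply->ack == orig->seq + 1;
}

The sender then simply resends the original message until a matching
reply shows up.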


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Anthony Liguori
David Miller wrote:
 From: Gleb Natapov g...@redhat.com
 Date: Sun, 14 Dec 2008 13:50:55 +0200

   
  It is undesirable to use TCP/IP for this purpose since network
  connectivity may not exist between host and guest, and even if it exists
  the traffic may not be routable between host and guest for security
  reasons, or the TCP/IP traffic may be firewalled (by mistake) by an
  unsuspecting VM user.
 

 I don't really accept this argument, sorry.
   

I couldn't agree more.  That doesn't mean I don't think this is
valuable though.

Each of these sockets is going to be connected to a backend (to
implement guest copy/paste, for instance).  We want to implement those
backends in userspace and preferably in QEMU.

Using some raw protocol over ethernet means you don't have reliability.  
If you use a protocol to get reliability (like TCP), you now have to 
implement a full TCP/IP stack in userspace or get the host kernel 
involved.  I'd rather not get the host kernel involved from a security 
perspective.

An inherently reliable socket transport solves the above problem while 
keeping things simple.  Note, this is not a new concept.  There is 
already an AF_IUCV for s390.  VMware is also developing an AF_VMCI 
socket family.

Regards,

Anthony Liguori


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Evgeniy Polyakov
Hi Gleb.

On Sun, Dec 14, 2008 at 01:50:55PM +0200, Gleb Natapov (g...@redhat.com) wrote:
 There is a need for a communication channel between the host and various
 agents that are running inside a VM guest. The channel will be used
 for statistics gathering, logging, cut & paste, host screen resolution
 change notifications, guest configuration, etc.
 
 It is undesirable to use TCP/IP for this purpose since network
 connectivity may not exist between host and guest, and even if it exists
 the traffic may not be routable between host and guest for security
 reasons, or the TCP/IP traffic may be firewalled (by mistake) by an
 unsuspecting VM user.
 
 This patch implements a new address family, AF_VMCHANNEL, that is used
 for communication between guest and host. Channels are created at VM
 start time. Each channel has a name. An agent that runs in a guest can
 send/receive data to/from a channel by creating an AF_VMCHANNEL socket
 and connecting to a channel using the channel's name as an address.
 
 Only stream sockets are supported by this implementation. Also, only the
 connect, sendmsg and recvmsg socket ops are implemented, which is enough
 to allow an application running in a guest to connect to a channel
 created by the host and read/write from/to the channel. This can be
 extended to allow channel creation from inside a guest by creating a
 listening socket and accepting on it if the need arises, thus even
 allowing guest-guest communication in the future (but TCP/IP may be
 preferable for this).

Couple of comments on this.
First, there is only a single virtio device initialized at probe time;
how will this work on a host system with multiple guests? Is it
possible to have multiple virtual devices?
Second, each virtual device has an array of names, and each socket can
be bound to one of them, but it is not allowed to have multiple sockets
bound to the same name, so it looks like there is no possibility to have
several sockets communicating via a single channel; was this intentional?
And third, the tasklet callbacks do not use bh socket locking, and while
that is not something bad, the rt folks want (dream) to replace it with
process context, so this at least requires a note in the comments.

Except for the questions above, this patch looks good.

-- 
Evgeniy Polyakov


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Jeremy Fitzhardinge
Anthony Liguori wrote:
 David Miller wrote:
   
 From: Gleb Natapov g...@redhat.com
 Date: Sun, 14 Dec 2008 13:50:55 +0200

   
 
  It is undesirable to use TCP/IP for this purpose since network
  connectivity may not exist between host and guest, and even if it exists
  the traffic may not be routable between host and guest for security
  reasons, or the TCP/IP traffic may be firewalled (by mistake) by an
  unsuspecting VM user.
 
   
 I don't really accept this argument, sorry.
 

Yes.  There's no reason why the management stack couldn't implement its 
own private idiot-proofing network for this kind of thing.

 Each of these sockets are going to be connected to a backend (to 
 implement guest=copy/paste for instance).  We want to implement those 
 backends in userspace and preferably in QEMU.

 Using some raw protocol over ethernet means you don't have reliability.  
 If you use a protocol to get reliability (like TCP), you now have to 
 implement a full TCP/IP stack in userspace or get the host kernel 
 involved.  I'd rather not get the host kernel involved from a security 
 perspective.
   

There's nothing wrong with user-mode TCP, or you could run your TCP 
stack in a special-purpose guest if you're really paranoid.

 An inherently reliable socket transport solves the above problem while 
 keeping things simple.  Note, this is not a new concept.  There is 
 already an AF_IUCV for s390.  VMware is also developing an AF_VMCI 
 socket family.
   

The trouble is that it presumes that the host and guest (or whoever the 
endpoints are) are on the same physical machine and will remain that 
way.  Given that live migration is a feature that people seem to like, 
then you'd end up needing to transport this protocol over a real network 
anyway - and at that point you may as well use proper TCP/IP.   The 
alternative is to say either if you use this feature you can't migrate, 
and you can only resume on the same host, or you can use this feature, 
and we'll work out a global namespace and proxy it over TCP for you.  
Neither seems very satisfactory.

J


RE: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Itamar Heim
 -Original Message-
 From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
 Behalf Of Jeremy Fitzhardinge
 
 The trouble is that it presumes that the host and guest (or whoever the
 endpoints are) are on the same physical machine and will remain that
 way.  Given that live migration is a feature that people seem to like,
 then you'd end up needing to transport this protocol over a real network
 anyway - and at that point you may as well use proper TCP/IP.   The
 alternative is to say either if you use this feature you can't migrate,
 and you can only resume on the same host, or you can use this feature,
 and we'll work out a global namespace and proxy it over TCP for you.
 Neither seems very satisfactory.
[IH] when migrating a guest to another host, migration takes care of
closing/opening of the VMChannel on the target host. The VMChannel is
local to the hypervisor, not accessible via network. Migration is not an
issue requiring the VMChannel to use TCP/IP.


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread David Miller
From: Anthony Liguori anth...@codemonkey.ws
Date: Mon, 15 Dec 2008 09:02:23 -0600

 There is already an AF_IUCV for s390.

This is a scarecrow and irrelevant to this discussion.

And this is exactly why I asked that any arguments in this thread
avoid talking about virtualization technology and why it's special.

This proposed patch here is asking to add new infrastructure for
hypervisor facilities that will be _ADDED_ and for which we have
complete control over.

Whereas the S390 folks have to deal with existing infrastructure which
is largely outside of their control.  So if they implement access
mechanisms for that, it's fine.

I would be doing the same thing if I added a protocol socket layer for
accessing the Niagara hypervisor virtualization channels.


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Anthony Liguori
David Miller wrote:
 From: Anthony Liguori anth...@codemonkey.ws
 Date: Mon, 15 Dec 2008 09:02:23 -0600

   
 There is already an AF_IUCV for s390.
 

 This is a scarecrow and irrelevant to this discussion.

 And this is exactly why I asked that any arguments in this thread
 avoid talking about virtualization technology and why it's special.
   

You cannot completely avoid talking about virtualization here.  I agree
that an argument based on "we need it for virtualization", "why?",
"virtualization!" is not sufficient.

You still didn't address my earlier question though.

What we need is a mechanism for implementing paravirtual device drivers 
in userspace.  On a modern Linux system, a lot of important things are 
done in userspace (mostly around X) so having some code in the guest's 
userspace is important.

We want this communication mechanism to be simple and reliable as we 
want to implement the backend drivers in the host userspace with 
minimum mess.

Within the guest, we need the interface to be always available and we 
need an addressing scheme that is hypervisor specific.  Yes, we can 
build this all on top of TCP/IP.  We could even build it on top of a 
serial port.  Both have their down-sides wrt reliability and complexity.

The most natural userspace interface that meets all of these 
requirements would appear to be a new socket family.  We could also use 
another userspace interface (netlink was originally proposed, a chardev 
is possible, or a virtual file system).

Do you have another recommendation?

Regards,

Anthony Liguori

 This proposed patch here is asking to add new infrastructure for
 hypervisor facilities that will be _ADDED_ and for which we have
 complete control over.

 Whereas the S390 folks have to deal with existing infrastructure which
 is largely outside of their control.  So if they implement access
 mechanisms for that, it's fine.

 I would be doing the same thing if I added a protocol socket layer for
 accessing the Niagara hypervisor virtualization channels.
   



Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread David Miller
From: Anthony Liguori anth...@codemonkey.ws
Date: Mon, 15 Dec 2008 14:44:26 -0600

 We want this communication mechanism to be simple and reliable as we
 want to implement the backends drivers in the host userspace with
 minimum mess.

One implication of your statement here is that TCP is unreliable.
That's absolutely not true.

 Within the guest, we need the interface to be always available and
 we need an addressing scheme that is hypervisor specific.  Yes, we
 can build this all on top of TCP/IP.  We could even build it on top
 of a serial port.  Both have their down-sides wrt reliability and
 complexity.

I don't know of any zero-copy through the hypervisor mechanisms for
serial ports, but I know we do that with the various virtualization
network devices.

 Do you have another recommendation?

I don't have to make alternative recommendations until you can
show that what we have can't solve the problem acceptably, and
TCP emphatically can.


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Jeremy Fitzhardinge
Anthony Liguori wrote:
 Jeremy Fitzhardinge wrote:
   
 Each of these sockets are going to be connected to a backend (to 
 implement guest=copy/paste for instance).  We want to implement 
 those backends in userspace and preferably in QEMU.

 Using some raw protocol over ethernet means you don't have 
 reliability.  If you use a protocol to get reliability (like TCP), 
 you now have to implement a full TCP/IP stack in userspace or get the 
 host kernel involved.  I'd rather not get the host kernel involved 
 from a security perspective.
   
   
 There's nothing wrong with user-mode TCP, or you could run your TCP 
 stack in a special-purpose guest if you're really paranoid.
 

 That seems unnecessarily complex.
   

Well, the simplest thing is to let the host TCP stack do TCP.  Could you 
go into more detail about why you'd want to avoid that?

 This is why I've been pushing for the backends to be implemented in 
 QEMU.  Then QEMU can marshal the backend-specific state and transfer it 
 during live migration.  For something like copy/paste, this is obvious 
 (the clipboard state).  A general command interface is probably 
 stateless so it's a nop.
   

Copy/paste seems like a particularly bogus example.  Surely this isn't a 
sensible way to implement it?

 I'm not a fan of having external backends to QEMU for the very reasons 
 you outline above.  You cannot marshal the state of a channel we know 
 nothing about.  We're really just talking about extending virtio in a 
 guest down to userspace so that we can implement paravirtual device 
 drivers in guest userspace.  This may be an X graphics driver, a mouse 
 driver, copy/paste, remote shutdown, etc.
   
 A socket seems like a natural choice.  If that's wrong, then we can 
 explore other options (like a char device, virtual fs, etc.).

I think a socket is a pretty poor choice.  It's too low level, and it 
only really makes sense for streaming data, not for data storage 
(name/value pairs).  It means that everyone ends up making up their own 
serializations.  A filesystem view with notifications seems to be a 
better match for the use-cases you mention (aside from cut/paste), with 
a single well-defined way to serialize onto any given channel.  Each 
file may well have an application-specific content, but in general 
that's going to be something pretty simple.

   This 
 shouldn't be confused with networking though and all the talk of doing 
 silly things like streaming fence traffic through it just encourages the 
 confusion.

I'm not sure what you're referring to here.

J


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Anthony Liguori
David Miller wrote:
 From: Anthony Liguori anth...@codemonkey.ws
 Date: Mon, 15 Dec 2008 14:44:26 -0600

   
 We want this communication mechanism to be simple and reliable as we
 want to implement the backends drivers in the host userspace with
 minimum mess.
 

 One implication of your statement here is that TCP is unreliable.
 That's absolutely not true.
   

No, TCP falls under the "not simple" category because it requires the
backend to have access to a TCP/IP stack.

 Within the guest, we need the interface to be always available and
 we need an addressing scheme that is hypervisor specific.  Yes, we
 can build this all on top of TCP/IP.  We could even build it on top
 of a serial port.  Both have their down-sides wrt reliability and
 complexity.
 

 I don't know of any zero-copy through the hypervisor mechanisms for
 serial ports, but I know we do that with the various virtualization
 network devices.
   

Yes, and I went down the road of using a dedicated network device and 
using raw ethernet as the protocol.  The thing that killed that was the 
fact that it's not reliable.  You need something like TCP to add 
reliability.

But that's a lot of work and a bit backwards.  Use an unreliable 
transport but use TCP on top of it to get reliability.  Our link 
(virtio) is inherently reliable so why not just expose a reliable 
interface to userspace?

 Do you have another recommendation?
 

 I don't have to make alternative recommendations until you can
 show that what we have can't solve the problem acceptably, and
 TCP emphatically can.
   

It can solve the problem but I don't think it's the best way to solve 
the problem mainly because the complexity it demands on the backend.

Regards,

Anthony Liguori


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Anthony Liguori
Jeremy Fitzhardinge wrote:
 Anthony Liguori wrote:

 That seems unnecessarily complex.
   

 Well, the simplest thing is to let the host TCP stack do TCP.  Could 
 you go into more detail about why you'd want to avoid that?

The KVM model is that a guest is a process.  Any IO operations originate 
from the process (QEMU).  The advantage to this is that you get very 
good security because you can use things like SELinux and simply treat 
the QEMU process as you would the guest.  In fact, in general, I think 
we want to assume that QEMU is guest code from a security perspective.

By passing up the network traffic to the host kernel, we now face a 
problem when we try to get the data back.  We could setup a tun device 
to send traffic to the kernel but then the rest of the system can see 
that traffic too.  If that traffic is sensitive, it's potentially unsafe.

You can use iptables to restrict who can receive traffic and possibly 
use SELinux packet tagging or whatever.  This gets extremely complex though.

It's far easier to avoid the host kernel entirely and implement the 
backends in QEMU.  Then any actions the backend takes will be on behalf 
of the guest.  You never have to worry about transport data leakage.

 This is why I've been pushing for the backends to be implemented in 
 QEMU.  Then QEMU can marshal the backend-specific state and transfer 
 it during live migration.  For something like copy/paste, this is 
 obvious (the clipboard state).  A general command interface is 
 probably stateless so it's a nop.
   

 Copy/paste seems like a particularly bogus example.  Surely this isn't 
 a sensible way to implement it?

I think it's the most sensible way to implement it.  Would you suggest 
something different?

 I'm not a fan of having external backends to QEMU for the very 
 reasons you outline above.  You cannot marshal the state of a channel 
 we know nothing about.  We're really just talking about extending 
 virtio in a guest down to userspace so that we can implement 
 paravirtual device drivers in guest userspace.  This may be an X 
 graphics driver, a mouse driver, copy/paste, remote shutdown, etc.
   A socket seems like a natural choice.  If that's wrong, then we can 
 explore other options (like a char device, virtual fs, etc.).

 I think a socket is a pretty poor choice.  It's too low level, and it 
 only really makes sense for streaming data, not for data storage 
 (name/value pairs).  It means that everyone ends up making up their 
 own serializations.  A filesystem view with notifications seems to be 
 a better match for the use-cases you mention (aside from cut/paste), 
 with a single well-defined way to serialize onto any given channel.  
 Each file may well have an application-specific content, but in 
 general that's going to be something pretty simple.

I had suggested a virtual file system at first and was thoroughly 
ridiculed for it :-)  There is a 9p virtio transport already so we could 
even just use that.

The main issue with a virtual file system is that it does not map well to 
other guests.  It's actually easier to implement a socket interface for 
Windows than it is to implement a new file system.

But we could find ways around this with libraries.  If we used 9p as a 
transport, we could just provide a char device in Windows that received 
it in userspace.

   This shouldn't be confused with networking though and all the talk 
 of doing silly things like streaming fence traffic through it just 
 encourages the confusion.

 I'm not sure what you're referring to here.

I'm just ranting, it's not important.

Regards,

Anthony Liguori

J



Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Stephen Hemminger
On Mon, 15 Dec 2008 17:01:14 -0600
Anthony Liguori anth...@codemonkey.ws wrote:

 David Miller wrote:
  From: Anthony Liguori anth...@codemonkey.ws
  Date: Mon, 15 Dec 2008 14:44:26 -0600
 

  We want this communication mechanism to be simple and reliable as we
  want to implement the backends drivers in the host userspace with
  minimum mess.
  
 
  One implication of your statement here is that TCP is unreliable.
  That's absolutely not true.

 
 No, TCP falls under the "not simple" category because it requires the 
 backend to have access to a TCP/IP stack.
 
  Within the guest, we need the interface to be always available and
  we need an addressing scheme that is hypervisor specific.  Yes, we
  can build this all on top of TCP/IP.  We could even build it on top
  of a serial port.  Both have their down-sides wrt reliability and
  complexity.
  
 
  I don't know of any zero-copy through the hypervisor mechanisms for
  serial ports, but I know we do that with the various virtualization
  network devices.

 
 Yes, and I went down the road of using a dedicated network device and 
 using raw ethernet as the protocol.  The thing that killed that was the 
 fact that it's not reliable.  You need something like TCP to add 
 reliability.
 
 But that's a lot of work and a bit backwards.  Use an unreliable 
 transport but use TCP on top of it to get reliability.  Our link 
 (virtio) is inherently reliable so why not just expose a reliable 
 interface to userspace?
 
  Do you have another recommendation?
  
 
  I don't have to make alternative recommendations until you can
  show that what we have can't solve the problem acceptably, and
  TCP emphatically can.

 
 It can solve the problem but I don't think it's the best way to solve 
 the problem mainly because the complexity it demands on the backend.

Those who don't understand TCP are doomed to reimplement it, badly.




Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Anthony Liguori
David Miller wrote:
 From: Anthony Liguori anth...@codemonkey.ws
 Date: Mon, 15 Dec 2008 17:01:14 -0600

   
  No, TCP falls under the "not simple" category because it requires the
  backend to have access to a TCP/IP stack.
 

 I'm at a loss for words if you need TCP in the hypervisor, if that's
 what you're implying here.
   

No.  KVM is not a traditional hypervisor.  It's more of a userspace 
accelerator for emulators.

QEMU, a system emulator, calls in to the Linux kernel whenever it needs 
to run guest code.  Linux returns to QEMU whenever the guest has done an 
MMIO operation or something of that nature.  In this way, all of our 
device emulation (including paravirtual backends) are implemented in the 
host userspace in the QEMU process.

If we used TCP, we don't have a useful TCP/IP stack in QEMU, so we'd 
have to inject that traffic into the host Linux instance, and then 
receive the traffic in QEMU.  Besides being indirect, it has some nasty 
security implications that I outlined in my response to Jeremy's last note.

Regards,

Anthony Liguori

 You only need it in the guest and the host, which you already have,
 in the Linux kernel.  Just transport that over virtio or whatever
 and be done with it.
   



Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Jeremy Fitzhardinge
Anthony Liguori wrote:
 Jeremy Fitzhardinge wrote:
 Anthony Liguori wrote:

 That seems unnecessarily complex.
   

 Well, the simplest thing is to let the host TCP stack do TCP.  Could 
 you go into more detail about why you'd want to avoid that?

 The KVM model is that a guest is a process.  Any IO operations 
 originate from the process (QEMU).  The advantage to this is that you 
 get very good security because you can use things like SELinux and 
 simply treat the QEMU process as you would the guest.  In fact, in 
 general, I think we want to assume that QEMU is guest code from a 
 security perspective.

 By passing up the network traffic to the host kernel, we now face a 
 problem when we try to get the data back.  We could setup a tun device 
 to send traffic to the kernel but then the rest of the system can see 
 that traffic too.  If that traffic is sensitive, it's potentially unsafe.

Well, one could come up with a mechanism to bind an interface to be only 
visible to a particular context/container/something.

 You can use iptables to restrict who can receive traffic and possibly 
 use SELinux packet tagging or whatever.  This gets extremely complex 
 though.

Well, if you can just tag everything based on interface it's relatively 
simple.

 It's far easier to avoid the host kernel entirely and implement the 
 backends in QEMU.  Then any actions the backend takes will be on 
 behalf of the guest.  You never have to worry about transport data 
 leakage.

Well, a stream-like protocol layered over a reliable packet transport 
would get you there without the complexity of tcp.  Or just do a 
usermode tcp; it's not that complex if you really think it simplifies the 
other aspects.


 This is why I've been pushing for the backends to be implemented in 
 QEMU.  Then QEMU can marshal the backend-specific state and transfer 
 it during live migration.  For something like copy/paste, this is 
 obvious (the clipboard state).  A general command interface is 
 probably stateless so it's a nop.
   

 Copy/paste seems like a particularly bogus example.  Surely this 
 isn't a sensible way to implement it?

 I think it's the most sensible way to implement it.  Would you suggest 
 something different?

Well, off the top of my head I'm assuming the requirements are:

* the goal is to unify the user's actual desktop session with a
  virtual session within a vm
* a given user may have multiple VMs running on their desktop
* a VM may be serving multiple user sessions
* the VMs are not necessarily hosted by the user's desktop machine
* the VMs can migrate at any moment

To me that looks like a daemon running within the context of each of the 
user's virtual sessions monitoring clipboard events, talking over a TCP 
connection to a corresponding daemon in their desktop session, which is 
responsible for reconciling cuts and pastes in all the various sessions.

I guess you'd say that each VM would multiplex all its cut/paste events 
via its AF_VMCHANNEL/cut+paste channel to its qemu, which would then 
demultiplex them off to the user's real desktops.  And that since the VM 
itself may have no networking, it needs to be a special magic connection.

And my counter argument to this nicely placed straw man is that the 
VM-qemu connection can still be TCP, even if it's a private network 
with no outside access.


 I'm not a fan of having external backends to QEMU for the very 
 reasons you outline above.  You cannot marshal the state of a 
 channel we know nothing about.  We're really just talking about 
 extending virtio in a guest down to userspace so that we can 
 implement paravirtual device drivers in guest userspace.  This may 
 be an X graphics driver, a mouse driver, copy/paste, remote 
 shutdown, etc.
   A socket seems like a natural choice.  If that's wrong, then we 
 can explore other options (like a char device, virtual fs, etc.).

 I think a socket is a pretty poor choice.  It's too low level, and it 
 only really makes sense for streaming data, not for data storage 
 (name/value pairs).  It means that everyone ends up making up their 
 own serializations.  A filesystem view with notifications seems to be 
 a better match for the use-cases you mention (aside from cut/paste), 
 with a single well-defined way to serialize onto any given channel.  
 Each file may well have an application-specific content, but in 
 general that's going to be something pretty simple.

 I had suggested a virtual file system at first and was thoroughly 
 ridiculed for it :-)  There is a 9p virtio transport already so we 
 could even just use that.

You mean 9p directly over a virtio ringbuffer rather than via the 
network stack?  You could do that, but I'd still argue that using the 
network stack is a better approach.

 The main issue with a virtual file system is that it does not map well to 
 other guests.  It's actually easier to implement a socket interface 
 for Windows than it is to implement a new file system.


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-15 Thread Dor Laor
Evgeniy Polyakov wrote:
 On Mon, Dec 15, 2008 at 05:08:29PM -0600, Anthony Liguori 
 (anth...@codemonkey.ws) wrote:
   
 The KVM model is that a guest is a process.  Any IO operations originate 
 from the process (QEMU).  The advantage to this is that you get very 
 good security because you can use things like SELinux and simply treat 
 the QEMU process as you would the guest.  In fact, in general, I think 
 we want to assume that QEMU is guest code from a security perspective.

 By passing up the network traffic to the host kernel, we now face a 
 problem when we try to get the data back.  We could setup a tun device 
 to send traffic to the kernel but then the rest of the system can see 
 that traffic too.  If that traffic is sensitive, it's potentially unsafe.
 

 You can even use unix sockets in this case, and each socket will be
 named after the virtio channel names. IIRC tun/tap devices can be
 virtualized with recent kernels, which also solves all problems of
 shared access.

 There are plenty of ways to implement this kind of functionality instead
 of developing some new protocol, which is effectively a duplication of
 what already exists in the kernel.

   

Well, it is kind of a pv-unix-domain-socket.
I did not understand how a standard unix domain socket in the guest can
reach the host in your solution.

The initial implementation was some sort of pv-serial. Serial itself is
low performing and there are no naming services whatsoever. Gleb did
offer the netlink option as a beginning, but we thought a new address
family would be more robust (you say too robust).
So, by suggesting a new address family, you can think of it as a
pv-unix-domain-socket.
Networking IS used since we think it is a good 'wheel'.
Indeed, David is right that instead of adding a new chunk of code we can
re-use the existing one. But we do have some 'new' (afraid to say
'virtualization') problems that might prevent us from using a standard
virtual nic:
- Even if we can teach iptables to ignore this interface, other
  3rd-party firewalls might not obey: What if the VM is a Checkpoint
  firewall? What if the VM is Windows using a non-MS firewall?
- Who will assign IPs for the vnic? How can I ensure there is no IP
  clash? The standard DHCP for the other standard vnics might not be
  under our control.

So I do understand the idea of using a standard network interface. It's
just not that simple.
So ideas to handle the above are welcome.
Otherwise we might need to go back to the serial/pv-serial approach.

btw: here are the usages/next usages of vmchannel:
VMchannel is a host-guest interface and, in the future, a guest-guest interface.
Currently/soon it is used for
- guest statistics
- guest info
- guest single sign-on
- guest log-in/log-out
- mouse channel for multiple monitors
- cut/paste (guest-host, sometimes client-host-guest; the company
  firewall blocks client-guest).
- fencing (potentially)

btw2: without virtualization we wouldn't have new passionate issues to
discuss!
Cheers,
Dor


Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-14 Thread Gleb Natapov
Hi Evgeniy,

On Sun, Dec 14, 2008 at 03:23:20PM +0300, Evgeniy Polyakov wrote:
 On Sun, Dec 14, 2008 at 01:50:55PM +0200, Gleb Natapov (g...@redhat.com) 
 wrote:
  There is a need for a communication channel between the host and various
  agents that are running inside a VM guest. The channel will be used
  for statistics gathering, logging, cut & paste, host screen resolution
  change notifications, guest configuration, etc.
  
  It is undesirable to use TCP/IP for this purpose since network
  connectivity may not exist between host and guest, and even if it exists
  the traffic may not be routable between host and guest for security
  reasons, or the TCP/IP traffic may be firewalled (by mistake) by an
  unsuspecting VM user.
  
  This patch implements a new address family, AF_VMCHANNEL, that is used
  for communication between guest and host. Channels are created at VM
  start time. Each channel has a name. An agent that runs in a guest can
  send/receive data to/from a channel by creating an AF_VMCHANNEL socket
  and connecting to a channel using the channel's name as an address.
  
  Only stream sockets are supported by this implementation. Also, only the
  connect, sendmsg and recvmsg socket ops are implemented, which is enough
  to allow an application running in a guest to connect to a channel
  created by the host and read/write from/to the channel. This can be
  extended to allow channel creation from inside a guest by creating a
  listening socket and accepting on it if the need arises, thus even
  allowing guest-guest communication in the future (but TCP/IP may be
  preferable for this).
 
 Couple of comments on this.
 First, there is only a single virtio device initialized at probe time;
 how will this work on a host system with multiple guests? Is it
 possible to have multiple virtual devices?
The module is loaded only inside a guest, not the host, and it manages
all existing channels. What would be the value of having multiple
vmchannel PCI devices in a single guest?

 Second, each virtual device has an array of names, and each socket can
 be bound to one of them, but it is not allowed to have multiple sockets
 bound to the same name, so it looks like there is no possibility to have
 several sockets communicating via a single channel; was this intentional?
Yes, this is intentional as it matches our usage model. It is possible
to change this in the future if needed. All sockets bound to the same
channel will receive the same data.

 And third, the tasklet callbacks do not use bh socket locking, and while
 that is not something bad, the rt folks want (dream) to replace it with
 process context, so this at least requires a note in the comments.
 
This is something I need to understand better. I thought that the socket
lock guards socket state changes. The patch only accesses socket state
from bh context in vmchannel_socket_recv(), and even if the state of the
socket changes after the function validates it, nothing bad can happen.
Is this the case? If it is, I will add a comment explaining this.
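
For reference, a sketch of what bh-context locking could look like around
delivery into the per-socket queue (illustrative only, not taken from the
patch):

/* Illustrative only: take the bh socket lock around the state check and
 * the queueing done from the receive tasklet. */
static void vmchannel_deliver(struct sock *sk, struct sk_buff *skb)
{
	bh_lock_sock(sk);
	if (sock_flag(sk, SOCK_DEAD))
		kfree_skb(skb);		/* socket is being torn down */
	else
		skb_queue_tail(&vmchannel_sk(sk)->backlog_skb_q, skb);
	bh_unlock_sock(sk);
}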

 Except for the questions above, this patch looks good.
Thanks for the review.

--
Gleb.


[PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-14 Thread Gleb Natapov
There is a need for a communication channel between the host and various
agents that are running inside a VM guest. The channel will be used
for statistics gathering, logging, cut & paste, host screen resolution
change notifications, guest configuration, etc.

It is undesirable to use TCP/IP for this purpose since network
connectivity may not exist between host and guest, and even if it exists
the traffic may not be routable between host and guest for security
reasons, or the TCP/IP traffic may be firewalled (by mistake) by an
unsuspecting VM user.

This patch implements a new address family, AF_VMCHANNEL, that is used
for communication between guest and host. Channels are created at VM
start time. Each channel has a name. An agent that runs in a guest can
send/receive data to/from a channel by creating an AF_VMCHANNEL socket
and connecting to a channel using the channel's name as an address.

Only stream sockets are supported by this implementation. Also, only the
connect, sendmsg and recvmsg socket ops are implemented, which is enough
to allow an application running in a guest to connect to a channel
created by the host and read/write from/to the channel. This can be
extended to allow channel creation from inside a guest by creating a
listening socket and accepting on it if the need arises, thus even
allowing guest-guest communication in the future (but TCP/IP may be
preferable for this).
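
For illustration, a guest agent would use the family roughly as follows
(sketch only; the channel name is a made-up example, error handling is
minimal, and the constants come from the headers added by this patch):

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vmchannel.h>

int main(void)
{
	struct sockaddr_vmchannel addr = {
		.svmchannel_family = AF_VMCHANNEL,
	};
	char buf[128];
	int fd;

	fd = socket(PF_VMCHANNEL, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	/* "org.example.agent" is a made-up channel name. */
	strncpy(addr.svmchannel_name, "org.example.agent",
		sizeof(addr.svmchannel_name) - 1);
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;

	write(fd, "hello", 5);		/* goes through sendmsg */
	read(fd, buf, sizeof(buf));	/* goes through recvmsg */
	close(fd);
	return 0;
}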

Signed-off-by: Gleb Natapov g...@redhat.com
---

 include/linux/socket.h   |4 
 include/linux/vmchannel.h|   54 +++
 net/Kconfig  |1 
 net/Makefile |1 
 net/vmchannel/Kconfig|   11 +
 net/vmchannel/Makefile   |5 
 net/vmchannel/af_vmchannel.c |  769 ++
 7 files changed, 844 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/vmchannel.h
 create mode 100644 net/vmchannel/Kconfig
 create mode 100644 net/vmchannel/Makefile
 create mode 100644 net/vmchannel/af_vmchannel.c

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 20fc4bb..e65834c 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -191,7 +191,8 @@ struct ucred {
 #define AF_RXRPC   33  /* RxRPC sockets*/
 #define AF_ISDN34  /* mISDN sockets*/
 #define AF_PHONET  35  /* Phonet sockets   */
-#define AF_MAX 36  /* For now.. */
+#define AF_VMCHANNEL   36  /* Vmchannel sockets*/
+#define AF_MAX 37  /* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC  AF_UNSPEC
@@ -229,6 +230,7 @@ struct ucred {
 #define PF_RXRPC   AF_RXRPC
 #define PF_ISDNAF_ISDN
 #define PF_PHONET  AF_PHONET
+#define PF_VMCHANNEL   AF_VMCHANNEL
 #define PF_MAX AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/linux/vmchannel.h b/include/linux/vmchannel.h
new file mode 100644
index 000..27c1f94
--- /dev/null
+++ b/include/linux/vmchannel.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008 Red Hat, Inc --- All Rights Reserved
+ *
+ *  Author(s): Gleb Natapov g...@redhat.com
+ */
+
+#ifndef VMCHANNEL_H
+#define VMCHANNEL_H
+
+#define VMCHANNEL_NAME_MAX 80
+struct sockaddr_vmchannel {
+   sa_family_t svmchannel_family;
+   char svmchannel_name[VMCHANNEL_NAME_MAX];
+};
+
+#ifdef __KERNEL__
+
+#define VIRTIO_ID_VMCHANNEL 6
+#define VMCHANNEL_BAD_ID (~(__u32)0)
+
+#define vmchannel_sk(__sk) ((struct vmchannel_sock *) __sk)
+
+struct vmchannel_sock {
+   struct sock sk;
+   char name[VMCHANNEL_NAME_MAX];
+   __u32 id;
+   struct sk_buff_head backlog_skb_q;
+};
+
+struct vmchannel_info {
+   __u32 id;
+   char *name;
+};
+
+struct vmchannel_dev {
+   struct virtio_device *vdev;
+   struct virtqueue *rq;
+   struct virtqueue *sq;
+   struct tasklet_struct rx_tasklet;
+   struct tasklet_struct tx_tasklet;
+   __u32 channel_count;
+   struct vmchannel_info *channels;
+   struct sk_buff_head rx_skbuff_q;
+   struct sk_buff_head tx_skbuff_q;
+   atomic_t recv_posted;
+};
+
+struct vmchannel_desc {
+   __u32 id;
+   __le32 len;
+};
+
+#endif /* __KERNEL__ */
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index d789d79..d01f135 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -36,6 +36,7 @@ source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
+source "net/vmchannel/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
diff --git a/net/Makefile b/net/Makefile
index 27d1f10..ddc89dc 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_IEEE80211)   += ieee80211/
 obj-$(CONFIG_TIPC) += tipc/
 obj-$(CONFIG_NETLABEL) += netlabel/
 obj-$(CONFIG_IUCV) += iucv/
+obj-$(CONFIG_VMCHANNEL)+= vmchannel/
 obj-$(CONFIG_RFKILL)   += rfkill/
 obj-$(CONFIG_NET_9P)   += 9p/
 
diff --git 

Re: [PATCH] AF_VMCHANNEL address family for guest-host communication.

2008-12-14 Thread David Miller
From: Gleb Natapov g...@redhat.com
Date: Sun, 14 Dec 2008 13:50:55 +0200

 It is undesirable to use TCP/IP for this purpose since network
 connectivity may not exist between host and guest, and even if it exists
 the traffic may not be routable between host and guest for security
 reasons, or the TCP/IP traffic may be firewalled (by mistake) by an
 unsuspecting VM user.

I don't really accept this argument, sorry.

If you can't use TCP because it might be security protected or
misconfigured, adding this new stream protocol thing is not one
bit better.  It doesn't make any sense at all.

Also, if TCP could be misconfigured this new thing could just as
easily be screwed up too.  And I wouldn't be surprised to see a whole
bunch of SELINUX and netfilter features proposed later for this and
then we're back to square one.

You guys really need to rethink this.  Either a stream protocol is a
workable solution to your problem, or it isn't.

And don't bring up any "virtualization is special because..."
arguments in your reply, because virtualization has nothing to do
with my objections stated above.