[PATCH] KVM-test: Add subtest cdrom

2011-07-11 Thread Amos Kong
Test cdrom operations (mount/format/copy/md5sum), change the iso file via
monitor command, create iso files via pre_command, and clean up temporary
files via post_command.

Signed-off-by: Amos Kong ak...@redhat.com
---
 client/tests/kvm/tests/cdrom.py|  131 
 client/tests/kvm/tests_base.cfg.sample |   11 +++
 2 files changed, 142 insertions(+), 0 deletions(-)
 create mode 100644 client/tests/kvm/tests/cdrom.py

diff --git a/client/tests/kvm/tests/cdrom.py b/client/tests/kvm/tests/cdrom.py
new file mode 100644
index 000..be5e4cd
--- /dev/null
+++ b/client/tests/kvm/tests/cdrom.py
@@ -0,0 +1,131 @@
+import logging, re, time
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.virt import virt_utils
+
+def run_cdrom(test, params, env):
+    """
+    KVM cdrom test:
+    1) Boot up a VM with one iso attached
+    2) Check whether the VM identifies the iso file
+    3) Eject the cdrom and change it to another iso, several times
+    4) Try to format the cdrom and check the returned error string
+    5) Mount the cdrom device to /mnt
+    6) Copy a file from the cdrom and compare the files using diff
+    7) Umount and mount the cdrom several times
+
+    @param test: kvm test object
+    @param params: Dictionary with the test parameters
+    @param env: Dictionary with test environment.
+    """
+    def get_cdrom_info():
+        o = vm.monitor.info("block")
+        try:
+            device = re.findall("(ide\d+-cd\d+): .*", o)[0]
+        except IndexError:
+            device = None
+        try:
+            file = re.findall("ide\d+-cd\d+: .*file=(\S*) ", o)[0]
+        except IndexError:
+            file = None
+        logging.debug("Device name: %s, ISO: %s" % (device, file))
+        return (device, file)
+
+    def check_cdrom_locked(cdrom):
+        blocks_info = vm.monitor.info("block")
+        if isinstance(blocks_info, str):
+            lock_str = "locked=1"
+            for block in blocks_info.splitlines():
+                if cdrom in block and lock_str in block:
+                    return True
+        else:
+            for block in blocks_info:
+                if ('inserted' in block.keys() and
+                    block['inserted']['file'] == cdrom):
+                    return block['locked']
+        return False
+
+
+    vm = env.get_vm(params["main_vm"])
+    vm.verify_alive()
+
+    session = vm.wait_for_login(timeout=int(params.get("login_timeout", 360)))
+    cdrom_orig = params.get("cdrom_cd1")
+    cdrom_new = params.get("new_iso")
+    cdrom = cdrom_orig
+    output = session.get_command_output("ls /dev/cdrom*")
+    cdrom_dev_list = re.findall("/dev/cdrom-\w+|/dev/cdrom\d*", output)
+    logging.debug("cdrom_dev_list: %s" % cdrom_dev_list)
+
+    cdrom_dev = ""
+    test_cmd = "dd if=%s of=/dev/null bs=1 count=1"
+    for d in cdrom_dev_list:
+        s, o = session.cmd_status_output(test_cmd % d)
+        if s == 0:
+            cdrom_dev = d
+            break
+    else:
+        raise error.TestFail("Could not find a valid cdrom device. "
+                             "dd returns: %d, output:\n%s" % (s, o))
+
+    logging.info("Detecting the existence of the cdrom")
+    (device, file) = get_cdrom_info()
+    if file != cdrom:
+        raise error.TestError("%s is not recognized by the guest" % cdrom)
+
+    session.get_command_output("umount %s" % cdrom_dev)
+    if not virt_utils.wait_for(lambda: not check_cdrom_locked(file), 300):
+        raise error.TestError("%s could not become unlocked" % device)
+
+    max_times = int(params.get("max_times", 100))
+    logging.info("Eject the cdrom %s times" % max_times)
+    max = max_times
+    while max > 0:
+        vm.monitor.cmd("eject %s" % device)
+        (device, file) = get_cdrom_info()
+        if file is not None:
+            raise error.TestFail("%s is not ejected" % cdrom)
+
+        cdrom = cdrom_new
+        if max % 2 == 0:
+            cdrom = cdrom_orig
+        vm.monitor.cmd("change %s %s" % (device, cdrom))
+        time.sleep(10)
+        (device, file) = get_cdrom_info()
+        if file != cdrom:
+            raise error.TestError("%s is not changed" % cdrom)
+        max -= 1
+
+    logging.info("Check whether the cdrom is read-only")
+    filename = params.get("filename")
+    dest_dir = "/mnt"
+    s, o = session.get_command_status_output("echo y | mkfs %s" % cdrom_dev)
+    if not "Read-only" in o:
+        logging.debug("Formatting the cdrom did not return a Read-only error")
+        if s == 0:
+            raise error.TestFail("Formatting %s succeeded unexpectedly" % cdrom_dev)
+
+    if not virt_utils.wait_for(lambda: session.get_command_status("mount %s %s"
+                               % (cdrom_dev, dest_dir)) == 0, 30):
+        logging.debug(session.get_command_output("cat /etc/mtab"))
+        raise error.TestFail("Could not mount %s" % cdrom_dev)
+
+    logging.info("File copying test")
+    cmd = "/bin/cp -f %s/%s /tmp/"
+    if session.get_command_status(cmd % (dest_dir, filename)) != 0:
+        raise error.TestFail("Fail to copy file from %s" % dest_dir)
+    cmd = "diff %s/%s /tmp/%s" % (dest_dir, filename, filename)

Re: [Qemu-devel] [RFC v3 31/56] ac97: convert to memory API

2011-07-11 Thread Avi Kivity
 
 Shouldn't it be possible to do something like:
 
 typedef struct OldMemoryRegionOps {
 MemoryRegionOps parent_ops;
 CPUReadMemoryFunc *readfn[3];
 CPUWriteMemoryFunc *writefn[3];
 void *opaque;
 } OldMemoryRegionOps;
 
 That should allow old-style implementations to be converted without
 introducing trampoline functions everywhere.
 

It should, I'll give it a go.
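
A rough sketch of how such a wrapper could dispatch the legacy fixed-size
callbacks from a single pair of new-style handlers (the helper names and the
way the opaque pointer is threaded through are assumptions for illustration,
not the actual conversion):

typedef struct OldMemoryRegionOps {
    MemoryRegionOps parent_ops;
    CPUReadMemoryFunc *readfn[3];   /* 1-, 2- and 4-byte readers */
    CPUWriteMemoryFunc *writefn[3]; /* 1-, 2- and 4-byte writers */
    void *opaque;
} OldMemoryRegionOps;

/* access size 1/2/4 maps to index 0/1/2 */
static uint64_t old_mr_read(void *opaque, target_phys_addr_t addr,
                            unsigned size)
{
    OldMemoryRegionOps *old = opaque;

    return old->readfn[size >> 1](old->opaque, addr);
}

static void old_mr_write(void *opaque, target_phys_addr_t addr,
                         uint64_t data, unsigned size)
{
    OldMemoryRegionOps *old = opaque;

    old->writefn[size >> 1](old->opaque, addr, data);
}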


[PATCH] KVM test: Add subtest nmi_watchdog

2011-07-11 Thread Amos Kong
Use the kernel's nmi_watchdog support to test KVM's NMI support;
check the NMI counts on Linux guests through /proc/interrupts.

Signed-off-by: Amos Kong ak...@redhat.com
---
 client/tests/kvm/tests/nmi_watchdog.py |   57 
 client/tests/kvm/tests_base.cfg.sample |7 
 2 files changed, 64 insertions(+), 0 deletions(-)
 create mode 100644 client/tests/kvm/tests/nmi_watchdog.py

diff --git a/client/tests/kvm/tests/nmi_watchdog.py 
b/client/tests/kvm/tests/nmi_watchdog.py
new file mode 100644
index 000..3895ab7
--- /dev/null
+++ b/client/tests/kvm/tests/nmi_watchdog.py
@@ -0,0 +1,57 @@
+import time, logging
+from autotest_lib.client.common_lib import error
+
+
+def run_nmi_watchdog(test, params, env):
+    """
+    Test the function of nmi injection and verify the response of the guest
+
+    1) Log in to the guest
+    2) Add 'nmi_watchdog=1' to the guest kernel boot options
+    3) Check whether the guest's NMI counters increase after injecting NMIs
+
+    @param test: kvm test object
+    @param params: Dictionary with the test parameters.
+    @param env: Dictionary with test environment.
+    """
+    vm = env.get_vm(params["main_vm"])
+    vm.verify_alive()
+    timeout = int(params.get("login_timeout", 360))
+    session = vm.wait_for_login(timeout=timeout)
+    get_nmi_cmd = params.get("get_nmi_cmd")
+    kernel_version = session.get_command_output("uname -r").strip()
+    nmi_watchdog_type = int(params.get("nmi_watchdog_type"))
+    update_kernel_cmd = ("grubby --update-kernel=/boot/vmlinuz-%s "
+                         "--args='nmi_watchdog=%d'" % (kernel_version, nmi_watchdog_type))
+
+    logging.info("Add 'nmi_watchdog=%d' to the guest kernel cmdline and reboot"
+                 % nmi_watchdog_type)
+    if session.get_command_status(update_kernel_cmd) != 0:
+        raise error.TestError("Fail to modify the kernel cmdline")
+    time.sleep(int(params.get("sleep_before_reset", 10)))
+    session = vm.reboot(session, method='shell', timeout=timeout)
+    try:
+        s, guest_cpu_num = session.get_command_status_output(
+                                              params.get("cpu_chk_cmd"))
+        if s != 0:
+            raise error.TestError("Fail to get the cpu number of the guest")
+
+        logging.info("Checking the nmi interrupt")
+        s, o = session.get_command_status_output(get_nmi_cmd)
+        if s != 0:
+            raise error.TestError("Fail to get the guest's NMI counters")
+        nmi_counter1 = o.split()[1:]
+
+        time.sleep(60)
+        s, o = session.get_command_status_output(get_nmi_cmd)
+        if s != 0:
+            raise error.TestError("Fail to get the guest's NMI counters")
+        nmi_counter2 = o.split()[1:]
+
+        for i in range(int(guest_cpu_num)):
+            logging.info("vcpu: %s, nmi_counter1: %s, nmi_counter2: %s" %
+                         (i, nmi_counter1[i], nmi_counter2[i]))
+            if int(nmi_counter2[i]) <= int(nmi_counter1[i]):
+                raise error.TestFail("The counter doesn't increase")
+    finally:
+        session.close()
diff --git a/client/tests/kvm/tests_base.cfg.sample 
b/client/tests/kvm/tests_base.cfg.sample
index 5d6227b..4e07b5e 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -1112,6 +1112,13 @@ variants:
 post_command = rm -rf /tmp/kvm_autotest_root/orig.iso 
/tmp/kvm_autotest_root/new.iso orig new
 only Linux
 
+    - nmi_watchdog: install setup image_copy unattended_install.cdrom
+        type = nmi_watchdog
+        get_nmi_cmd = grep NMI /proc/interrupts
+        nmi_watchdog_type = 1
+        image_snapshot = yes
+        only Linux
+
 
 # NICs
 variants:



Re: [PATCH 0/9] kvm tools, qcow: Improve QCOW performance

2011-07-11 Thread Kevin Wolf
On 10.07.2011 20:08, Pekka Enberg wrote:
 Hi Ingo,
 
 * Pekka Enberg penb...@kernel.org wrote:
 This series fixes QCOW locking issues and implements delayed metadata 
 writeout.
 This improves performance of writeout to QCOW2 images that don't have 
 clusters
 and L2 tables allocated on-disk.

 I tested the series by running

   mount -t ext4 /dev/vdb /mnt
   dd if=/dev/zero of=/mnt/tmp

 in the guest multiple times for a freshly generated QCOW2 image:

   dd if=/dev/zero of=fs.ext4 bs=1024k count=512 && mkfs.ext4 -F fs.ext4 && 
 qemu-img convert -O qcow2 fs.ext4 fs.qcow2

 which causes worst-case behavior for the current code.

 Before:

   [ seekwatcher: 
 http://userweb.kernel.org/~penberg/kvm-qcow-delayed/kvm-qcow2-master.png ]

   511229952 bytes (511 MB) copied, 19.906 s, 25.7 MB/s
   511229952 bytes (511 MB) copied, 20.3168 s, 25.2 MB/s
   511229952 bytes (511 MB) copied, 20.8078 s, 24.6 MB/s
   511229952 bytes (511 MB) copied, 21.0889 s, 24.2 MB/s
   511229952 bytes (511 MB) copied, 20.7833 s, 24.6 MB/s
   511229952 bytes (511 MB) copied, 20.7536 s, 24.6 MB/s
   511229952 bytes (511 MB) copied, 20.0312 s, 25.5 MB/s

 After:

   [ seekwatcher: 
 http://userweb.kernel.org/~penberg/kvm-qcow-delayed/kvm-qcow2-delayed.png ]

   511229952 bytes (511 MB) copied, 7.68312 s, 66.5 MB/s
   511229952 bytes (511 MB) copied, 7.54065 s, 67.8 MB/s
   511229952 bytes (511 MB) copied, 9.34749 s, 54.7 MB/s
   511229952 bytes (511 MB) copied, 9.2421 s, 55.3 MB/s
   511229952 bytes (511 MB) copied, 9.9364 s, 51.5 MB/s
   511229952 bytes (511 MB) copied, 10.0337 s, 51.0 MB/s
   511229952 bytes (511 MB) copied, 9.39502 s, 54.4 MB/s
 
 On Sun, Jul 10, 2011 at 8:15 PM, Ingo Molnar mi...@elte.hu wrote:
 Just wondering, how does Qemu perform on the same system using the
 same image, with comparable settings?
 
 Freshly built from qemu-kvm.git:
 
 $ /home/penberg/qemu-kvm/x86_64-softmmu/qemu-system-x86_64 --version
 QEMU emulator version 0.14.50 (qemu-kvm-devel), Copyright (c)
 2003-2008 Fabrice Bellard
 
 Tests were run with this configuration:
 
 $ /home/penberg/qemu-kvm/x86_64-softmmu/qemu-system-x86_64 -kernel
 /boot/vmlinuz-3.0.0-rc5+ -drive
 file=/home/penberg/images/debian_squeeze_amd64_standard.img,if=virtio,boot=on
 -drive file=fs.qcow2,if=virtio -nographic -m 320 -smp 2 -append
 root=/dev/vda1 console=ttyS0 init=/root/iobench-write
 
 Not sure if that's 100% comparable settings but anyway. The results
 looks as follows:

I would love to try out your code occasionally myself, but so far I have
been too lazy to build a guest kernel only to be able to test it. Having
to deal with the huge kernel git tree just for a small program doesn't
really make it more fun either... Anyway, what I'm trying to say is that
everything in my mails is from a purely theoretical POV. I have only
looked at the code, but never really tried it.

As Ingo already said, the cache mode is probably the major difference.
From what I can see in your code, cache=writeback would be the
equivalent for what tools/kvm is doing, however cache=none (i.e.
O_DIRECT) is what people usually do with qemu.
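
(For reference, the two modes discussed here are selected with the cache=
suboption of -drive, with fs.qcow2 standing in for the benchmark image above:

  -drive file=fs.qcow2,if=virtio,cache=writeback   # page cache, roughly what tools/kvm does
  -drive file=fs.qcow2,if=virtio,cache=none        # O_DIRECT, the usual qemu setup)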

And then there seems to be another big difference. I hope I'm not
missing anything, but you seem to be completely lacking refcount
handling for qcow2. This is okay for a read-only image, but with write
access to the image, you're corrupting the images if you don't update
the refcounts. Have you checked qcow2 images with qemu-img check after
tools/kvm has written to them?

Maintaining the right order between L2 writes and refcount block writes
is another source of flushes in qemu, which of course makes a difference
for performance.
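
In pseudo-code the ordering constraint looks roughly like this (illustrative
only; these are not qemu's actual function names):

    /* allocating a new cluster for a guest write */
    update_refcount(cluster_offset, +1);   /* mark the cluster as in use */
    bdrv_flush(bs);                        /* refcount must be on disk first... */
    set_l2_entry(l2_table, guest_offset, cluster_offset);
    bdrv_flush(bs);                        /* ...before an L2 entry points at it */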

Kevin


Re: [PATCH 0/9] kvm tools, qcow: Improve QCOW performance

2011-07-11 Thread Pekka Enberg
Hi Kevin,

On Mon, Jul 11, 2011 at 12:31 PM, Kevin Wolf kw...@redhat.com wrote:
 I would love to try out your code occasionally myself, but so far I have
 been to lazy to build a guest kernel only to be able to test it. Having
 to deal with the huge kernel git tree just for a small program doesn't
 really make it more fun either... Anyway, what I'm trying to say is that
 everything in my mails is from a purely theoretical POV. I have only
 looked at the code, but never really tried it.

Most distro kernels boot just fine, AFAIK. If you have a kernel tree
laying around, you can use

  git remote add kvm-tool git://github.com/penberg/linux-kvm.git
  git remote update kvm-tool

to fetch the sources.

 As Ingo already said, the cache mode is probably the major difference.
 From what I can see in your code, cache=writeback would be the
 equivalent for what tools/kvm is doing, however cache=none (i.e.
 O_DIRECT) is what people usually do with qemu.

Yup, I posted 'cache=writeback' results too which are much closer to
tools/kvm numbers.

 And then there seems to be another big difference. I hope I'm not
 missing anything, but you seem to be completely lacking refcount
 handling for qcow2. This is okay for read-only image, but with write
 access to the image, you're corrupting the images if you don't update
 the refcounts. Have you checked qcow2 images with qemu-img check after
 tools/kvm having written to it?

 Maintaining the right order between L2 writes and refcount block writes
 is another source of flushes in qemu, which of course makes a difference
 for performance.

Yes, you're absolutely correct. We don't support copy-on-write images
and I didn't realize until yesterday evening that we don't even check
the 'copied' bit to make sure writes are safe.

However, for these particular numbers, it doesn't matter that much
because it's all append-only and thus shouldn't trigger any of the
copy-on-write paths.

Pekka


Re: [PATCH 0/9] kvm tools, qcow: Improve QCOW performance

2011-07-11 Thread Kevin Wolf
On 11.07.2011 11:41, Pekka Enberg wrote:
 Hi Kevin,
 
 On Mon, Jul 11, 2011 at 12:31 PM, Kevin Wolf kw...@redhat.com wrote:
 I would love to try out your code occasionally myself, but so far I have
 been to lazy to build a guest kernel only to be able to test it. Having
 to deal with the huge kernel git tree just for a small program doesn't
 really make it more fun either... Anyway, what I'm trying to say is that
 everything in my mails is from a purely theoretical POV. I have only
 looked at the code, but never really tried it.
 
 Most distro kernels boot just fine, AFAIK. If you have a kernel tree
 laying around, you can use
 
   git remote add kvm-tool git://github.com/penberg/linux-kvm.git
   git remote update kvm-tool
 
 to fetch the sources.

Yeah, I do have the source and I read some parts of it. Just running it
didn't seem to work with the standard Fedora kernel last time. Seems to
work now, so it was probably my fault.

Not sure what I did different last time, maybe I relied on it to pick up
kernel and initrd automatically from the host (it finds the kernel, but
not the initrd).

 As Ingo already said, the cache mode is probably the major difference.
 From what I can see in your code, cache=writeback would be the
 equivalent for what tools/kvm is doing, however cache=none (i.e.
 O_DIRECT) is what people usually do with qemu.
 
 Yup, I posted 'cache=writeback' results too which are much closer to
 tools/kvm numbers.

Saw it. cache=none would probably help with the stability, but of course
you would also have to add O_DIRECT to tools/kvm to make it fair.

 And then there seems to be another big difference. I hope I'm not
 missing anything, but you seem to be completely lacking refcount
 handling for qcow2. This is okay for read-only image, but with write
 access to the image, you're corrupting the images if you don't update
 the refcounts. Have you checked qcow2 images with qemu-img check after
 tools/kvm having written to it?

 Maintaining the right order between L2 writes and refcount block writes
 is another source of flushes in qemu, which of course makes a difference
 for performance.
 
 Yes, you're absolutely correct. We don't support copy-on-write images
 and I didn't realize until yesterday evening that we don't even check
 the 'copied' bit to make sure writes are safe.
 
 However, for these particular numbers, it doesn't matter that much
 because it's all append-only and thus shouldn't trigger any of the
 copy-on-write paths.

It has nothing to do with copy on write. Well, of course COW is the
reason why the refcounts exist at all, but for a correct qcow2 image
they must be consistent even when you don't do COW.

The problem is that when you run an image, in which tools/kvm has
allocated new clusters, in qemu, it will use the refcount table and
still see the clusters as free. So you'll end up with two guest disk
clusters mapped to the same cluster in the image file and that obviously
means that you'll get data corruption.

qemu-img check would tell you about such inconsistencies.
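
For anyone following along, the check is simply run against the image the
guest has written to, e.g.:

  qemu-img check fs.qcow2

(with fs.qcow2 being the test image from earlier in the thread); it reports
leaked clusters and clusters whose refcount does not match how often they
are referenced.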

Kevin


Re: [PATCH 0/9] kvm tools, qcow: Improve QCOW performance

2011-07-11 Thread Pekka Enberg
On Mon, Jul 11, 2011 at 1:29 PM, Kevin Wolf kw...@redhat.com wrote:
 On 11.07.2011 11:41, Pekka Enberg wrote:
 Hi Kevin,

 On Mon, Jul 11, 2011 at 12:31 PM, Kevin Wolf kw...@redhat.com wrote:
 I would love to try out your code occasionally myself, but so far I have
 been to lazy to build a guest kernel only to be able to test it. Having
 to deal with the huge kernel git tree just for a small program doesn't
 really make it more fun either... Anyway, what I'm trying to say is that
 everything in my mails is from a purely theoretical POV. I have only
 looked at the code, but never really tried it.

 Most distro kernels boot just fine, AFAIK. If you have a kernel tree
 laying around, you can use

   git remote add kvm-tool git://github.com/penberg/linux-kvm.git
   git remote update kvm-tool

 to fetch the sources.

 Yeah, I do have the source and I read some parts of it. Just running it
 didn't seem to work with the standard Fedora kernel last time. Seems to
 work now, so it was probably my fault.

 Not sure what I did different last time, maybe I relied on it to pick up
 kernel and initrd automatically from the host (it finds the kernel, but
 not the initrd).

Yeah, we should really add automatic initrd detection too.

 As Ingo already said, the cache mode is probably the major difference.
 From what I can see in your code, cache=writeback would be the
 equivalent for what tools/kvm is doing, however cache=none (i.e.
 O_DIRECT) is what people usually do with qemu.

 Yup, I posted 'cache=writeback' results too which are much closer to
 tools/kvm numbers.

 Saw it. cache=none would probably help with the stability, but of course
 you would also have to add O_DIRECT to tools/kvm to make it fair.

 And then there seems to be another big difference. I hope I'm not
 missing anything, but you seem to be completely lacking refcount
 handling for qcow2. This is okay for read-only image, but with write
 access to the image, you're corrupting the images if you don't update
 the refcounts. Have you checked qcow2 images with qemu-img check after
 tools/kvm having written to it?

 Maintaining the right order between L2 writes and refcount block writes
 is another source of flushes in qemu, which of course makes a difference
 for performance.

 Yes, you're absolutely correct. We don't support copy-on-write images
 and I didn't realize until yesterday evening that we don't even check
 the 'copied' bit to make sure writes are safe.

 However, for these particular numbers, it doesn't matter that much
 because it's all append-only and thus shouldn't trigger any of the
 copy-on-write paths.

 It has nothing to do with copy on write. Well, of course COW is the
 reason why the refcounts exist at all, but for a correct qcow2 image
 they must be consistent even when you don't do COW.

 The problem is that when you run an image, in which tools/kvm has
 allocated new clusters, in qemu, it will use the refcount table and
 still see the clusters as free. So you'll end up with two guest disk
 clusters mapped to the same cluster in the image file and that obviously
 means that you'll get data corruption.

 qemu-img check would tell you about such inconsistencies.

Aah, OK, we need to fix that. Thanks!


Re: [PATCH 0/9] kvm tools, qcow: Improve QCOW performance

2011-07-11 Thread Ingo Molnar

* Pekka Enberg penb...@kernel.org wrote:

  As Ingo already said, the cache mode is probably the major 
  difference. From what I can see in your code, cache=writeback 
  would be the equivalent for what tools/kvm is doing, however 
  cache=none (i.e. O_DIRECT) is what people usually do with qemu.
 
 Yup, I posted 'cache=writeback' results too which are much closer 
 to tools/kvm numbers.

tools/kvm/ seems to be about 20% faster on average:

  511229952 bytes (511 MB) copied, 7.68312 s, 66.5 MB/s
  511229952 bytes (511 MB) copied, 7.54065 s, 67.8 MB/s
  511229952 bytes (511 MB) copied, 9.34749 s, 54.7 MB/s
  511229952 bytes (511 MB) copied, 9.2421 s, 55.3 MB/s
  511229952 bytes (511 MB) copied, 9.9364 s, 51.5 MB/s
  511229952 bytes (511 MB) copied, 10.0337 s, 51.0 MB/s
  511229952 bytes (511 MB) copied, 9.39502 s, 54.4 MB/s

versus the qemu numbers:

  511229952 bytes (511 MB) copied, 10.0879 s, 50.7 MB/s
  511229952 bytes (511 MB) copied, 4.92686 s, 104 MB/s
  511229952 bytes (511 MB) copied, 13.1955 s, 38.7 MB/s
  511229952 bytes (511 MB) copied, 10.7322 s, 47.6 MB/s
  511229952 bytes (511 MB) copied, 9.46115 s, 54.0 MB/s
  511229952 bytes (511 MB) copied, 14.9963 s, 34.1 MB/s
  511229952 bytes (511 MB) copied, 11.1701 s, 45.8 MB/s

but indeed there's (much) more variability in the Qemu numbers, 
suggesting some cache artifact.

Are all of these measurements done via /dev/shm, to stabilize the 
numbers and to remove disk IO delay artifacts?

Thanks,

Ingo


Re: [PATCH 0/9] kvm tools, qcow: Improve QCOW performance

2011-07-11 Thread Pekka Enberg
On Mon, Jul 11, 2011 at 1:36 PM, Ingo Molnar mi...@elte.hu wrote:

 * Pekka Enberg penb...@kernel.org wrote:

  As Ingo already said, the cache mode is probably the major
  difference. From what I can see in your code, cache=writeback
  would be the equivalent for what tools/kvm is doing, however
  cache=none (i.e. O_DIRECT) is what people usually do with qemu.

 Yup, I posted 'cache=writeback' results too which are much closer
 to tools/kvm numbers.

 tools/kvm/ seems to be about 20% faster on average:

   511229952 bytes (511 MB) copied, 7.68312 s, 66.5 MB/s
   511229952 bytes (511 MB) copied, 7.54065 s, 67.8 MB/s
   511229952 bytes (511 MB) copied, 9.34749 s, 54.7 MB/s
   511229952 bytes (511 MB) copied, 9.2421 s, 55.3 MB/s
   511229952 bytes (511 MB) copied, 9.9364 s, 51.5 MB/s
   511229952 bytes (511 MB) copied, 10.0337 s, 51.0 MB/s
   511229952 bytes (511 MB) copied, 9.39502 s, 54.4 MB/s

 versus the qemu numbers:

  511229952 bytes (511 MB) copied, 10.0879 s, 50.7 MB/s
  511229952 bytes (511 MB) copied, 4.92686 s, 104 MB/s
  511229952 bytes (511 MB) copied, 13.1955 s, 38.7 MB/s
  511229952 bytes (511 MB) copied, 10.7322 s, 47.6 MB/s
  511229952 bytes (511 MB) copied, 9.46115 s, 54.0 MB/s
  511229952 bytes (511 MB) copied, 14.9963 s, 34.1 MB/s
  511229952 bytes (511 MB) copied, 11.1701 s, 45.8 MB/s

 but indeed there's (much) more variability in the Qemu numbers,
 suggesting some cache artifact.

 Are all of these measurements done via /dev/shm, to stabilize the
 numbers and to remove disk IO delay artifacts?

No, I wanted to include disk IO delay artifacts because I was
comparing tools/kvm to itself using seekwatcher to see what's really
happening. And as Kevin pointed out, we're still missing refcount
tables from tools/kvm so it's not a proper comparison anyway. It does
show that tools/kvm QCOW performance improved significantly, though.

Pekka


Re: [Qemu-devel] [RFC v3 31/56] ac97: convert to memory API

2011-07-11 Thread Avi Kivity

On 07/11/2011 04:42 AM, Anthony Liguori wrote:

On 07/10/2011 03:33 PM, malc wrote:

On Sun, 10 Jul 2011, Avi Kivity wrote:


fixes BAR sizing as well.


I find this patch disgusting, the read and write handlers in particular.


Shouldn't it be possible to do something like:

typedef struct OldMemoryRegionOps {
MemoryRegionOps parent_ops;
CPUReadMemoryFunc *readfn[3];
CPUWriteMemoryFunc *writefn[3];
void *opaque;
} OldMemoryRegionOps;

That should allow old-style implementations to be converted without 
introducing trampoline functions everywhere.


Here's a new version:


diff --git a/hw/ac97.c b/hw/ac97.c
index 0b59896..b4f377d 100644
--- a/hw/ac97.c
+++ b/hw/ac97.c
@@ -160,8 +160,9 @@ typedef struct AC97LinkState {
 SWVoiceIn *voice_mc;
 int invalid_freq[3];
 uint8_t silence[128];
-uint32_t base[2];
 int bup_flag;
+MemoryRegion io_nam;
+MemoryRegion io_nabm;
 } AC97LinkState;

 enum {
@@ -583,7 +584,7 @@ static uint32_t nam_readw (void *opaque, uint32_t addr)
 {
 AC97LinkState *s = opaque;
 uint32_t val = ~0U;
-uint32_t index = addr - s->base[0];
+uint32_t index = addr;
 s->cas = 0;
 val = mixer_load (s, index);
 return val;
@@ -611,7 +612,7 @@ static void nam_writeb (void *opaque, uint32_t addr, uint32_t val)

 static void nam_writew (void *opaque, uint32_t addr, uint32_t val)
 {
 AC97LinkState *s = opaque;
-uint32_t index = addr - s->base[0];
+uint32_t index = addr;
 s->cas = 0;
 switch (index) {
 case AC97_Reset:
@@ -714,7 +715,7 @@ static uint32_t nabm_readb (void *opaque, uint32_t addr)
 {
 AC97LinkState *s = opaque;
 AC97BusMasterRegs *r = NULL;
-uint32_t index = addr - s->base[1];
+uint32_t index = addr;
 uint32_t val = ~0U;

 switch (index) {
@@ -769,7 +770,7 @@ static uint32_t nabm_readw (void *opaque, uint32_t addr)
 {
 AC97LinkState *s = opaque;
 AC97BusMasterRegs *r = NULL;
-uint32_t index = addr - s->base[1];
+uint32_t index = addr;
 uint32_t val = ~0U;

 switch (index) {
@@ -798,7 +799,7 @@ static uint32_t nabm_readl (void *opaque, uint32_t addr)
 {
 AC97LinkState *s = opaque;
 AC97BusMasterRegs *r = NULL;
-uint32_t index = addr - s->base[1];
+uint32_t index = addr;
 uint32_t val = ~0U;

 switch (index) {
@@ -848,7 +849,7 @@ static void nabm_writeb (void *opaque, uint32_t addr, uint32_t val)

 {
 AC97LinkState *s = opaque;
 AC97BusMasterRegs *r = NULL;
-uint32_t index = addr - s->base[1];
+uint32_t index = addr;
 switch (index) {
 case PI_LVI:
 case PO_LVI:
@@ -904,7 +905,7 @@ static void nabm_writew (void *opaque, uint32_t addr, uint32_t val)

 {
 AC97LinkState *s = opaque;
 AC97BusMasterRegs *r = NULL;
-uint32_t index = addr - s->base[1];
+uint32_t index = addr;
 switch (index) {
 case PI_SR:
 case PO_SR:
@@ -924,7 +925,7 @@ static void nabm_writel (void *opaque, uint32_t addr, uint32_t val)

 {
 AC97LinkState *s = opaque;
 AC97BusMasterRegs *r = NULL;
-uint32_t index = addr - s->base[1];
+uint32_t index = addr;
 switch (index) {
 case PI_BDBAR:
 case PO_BDBAR:
@@ -1230,31 +1231,33 @@ static const VMStateDescription vmstate_ac97 = {
 }
 };

-static void ac97_map (PCIDevice *pci_dev, int region_num,
-  pcibus_t addr, pcibus_t size, int type)
-{
-AC97LinkState *s = DO_UPCAST (AC97LinkState, dev, pci_dev);
-PCIDevice *d = s->dev;
-
-if (!region_num) {
-s->base[0] = addr;
-register_ioport_read (addr, 256 * 1, 1, nam_readb, d);
-register_ioport_read (addr, 256 * 2, 2, nam_readw, d);
-register_ioport_read (addr, 256 * 4, 4, nam_readl, d);
-register_ioport_write (addr, 256 * 1, 1, nam_writeb, d);
-register_ioport_write (addr, 256 * 2, 2, nam_writew, d);
-register_ioport_write (addr, 256 * 4, 4, nam_writel, d);
-}
-else {
-s->base[1] = addr;
-register_ioport_read (addr, 64 * 1, 1, nabm_readb, d);
-register_ioport_read (addr, 64 * 2, 2, nabm_readw, d);
-register_ioport_read (addr, 64 * 4, 4, nabm_readl, d);
-register_ioport_write (addr, 64 * 1, 1, nabm_writeb, d);
-register_ioport_write (addr, 64 * 2, 2, nabm_writew, d);
-register_ioport_write (addr, 64 * 4, 4, nabm_writel, d);
-}
-}
+static const MemoryRegionPortio nam_portio[] = {
+{ 0, 256 * 1, 1, .read = nam_readb, },
+{ 0, 256 * 2, 2, .read = nam_readw, },
+{ 0, 256 * 4, 4, .read = nam_readl, },
+{ 0, 256 * 1, 1, .write = nam_writeb, },
+{ 0, 256 * 2, 2, .write = nam_writew, },
+{ 0, 256 * 4, 4, .write = nam_writel, },
+PORTIO_END,
+};
+
+static MemoryRegionOps ac97_io_nam_ops = {
+.old_portio = nam_portio,
+};
+
+static const MemoryRegionPortio nabm_portio[] = {
+{ 0, 64 * 1, 1, .read = nabm_readb, },
+{ 0, 64 * 2, 2, .read = nabm_readw, },
+{ 0, 64 * 4, 4, .read = nabm_readl, },
+{ 0, 64 * 1, 1, 

KVM call agenda for July 12

2011-07-11 Thread Juan Quintela

Hi

Please send in any agenda items you are interested in covering.

Later, Juan.


Re: [PATCH v5 4/9] KVM-HV: KVM Steal time implementation

2011-07-11 Thread Avi Kivity

On 07/07/2011 08:07 PM, Glauber Costa wrote:

+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+u64 delta;
+
+if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+return;
+
+if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+return;


The guest memory page is not pinned, sleeping via
__copy_from_user/to_user is not allowed in vcpu_load context. Either pin
it or use atomic accessors.



I do recognize the problem.
Avi, what's your take here?



The easiest solution is to set a KVM_REQ bit in atomic context, and move 
the sleepy code to vcpu_enter_guest().
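
A minimal sketch of that approach (KVM_REQ_STEAL_UPDATE is a made-up request
bit here, used only to illustrate the pattern):

    /* in the atomic path, e.g. where the vcpu is loaded: */
    kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);

    /* later, in vcpu_enter_guest(), where sleeping is allowed: */
    if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
            record_steal_time(vcpu);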



+case MSR_KVM_STEAL_TIME:
+vcpu->arch.st.msr_val = data;
+
+if (!(data & KVM_MSR_ENABLED)) {
+break;
+}


On failure below this point, msr_val should be cleared of 
KVM_MSR_ENABLED?

No, msr_val has to hold whatever the guest wrote into it.
We should probably use an independent variable here to indicate that 
we failed to activate it.


If we fail, we return a #GP to the guest (and don't write any value into 
the msr).


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



[PATCH 0/4] scsi fixes

2011-07-11 Thread Hannes Reinecke
Hi all,

these are some fixes I found during debugging my megasas HBA emulation.
This time I've sent them as a separate patchset for inclusion.
All of them have been acked, so please apply.

Hannes Reinecke (4):
  iov: Update parameter usage in iov_(to|from)_buf()
  scsi: Add 'hba_private' to SCSIRequest
  scsi-disk: Fixup debugging statement
  scsi-disk: Mask out serial number EVPD

 hw/esp.c   |2 +-
 hw/lsi53c895a.c|   22 +++-
 hw/scsi-bus.c  |9 +--
 hw/scsi-disk.c |   21 ++-
 hw/scsi-generic.c  |5 ++-
 hw/scsi.h  |   10 ++--
 hw/spapr_vscsi.c   |   29 ---
 hw/usb-msd.c   |9 +---
 hw/virtio-net.c|2 +-
 hw/virtio-serial-bus.c |2 +-
 iov.c  |   49 ++-
 iov.h  |   10 
 12 files changed, 84 insertions(+), 86 deletions(-)

-- 
1.7.3.4



[PATCH 1/4] iov: Update parameter usage in iov_(to|from)_buf()

2011-07-11 Thread Hannes Reinecke
iov_to_buf() has an 'offset' parameter, but iov_from_buf() does not.
This patch adds the missing parameter to iov_from_buf().
It also renames the 'offset' parameter to 'iov_off' to
emphasize it's the offset into the iovec and not the buffer.

Signed-off-by: Hannes Reinecke h...@suse.de
Acked-by: Alexander Graf ag...@suse.de
---
 hw/virtio-net.c|2 +-
 hw/virtio-serial-bus.c |2 +-
 iov.c  |   49 ++-
 iov.h  |   10 
 4 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 6997e02..a32cc01 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -657,7 +657,7 @@ static ssize_t virtio_net_receive(VLANClientState *nc, const uint8_t *buf, size_
 
 /* copy in packet.  ugh */
 len = iov_from_buf(sg, elem.in_num,
-   buf + offset, size - offset);
+   buf + offset, 0, size - offset);
 total += len;
 offset += len;
 /* If buffers can't be merged, at this point we
diff --git a/hw/virtio-serial-bus.c b/hw/virtio-serial-bus.c
index 7f6db7b..53c58d0 100644
--- a/hw/virtio-serial-bus.c
+++ b/hw/virtio-serial-bus.c
@@ -103,7 +103,7 @@ static size_t write_to_port(VirtIOSerialPort *port,
 }
 
 len = iov_from_buf(elem.in_sg, elem.in_num,
-   buf + offset, size - offset);
+   buf + offset, 0, size - offset);
 offset += len;
 
 virtqueue_push(vq, elem, len);
diff --git a/iov.c b/iov.c
index 588cd04..1e02791 100644
--- a/iov.c
+++ b/iov.c
@@ -14,56 +14,61 @@
 
 #include "iov.h"
 
-size_t iov_from_buf(struct iovec *iov, unsigned int iovcnt,
-const void *buf, size_t size)
+size_t iov_from_buf(struct iovec *iov, unsigned int iov_cnt,
+const void *buf, size_t iov_off, size_t size)
 {
-size_t offset;
+size_t iovec_off, buf_off;
 unsigned int i;
 
-offset = 0;
-for (i = 0; offset < size && i < iovcnt; i++) {
-size_t len;
+iovec_off = 0;
+buf_off = 0;
+for (i = 0; i < iov_cnt && size; i++) {
+if (iov_off < (iovec_off + iov[i].iov_len)) {
+size_t len = MIN((iovec_off + iov[i].iov_len) - iov_off, size);
 
-len = MIN(iov[i].iov_len, size - offset);
+memcpy(iov[i].iov_base + (iov_off - iovec_off), buf + buf_off, len);
 
-memcpy(iov[i].iov_base, buf + offset, len);
-offset += len;
+buf_off += len;
+iov_off += len;
+size -= len;
+}
+iovec_off += iov[i].iov_len;
 }
-return offset;
+return buf_off;
 }
 
-size_t iov_to_buf(const struct iovec *iov, const unsigned int iovcnt,
-  void *buf, size_t offset, size_t size)
+size_t iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt,
+  void *buf, size_t iov_off, size_t size)
 {
 uint8_t *ptr;
-size_t iov_off, buf_off;
+size_t iovec_off, buf_off;
 unsigned int i;
 
 ptr = buf;
-iov_off = 0;
+iovec_off = 0;
 buf_off = 0;
-for (i = 0; i < iovcnt && size; i++) {
-if (offset < (iov_off + iov[i].iov_len)) {
-size_t len = MIN((iov_off + iov[i].iov_len) - offset , size);
+for (i = 0; i < iov_cnt && size; i++) {
+if (iov_off < (iovec_off + iov[i].iov_len)) {
+size_t len = MIN((iovec_off + iov[i].iov_len) - iov_off , size);
 
-memcpy(ptr + buf_off, iov[i].iov_base + (offset - iov_off), len);
+memcpy(ptr + buf_off, iov[i].iov_base + (iov_off - iovec_off), len);
 
 buf_off += len;
-offset += len;
+iov_off += len;
 size -= len;
 }
-iov_off += iov[i].iov_len;
+iovec_off += iov[i].iov_len;
 }
 return buf_off;
 }
 
-size_t iov_size(const struct iovec *iov, const unsigned int iovcnt)
+size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt)
 {
 size_t len;
 unsigned int i;
 
 len = 0;
-for (i = 0; i < iovcnt; i++) {
+for (i = 0; i < iov_cnt; i++) {
 len += iov[i].iov_len;
 }
 return len;
diff --git a/iov.h b/iov.h
index 60a8547..110f67a 100644
--- a/iov.h
+++ b/iov.h
@@ -12,8 +12,8 @@
 
 #include qemu-common.h
 
-size_t iov_from_buf(struct iovec *iov, unsigned int iovcnt,
-const void *buf, size_t size);
-size_t iov_to_buf(const struct iovec *iov, const unsigned int iovcnt,
-  void *buf, size_t offset, size_t size);
-size_t iov_size(const struct iovec *iov, const unsigned int iovcnt);
+size_t iov_from_buf(struct iovec *iov, unsigned int iov_cnt,
+const void *buf, size_t iov_off, size_t size);
+size_t iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt,
+  void *buf, size_t iov_off, size_t size);
+size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt);
-- 
1.7.3.4


[PATCH 2/4] scsi: Add 'hba_private' to SCSIRequest

2011-07-11 Thread Hannes Reinecke
'tag' is just an abstraction to identify the command
from the driver. So we should make that explicit by
replacing 'tag' with a driver-defined pointer 'hba_private'.
This saves the lookup for drivers handling several commands
in parallel.
'tag' is still being kept for tracing purposes.

Signed-off-by: Hannes Reinecke h...@suse.de
Acked-by: Paolo Bonzini pbonz...@redhat.com
---
 hw/esp.c  |2 +-
 hw/lsi53c895a.c   |   22 --
 hw/scsi-bus.c |9 ++---
 hw/scsi-disk.c|4 ++--
 hw/scsi-generic.c |5 +++--
 hw/scsi.h |   10 +++---
 hw/spapr_vscsi.c  |   29 +
 hw/usb-msd.c  |9 +
 8 files changed, 37 insertions(+), 53 deletions(-)

diff --git a/hw/esp.c b/hw/esp.c
index 8e95672..69209bd 100644
--- a/hw/esp.c
+++ b/hw/esp.c
@@ -244,7 +244,7 @@ static void do_busid_cmd(ESPState *s, uint8_t *buf, uint8_t 
busid)
 
 DPRINTF(do_busid_cmd: busid 0x%x\n, busid);
 lun = busid  7;
-s->current_req = scsi_req_new(s->current_dev, 0, lun);
+s->current_req = scsi_req_new(s->current_dev, 0, lun, NULL);
 datalen = scsi_req_enqueue(s->current_req, buf);
 s->ti_size = datalen;
 if (datalen != 0) {
diff --git a/hw/lsi53c895a.c b/hw/lsi53c895a.c
index 940b43a..69eec1d 100644
--- a/hw/lsi53c895a.c
+++ b/hw/lsi53c895a.c
@@ -661,7 +661,7 @@ static lsi_request *lsi_find_by_tag(LSIState *s, uint32_t tag)
 static void lsi_request_cancelled(SCSIRequest *req)
 {
 LSIState *s = DO_UPCAST(LSIState, dev.qdev, req->bus->qbus.parent);
-lsi_request *p;
+lsi_request *p = req->hba_private;
 
 if (s->current && req == s->current->req) {
 scsi_req_unref(req);
@@ -670,7 +670,6 @@ static void lsi_request_cancelled(SCSIRequest *req)
 return;
 }
 
-p = lsi_find_by_tag(s, req->tag);
 if (p) {
 QTAILQ_REMOVE(s-queue, p, next);
 scsi_req_unref(req);
@@ -680,18 +679,12 @@ static void lsi_request_cancelled(SCSIRequest *req)
 
 /* Record that data is available for a queued command.  Returns zero if
the device was reselected, nonzero if the IO is deferred.  */
-static int lsi_queue_tag(LSIState *s, uint32_t tag, uint32_t len)
+static int lsi_queue_req(LSIState *s, SCSIRequest *req, uint32_t len)
 {
-lsi_request *p;
-
-p = lsi_find_by_tag(s, tag);
-if (!p) {
-BADF("IO with unknown tag %d\n", tag);
-return 1;
-}
+lsi_request *p = req->hba_private;
 
 if (p->pending) {
-BADF("Multiple IO pending for tag %d\n", tag);
+BADF("Multiple IO pending for request %p\n", p);
 }
 p->pending = len;
 /* Reselect if waiting for it, or if reselection triggers an IRQ
@@ -743,9 +736,9 @@ static void lsi_transfer_data(SCSIRequest *req, uint32_t len)
 LSIState *s = DO_UPCAST(LSIState, dev.qdev, req->bus->qbus.parent);
 int out;
 
-if (s->waiting == 1 || !s->current || req->tag != s->current->tag ||
+if (s->waiting == 1 || !s->current || req->hba_private != s->current ||
 (lsi_irq_on_rsl(s) && !(s->scntl1 & LSI_SCNTL1_CON))) {
-if (lsi_queue_tag(s, req->tag, len)) {
+if (lsi_queue_req(s, req, len)) {
 return;
 }
 }
@@ -789,7 +782,8 @@ static void lsi_do_command(LSIState *s)
 assert(s->current == NULL);
 s->current = qemu_mallocz(sizeof(lsi_request));
 s->current->tag = s->select_tag;
-s->current->req = scsi_req_new(dev, s->current->tag, s->current_lun);
+s->current->req = scsi_req_new(dev, s->current->tag, s->current_lun,
+   s->current);
 
 n = scsi_req_enqueue(s->current->req, buf);
 if (n) {
diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index ad6a730..8b1a412 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -131,7 +131,8 @@ int scsi_bus_legacy_handle_cmdline(SCSIBus *bus)
 return res;
 }
 
-SCSIRequest *scsi_req_alloc(size_t size, SCSIDevice *d, uint32_t tag, uint32_t lun)
+SCSIRequest *scsi_req_alloc(size_t size, SCSIDevice *d, uint32_t tag,
+uint32_t lun, void *hba_private)
 {
 SCSIRequest *req;
 
@@ -141,14 +142,16 @@ SCSIRequest *scsi_req_alloc(size_t size, SCSIDevice *d, uint32_t tag, uint32_t l
 req->dev = d;
 req->tag = tag;
 req->lun = lun;
+req->hba_private = hba_private;
 req->status = -1;
 trace_scsi_req_alloc(req->dev->id, req->lun, req->tag);
 return req;
 }
 
-SCSIRequest *scsi_req_new(SCSIDevice *d, uint32_t tag, uint32_t lun)
+SCSIRequest *scsi_req_new(SCSIDevice *d, uint32_t tag, uint32_t lun,
+  void *hba_private)
 {
-return d->info->alloc_req(d, tag, lun);
+return d->info->alloc_req(d, tag, lun, hba_private);
 }
 
 uint8_t *scsi_req_get_buf(SCSIRequest *req)
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index a8c7372..c2a99fe 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -81,13 +81,13 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type);
 static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf);
 
 static 

[PATCH 3/4] scsi-disk: Fixup debugging statement

2011-07-11 Thread Hannes Reinecke
A debugging statement wasn't converted to the new interface.

Signed-off-by: Hannes Reinecke h...@suse.de
Acked-by: Paolo Bonzini pbonz...@redhat.com
---
 hw/scsi-disk.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index c2a99fe..5804662 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -1007,7 +1007,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
 
 command = buf[0];
 outbuf = (uint8_t *)r->iov.iov_base;
-DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", lun, tag, buf[0]);
+DPRINTF("Command: lun=%d tag=0x%x data=0x%02x", req->lun, req->tag, buf[0]);
 
 if (scsi_req_parse(&r->req, buf) != 0) {
 BADF("Unsupported command length, command %x\n", command);
-- 
1.7.3.4



[PATCH 4/4] scsi-disk: Mask out serial number EVPD

2011-07-11 Thread Hannes Reinecke
If the serial number is not set we should mask it out in the
list of supported VPD pages and mark it as not supported.

Signed-off-by: Hannes Reinecke h...@suse.de
Acked-by: Paolo Bonzini pbonz...@redhat.com
---
 hw/scsi-disk.c |   15 ---
 1 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 5804662..05d14ab 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -398,7 +398,8 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
 "buffer size %zd\n", req->cmd.xfer);
 pages = buflen++;
 outbuf[buflen++] = 0x00; // list of supported pages (this page)
-outbuf[buflen++] = 0x80; // unit serial number
+if (s->serial)
+outbuf[buflen++] = 0x80; // unit serial number
 outbuf[buflen++] = 0x83; // device identification
 if (s->drive_kind == SCSI_HD) {
 outbuf[buflen++] = 0xb0; // block limits
@@ -409,8 +410,14 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
 }
 case 0x80: /* Device serial number, optional */
 {
-int l = strlen(s->serial);
+int l;
 
+if (!s->serial) {
+DPRINTF("Inquiry (EVPD[Serial number] not supported\n");
+return -1;
+}
+
+l = strlen(s->serial);
 if (l > req->cmd.xfer)
 l = req->cmd.xfer;
 if (l > 20)
@@ -1203,7 +1210,9 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
 if (!s->serial) {
 /* try to fall back to value set with legacy -drive serial=... */
 dinfo = drive_get_by_blockdev(s->bs);
-s->serial = qemu_strdup(*dinfo->serial ? dinfo->serial : "0");
+if (*dinfo->serial) {
+s->serial = qemu_strdup(dinfo->serial);
+}
 }
 
 if (!s->version) {
-- 
1.7.3.4



Re: [PATCH v5 6/9] add jump labels for ia64 paravirt

2011-07-11 Thread Avi Kivity

On 07/04/2011 06:32 PM, Glauber Costa wrote:

Since in a later patch I intend to call jump labels inside
CONFIG_PARAVIRT, IA64 would fail to compile if they are not
provided. This patch provides those jump labels for the IA64
architecture.



Please get an ack for this from the ia64 maintainer.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: [PATCH v5 4/9] KVM-HV: KVM Steal time implementation

2011-07-11 Thread Avi Kivity

On 07/04/2011 06:32 PM, Glauber Costa wrote:

To implement steal time, we need the hypervisor to pass the guest
information about how much time was spent running other processes
outside the VM, while the vcpu had meaningful work to do - halt
time does not count.

This information is acquired through the run_delay field of
delayacct/schedstats infrastructure, that counts time spent in a
runqueue but not running.

Steal time is a per-cpu information, so the traditional MSR-based
infrastructure is used. A new msr, KVM_MSR_STEAL_TIME, holds the
memory area address containing information about steal time

This patch contains the hypervisor part of the steal time infrasructure,
and can be backported independently of the guest portion.

Signed-off-by: Glauber Costaglom...@redhat.com
CC: Rik van Rielr...@redhat.com
CC: Jeremy Fitzhardingejeremy.fitzhardi...@citrix.com
CC: Peter Zijlstrapet...@infradead.org
CC: Avi Kivitya...@redhat.com
CC: Anthony Liguorialigu...@us.ibm.com
CC: Eric B Munsonemun...@mgebm.net


I think Peter acked this, yes?  Please include his acks.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: [PATCH v5 4/9] KVM-HV: KVM Steal time implementation

2011-07-11 Thread Avi Kivity

On 07/11/2011 04:10 PM, Avi Kivity wrote:


I think Peter acked this, yes?  Please include his acks.



Sorry, I meant this for the kernel/sched.c bits.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: [Qemu-devel] KVM call agenda for July 12

2011-07-11 Thread Alexander Graf

On 11.07.2011 at 13:46, Juan Quintela quint...@redhat.com wrote:

 
 Hi
 
 Please send in any agenda items you are interested in covering.

Device passthrough on non-PCI (take 2)


Alex

 


Re: [PATCH v5 4/9] KVM-HV: KVM Steal time implementation

2011-07-11 Thread Glauber Costa

On 07/11/2011 10:11 AM, Avi Kivity wrote:

On 07/11/2011 04:10 PM, Avi Kivity wrote:


I think Peter acked this, yes? Please include his acks.



Sorry, I meant this for the kernel/sched.c bits.


Yes he did, after I posted the patches.
I will include his acks in the respin, since they won't change the 
scheduler bits.




Re: [PATCH v5 6/9] add jump labels for ia64 paravirt

2011-07-11 Thread Glauber Costa

On 07/11/2011 10:09 AM, Avi Kivity wrote:

On 07/04/2011 06:32 PM, Glauber Costa wrote:

Since in a later patch I intend to call jump labels inside
CONFIG_PARAVIRT, IA64 would fail to compile if they are not
provided. This patch provides those jump labels for the IA64
architecture.



Please get an ack for this from the ia64 maintainer.



The ones listed in paravirt.c are CC'd. I am CC'ing all the other
folks related to IA64 listed in MAINTAINERS now.

Just a heads up so the new folks CC'd can get up to speed:
I am proposing moving the steal time calculation to inside the core 
scheduler. This move will make it easier for us to make decisions based 
on steal time accounting on virtual machines. Ia64 KVM may or may not 
follow this route - nothing that already works should break with this 
change.


This patch is needed only to avoid breaking compilation, since it 
introduces two new variables that are expected be present when 
CONFIG_PARAVIRT, to paravirt.c.




Re: [PATCH 0/4] scsi fixes

2011-07-11 Thread Stefan Hajnoczi
On Mon, Jul 11, 2011 at 2:02 PM, Hannes Reinecke h...@suse.de wrote:
 Hi all,

 these are some fixes I found during debugging my megasas HBA emulation.
 This time I've sent them as a separate patchset for inclusion.
 All of them have been acked, so please apply.

Are SCSI patches going through Kevin's tree?

If not, perhaps Paolo or I should keep a tree and start doing some
sanity testing on the subsystem in the future.

Stefan


Re: [PATCH 0/4] scsi fixes

2011-07-11 Thread Kevin Wolf
On 11.07.2011 15:34, Stefan Hajnoczi wrote:
 On Mon, Jul 11, 2011 at 2:02 PM, Hannes Reinecke h...@suse.de wrote:
 Hi all,

 these are some fixes I found during debugging my megasas HBA emulation.
 This time I've sent them as a separate patchset for inclusion.
 All of them have been acked, so please apply.
 
 Are SCSI patches going through Kevin's tree?
 
 If not, perhaps Paolo or I should keep a tree and start doing some
 sanity testing on the subsystem in the future.

As long as we don't have a SCSI maintainer, I'm going to pick them up
for the block tree once they have received some review.

Doesn't mean that nobody should be doing sanity testing, of course. If
anyone wants to take care of picking up and reviewing all SCSI patches,
I'm also happy to pull from a separate tree.

Kevin


Re: [PATCH 0/4] scsi fixes

2011-07-11 Thread Hannes Reinecke

On 07/11/2011 03:42 PM, Kevin Wolf wrote:

On 11.07.2011 15:34, Stefan Hajnoczi wrote:

On Mon, Jul 11, 2011 at 2:02 PM, Hannes Reinecke h...@suse.de wrote:

Hi all,

these are some fixes I found during debugging my megasas HBA emulation.
This time I've sent them as a separate patchset for inclusion.
All of them have been acked, so please apply.


Are SCSI patches going through Kevin's tree?

If not, perhaps Paolo or I should keep a tree and start doing some
sanity testing on the subsystem in the future.


As long as we don't have a SCSI maintainer, I'm going to pick them up
for the block tree when they have receive some review.

Doesn't mean that nobody should be doing sanity testing, of course. If
anyone wants to take care of picking up and reviewing all SCSI patches,
I'm also happy to pull from a separate tree.

Patches have already been reviewed and tested, in conjunction with 
my megasas HBA emulation patchset.

This is just a repost as a separate patchset to get them in.

Cheers,

Hannes
--
Dr. Hannes Reinecke   zSeries  Storage
h...@suse.de  +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)


Re: [PATCH v5 4/9] KVM-HV: KVM Steal time implementation

2011-07-11 Thread Glauber Costa

On 07/11/2011 09:58 AM, Avi Kivity wrote:

On 07/07/2011 08:07 PM, Glauber Costa wrote:

+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+ u64 delta;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ return;


The guest memory page is not pinned, sleeping via
__copy_from_user/to_user is not allowed in vcpu_load context. Either pin
it or use atomic acessors.



I do recognize the problem.
Avi, what's your take here?



The easiest solution is to set a KVM_REQ bit in atomic context, and move
the sleepy code to vcpu_enter_guest().


Or I can move it all inside vcpu_run, or close enough to it. This will 
account more hypervisor time as steal time, but it seemed to be what 
some people wanted in the first place.


Given the simplification we would win - not needing a REQ set, it might 
be worth it.



+ case MSR_KVM_STEAL_TIME:
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED)) {
+ break;
+ }


On failure below this point, msr_val should be cleared of
KVM_MSR_ENABLED?

No, msr_val has to hold whatever the guest wrote into it.
We should probably use an independent variable here to indicate that
we failed to activate it.


If we fail, we return a #GP to the guest (and don't write any value into
the msr).





Re: [PATCH v5 6/9] add jump labels for ia64 paravirt

2011-07-11 Thread Isaku Yamahata
On Mon, Jul 11, 2011 at 10:24:00AM -0300, Glauber Costa wrote:
 On 07/11/2011 10:09 AM, Avi Kivity wrote:
 On 07/04/2011 06:32 PM, Glauber Costa wrote:
 Since in a later patch I intend to call jump labels inside
 CONFIG_PARAVIRT, IA64 would fail to compile if they are not
 provided. This patch provides those jump labels for the IA64
 architecture.


 Please get an ack for this from the ia64 maintainer.


 The ones listed in paravirt.c are CC'd. I am CC'ing all the other
 folks related to IA64 listed in MAINTAINERS now.

 Just a heads up so the new folks CC'd can get up to speed:
 I am proposing moving the steal time calculation to inside the core  
 scheduler. This move will make it easier for us to make decisions based  
 on steal time accounting on virtual machines. Ia64 KVM may or may not  
 follow this route - nothing that already works should break with this  
 change.

 This patch is needed only to avoid breaking compilation, since it  
 introduces two new variables that are expected be present when  
 CONFIG_PARAVIRT, to paravirt.c.

Although the ia64 maintainer is Tony Luck,
I'm fine with the change as the ia64 paravirt author.

Acked-by: Isaku Yamahata yamah...@valinux.co.jp

-- 
yamahata


Re: PowerPoint performance degrades greatly when logging on through rdesktop to winxp, and when lotus notes is running.

2011-07-11 Thread Avi Kivity

On 07/05/2011 05:05 PM, ya su wrote:

the previous kvm-stat output is produced based on the current kvm-kmod git
version, patched with xiaoguangrong's mmio page-fault series, but it can
not produce a trace-cmd report because the trace.dat file is truncated.


Please apply commit 140fe3b1ab9c082182ef13359fab4ddba95c24c3 from 
upstream and retry, it will fix tracing for you.  I prefer a trace from 
a recent kernel since it has more information and the code is fresher in 
my memory.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



[PATCH v4 00/18] KVM: optimize for MMIO handled

2011-07-11 Thread Xiao Guangrong
Changes in this new version:
- fix the logic of dirty bit in [PATCH 04/19] KVM: MMU: cache
  mmio info on page fault path
- rename is_mmio_pfn() to is_noslot_pfn() to avoid the conflicts
  with kvm_is_mmio_pfn
- remove [PATCH 14/19] KVM: MMU: clean up spte updating and clearing
- completely check the last spte for mmio page fault, as suggested
  by Marcelo Tosatti


[PATCH v4 01/18] KVM: MMU: fix walking shadow page table

2011-07-11 Thread Xiao Guangrong
Properly check the last mapping, and do not walk to the next level if the last
spte is met

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da0f3b0..03323dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1517,10 +1517,6 @@ static bool shadow_walk_okay(struct 
kvm_shadow_walk_iterator *iterator)
if (iterator-level  PT_PAGE_TABLE_LEVEL)
return false;
 
-   if (iterator-level == PT_PAGE_TABLE_LEVEL)
-   if (is_large_pte(*iterator-sptep))
-   return false;
-
iterator-index = SHADOW_PT_INDEX(iterator-addr, iterator-level);
iterator-sptep = ((u64 *)__va(iterator-shadow_addr)) + 
iterator-index;
return true;
@@ -1528,6 +1524,11 @@ static bool shadow_walk_okay(struct 
kvm_shadow_walk_iterator *iterator)
 
 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 {
+   if (is_last_spte(*iterator-sptep, iterator-level)) {
+   iterator-level = 0;
+   return;
+   }
+
iterator-shadow_addr = *iterator-sptep  PT64_BASE_ADDR_MASK;
--iterator-level;
 }
-- 
1.7.5.4



[PATCH v4 02/18] KVM: MMU: do not update slot bitmap if spte is nonpresent

2011-07-11 Thread Xiao Guangrong
Set slot bitmap only if the spte is present

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   15 +++
 1 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 03323dc..02c839f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t 
gfn)
struct kvm_mmu_page *sp;
unsigned long *rmapp;
 
-   if (!is_rmap_spte(*spte))
-   return 0;
-
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp-spt, gfn);
rmapp = gfn_to_rmap(vcpu-kvm, gfn, sp-role.level);
@@ -2087,11 +2084,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
if (!was_rmapped  is_large_pte(*sptep))
++vcpu-kvm-stat.lpages;
 
-   page_header_update_slot(vcpu-kvm, sptep, gfn);
-   if (!was_rmapped) {
-   rmap_count = rmap_add(vcpu, sptep, gfn);
-   if (rmap_count  RMAP_RECYCLE_THRESHOLD)
-   rmap_recycle(vcpu, sptep, gfn);
+   if (is_shadow_present_pte(*sptep)) {
+   page_header_update_slot(vcpu-kvm, sptep, gfn);
+   if (!was_rmapped) {
+   rmap_count = rmap_add(vcpu, sptep, gfn);
+   if (rmap_count  RMAP_RECYCLE_THRESHOLD)
+   rmap_recycle(vcpu, sptep, gfn);
+   }
}
kvm_release_pfn_clean(pfn);
if (speculative) {
-- 
1.7.5.4



[PATCH v4 03/18] KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code

2011-07-11 Thread Xiao Guangrong
Introduce vcpu_mmio_gva_to_gpa to translate a gva to a gpa; we can use it
to clean up the code shared between read emulation and write emulation

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/x86.c |   42 +++---
 1 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b803f0..d77ac44 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3944,6 +3944,27 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+   gpa_t *gpa, struct x86_exception *exception,
+   bool write)
+{
+   u32 access = (kvm_x86_ops-get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+   if (write)
+   access |= PFERR_WRITE_MASK;
+
+   *gpa = vcpu-arch.walk_mmu-gva_to_gpa(vcpu, gva, access, exception);
+
+   if (*gpa == UNMAPPED_GVA)
+   return -1;
+
+   /* For APIC access vmexit */
+   if ((*gpa  PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+   return 1;
+
+   return 0;
+}
+
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
  unsigned long addr,
  void *val,
@@ -3951,8 +3972,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt 
*ctxt,
  struct x86_exception *exception)
 {
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-   gpa_t gpa;
-   int handled;
+   gpa_t gpa;
+   int handled, ret;
 
if (vcpu-mmio_read_completed) {
memcpy(val, vcpu-mmio_data, bytes);
@@ -3962,13 +3983,12 @@ static int emulator_read_emulated(struct 
x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
 
-   gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+   ret = vcpu_mmio_gva_to_gpa(vcpu, addr, gpa, exception, false);
 
-   if (gpa == UNMAPPED_GVA)
+   if (ret  0)
return X86EMUL_PROPAGATE_FAULT;
 
-   /* For APIC access vmexit */
-   if ((gpa  PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+   if (ret)
goto mmio;
 
if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -4019,16 +4039,16 @@ static int emulator_write_emulated_onepage(unsigned 
long addr,
   struct x86_exception *exception,
   struct kvm_vcpu *vcpu)
 {
-   gpa_t gpa;
-   int handled;
+   gpa_t gpa;
+   int handled, ret;
 
-   gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+   ret = vcpu_mmio_gva_to_gpa(vcpu, addr, gpa, exception, true);
 
-   if (gpa == UNMAPPED_GVA)
+   if (ret  0)
return X86EMUL_PROPAGATE_FAULT;
 
/* For APIC access vmexit */
-   if ((gpa  PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+   if (ret)
goto mmio;
 
if (emulator_write_phys(vcpu, gpa, val, bytes))
-- 
1.7.5.4



[PATCH v4 04/18] KVM: MMU: cache mmio info on page fault path

2011-07-11 Thread Xiao Guangrong
If the page fault is caused by mmio, we can cache the mmio info; later we do
not need to walk the guest page table and can quickly tell it is a mmio fault
while we emulate the mmio instruction
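
A sketch of what the new caching helpers in x86.h presumably look like (the
field names come from the kvm_host.h hunk below; the bodies are my
reconstruction, not copied from the patch):

	static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
						gva_t gva, gfn_t gfn, unsigned access)
	{
		/* remember the page-aligned gva, its gfn and the access bits */
		vcpu->arch.mmio_gva = gva & PAGE_MASK;
		vcpu->arch.access = access;
		vcpu->arch.mmio_gfn = gfn;
	}

	/* Clear the cached info for one gva, or for everything if gva == ~0ul. */
	static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
	{
		if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
			return;

		vcpu->arch.mmio_gva = 0;
	}

	static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
	{
		return vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK);
	}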

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/include/asm/kvm_host.h |5 +
 arch/x86/kvm/mmu.c  |   21 +++--
 arch/x86/kvm/mmu.h  |   23 +++
 arch/x86/kvm/paging_tmpl.h  |   21 ++---
 arch/x86/kvm/x86.c  |   11 +++
 arch/x86/kvm/x86.h  |   36 
 6 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da6bbee..7b0834a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -415,6 +415,11 @@ struct kvm_vcpu_arch {
u64 mcg_ctl;
u64 *mce_banks;
 
+   /* Cache MMIO info */
+   u64 mmio_gva;
+   unsigned access;
+   gfn_t mmio_gfn;
+
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 02c839f..d1986b7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,11 +217,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 
accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
-   return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
 static int is_cpuid_PSE36(void)
 {
return 1;
@@ -243,11 +238,6 @@ static int is_large_pte(u64 pte)
return pte  PT_PAGE_SIZE_MASK;
 }
 
-static int is_writable_pte(unsigned long pte)
-{
-   return pte  PT_WRITABLE_MASK;
-}
-
 static int is_dirty_gpte(unsigned long pte)
 {
return pte  PT_DIRTY_MASK;
@@ -2247,15 +2237,17 @@ static void kvm_send_hwpoison_signal(unsigned long 
address, struct task_struct *
send_sig_info(SIGBUS, info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
+  unsigned access, gfn_t gfn, pfn_t pfn)
 {
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
-   kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+   kvm_send_hwpoison_signal(gfn_to_hva(vcpu-kvm, gfn), current);
return 0;
} else if (is_fault_pfn(pfn))
return -EFAULT;
 
+   vcpu_cache_mmio_info(vcpu, gva, gfn, access);
return 1;
 }
 
@@ -2337,7 +2329,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, 
int write, gfn_t gfn,
 
/* mmio */
if (is_error_pfn(pfn))
-   return kvm_handle_bad_page(vcpu-kvm, gfn, pfn);
+   return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
 
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2564,6 +2556,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu-arch.mmu.root_hpa))
return;
 
+   vcpu_clear_mmio_info(vcpu, ~0ul);
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu-arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu-arch.mmu.root_hpa;
@@ -2710,7 +2703,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t 
gpa, u32 error_code,
 
/* mmio */
if (is_error_pfn(pfn))
-   return kvm_handle_bad_page(vcpu-kvm, gfn, pfn);
+   return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..05310b1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,4 +76,27 @@ static inline int is_present_gpte(unsigned long pte)
return pte  PT_PRESENT_MASK;
 }
 
+static inline int is_writable_pte(unsigned long pte)
+{
+   return pte  PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+   return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+  bool write_fault, bool user_fault,
+  unsigned long pte)
+{
+   if (unlikely(write_fault  !is_writable_pte(pte)
+  (user_fault || is_write_protection(vcpu
+   return false;
+
+   if (unlikely(user_fault  !(pte  PT_USER_MASK)))
+   return false;
+
+   return true;
+}
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1e1c244..f0fb1a4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -208,11 +208,8 @@ retry_walk:
goto error;
}
 
-   if (unlikely(write_fault  

[PATCH v4 05/18] KVM: MMU: optimize to handle dirty bit

2011-07-11 Thread Xiao Guangrong
If the dirty bit is not set, we can make the pte access read-only to avoid
handling the dirty bit everywhere

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   13 +--
 arch/x86/kvm/paging_tmpl.h |   46 ++-
 2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d1986b7..98812c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1923,7 +1923,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, 
gfn_t gfn,
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
-   int write_fault, int dirty, int level,
+   int write_fault, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
 {
@@ -1938,8 +1938,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
-   if (!dirty)
-   pte_access = ~ACC_WRITE_MASK;
+
if (pte_access  ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2023,7 +2022,7 @@ done:
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 unsigned pt_access, unsigned pte_access,
-int user_fault, int write_fault, int dirty,
+int user_fault, int write_fault,
 int *ptwrite, int level, gfn_t gfn,
 pfn_t pfn, bool speculative,
 bool host_writable)
@@ -2059,7 +2058,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
}
 
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true,
+ level, gfn, pfn, speculative, true,
  host_writable)) {
if (write_fault)
*ptwrite = 1;
@@ -2129,7 +2128,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 
for (i = 0; i  ret; i++, gfn++, start++)
mmu_set_spte(vcpu, start, ACC_ALL,
-access, 0, 0, 1, NULL,
+access, 0, 0, NULL,
 sp-role.level, gfn,
 page_to_pfn(pages[i]), true, true);
 
@@ -2193,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
unsigned pte_access = ACC_ALL;
 
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-0, write, 1, pt_write,
+0, write, pt_write,
 level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu-stat.pf_fixed;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f0fb1a4..c9fe97b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+  bool last)
 {
unsigned access;
 
access = (gpte  (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+   if (last  !is_dirty_gpte(gpte))
+   access = ~ACC_WRITE_MASK;
+
 #if PTTYPE == 64
if (vcpu-arch.mmu.nx)
access = ~(gpte  PT64_NX_SHIFT);
@@ -232,8 +236,6 @@ retry_walk:
pte |= PT_ACCESSED_MASK;
}
 
-   pte_access = pt_access  FNAME(gpte_access)(vcpu, pte);
-
walker-ptes[walker-level - 1] = pte;
 
if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
@@ -268,7 +270,7 @@ retry_walk:
break;
}
 
-   pt_access = pte_access;
+   pt_access = FNAME(gpte_access)(vcpu, pte, false);
--walker-level;
}
 
@@ -293,6 +295,7 @@ retry_walk:
walker-ptes[walker-level - 1] = pte;
}
 
+   pte_access = pt_access  FNAME(gpte_access)(vcpu, pte, true);
walker-pt_access = pt_access;
walker-pte_access = pte_access;
pgprintk(%s: pte %llx pte_access %x pt_access %x\n,
@@ -367,7 +370,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
return;
 
pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte);
-   pte_access = sp-role.access  FNAME(gpte_access)(vcpu, gpte);
+   pte_access = sp-role.access  FNAME(gpte_access)(vcpu, gpte, true);
   

[PATCH v4 06/18] KVM: MMU: cleanup for FNAME(fetch)

2011-07-11 Thread Xiao Guangrong
gw->pte_access is the final access permission, since it is unified with
gw->pt_access when we walked the guest page table:

FNAME(walk_addr_generic):
pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/paging_tmpl.h |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c9fe97b..5c2aa40 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -479,7 +479,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_present_gpte(gw-ptes[gw-level - 1]))
return NULL;
 
-   direct_access = gw-pt_access  gw-pte_access;
+   direct_access = gw-pte_access;
 
top_level = vcpu-arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -537,7 +537,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(it.sptep, sp);
}
 
-   mmu_set_spte(vcpu, it.sptep, access, gw-pte_access  access,
+   mmu_set_spte(vcpu, it.sptep, access, gw-pte_access,
 user_fault, write_fault, ptwrite, it.level,
 gw-gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
-- 
1.7.5.4



[PATCH v4 07/18] KVM: MMU: rename 'pt_write' to 'emulate'

2011-07-11 Thread Xiao Guangrong
If 'pt_write' is true, we need to emulate the fault. In a later patch we will
need to emulate the fault even when it is not a pt_write event, so rename it
to better fit its meaning

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   10 +-
 arch/x86/kvm/paging_tmpl.h |   16 
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 98812c2..a62ba46 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2023,7 +2023,7 @@ done:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 unsigned pt_access, unsigned pte_access,
 int user_fault, int write_fault,
-int *ptwrite, int level, gfn_t gfn,
+int *emulate, int level, gfn_t gfn,
 pfn_t pfn, bool speculative,
 bool host_writable)
 {
@@ -2061,7 +2061,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
  level, gfn, pfn, speculative, true,
  host_writable)) {
if (write_fault)
-   *ptwrite = 1;
+   *emulate = 1;
kvm_mmu_flush_tlb(vcpu);
}
 
@@ -2184,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
 {
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
-   int pt_write = 0;
+   int emulate = 0;
gfn_t pseudo_gfn;
 
for_each_shadow_entry(vcpu, (u64)gfn  PAGE_SHIFT, iterator) {
@@ -2192,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
unsigned pte_access = ACC_ALL;
 
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-0, write, pt_write,
+0, write, emulate,
 level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu-stat.pf_fixed;
@@ -2220,7 +2220,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
   | shadow_accessed_mask);
}
}
-   return pt_write;
+   return emulate;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct 
*tsk)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5c2aa40..fa3b54b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -467,7 +467,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, 
struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 struct guest_walker *gw,
 int user_fault, int write_fault, int hlevel,
-int *ptwrite, pfn_t pfn, bool map_writable,
+int *emulate, pfn_t pfn, bool map_writable,
 bool prefault)
 {
unsigned access = gw-pt_access;
@@ -538,7 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
 
mmu_set_spte(vcpu, it.sptep, access, gw-pte_access,
-user_fault, write_fault, ptwrite, it.level,
+user_fault, write_fault, emulate, it.level,
 gw-gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
@@ -572,7 +572,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t 
addr, u32 error_code,
int user_fault = error_code  PFERR_USER_MASK;
struct guest_walker walker;
u64 *sptep;
-   int write_pt = 0;
+   int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -633,19 +633,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t 
addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, walker.gfn, pfn, level);
sptep = FNAME(fetch)(vcpu, addr, walker, user_fault, write_fault,
-level, write_pt, pfn, map_writable, prefault);
+level, emulate, pfn, map_writable, prefault);
(void)sptep;
-   pgprintk(%s: shadow pte %p %llx ptwrite %d\n, __func__,
-sptep, *sptep, write_pt);
+   pgprintk(%s: shadow pte %p %llx emulate %d\n, __func__,
+sptep, *sptep, emulate);
 
-   if (!write_pt)
+   if (!emulate)
vcpu-arch.last_pt_write_count = 0; /* reset fork detector */
 
++vcpu-stat.pf_fixed;
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(vcpu-kvm-mmu_lock);
 
-   return write_pt;
+   return emulate;
 
 out_unlock:
spin_unlock(vcpu-kvm-mmu_lock);
-- 
1.7.5.4


[PATCH v4 08/18] KVM: MMU: count used shadow pages on prepareing path

2011-07-11 Thread Xiao Guangrong
Move counting of used shadow pages from the committing path to the preparing
path to reduce TLB flushes on some paths

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   10 +-
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a62ba46..91d3069 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,7 +1039,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
percpu_counter_add(kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp-spt));
hlist_del(sp-hash_link);
@@ -1048,7 +1048,6 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct 
kvm_mmu_page *sp)
if (!sp-role.direct)
free_page((unsigned long)sp-gfns);
kmem_cache_free(mmu_page_header_cache, sp);
-   kvm_mod_used_mmu_pages(kvm, -1);
 }
 
 static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1655,6 +1654,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
/* Count self */
ret++;
list_move(sp-link, invalid_list);
+   kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(sp-link, kvm-arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
@@ -1678,7 +1678,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp-role.invalid || sp-root_count);
-   kvm_mmu_free_page(kvm, sp);
+   kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
 
 }
@@ -1704,8 +1704,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned 
int goal_nr_mmu_pages)
page = container_of(kvm-arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
-   kvm_mmu_commit_zap_page(kvm, invalid_list);
}
+   kvm_mmu_commit_zap_page(kvm, invalid_list);
goal_nr_mmu_pages = kvm-arch.n_used_mmu_pages;
}
 
@@ -3302,9 +3302,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
sp = container_of(vcpu-kvm-arch.active_mmu_pages.prev,
  struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(vcpu-kvm, sp, invalid_list);
-   kvm_mmu_commit_zap_page(vcpu-kvm, invalid_list);
++vcpu-kvm-stat.mmu_recycled;
}
+   kvm_mmu_commit_zap_page(vcpu-kvm, invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
-- 
1.7.5.4



[PATCH v4 09/18] KVM: MMU: split kvm_mmu_free_page

2011-07-11 Thread Xiao Guangrong
Split kvm_mmu_free_page into kvm_mmu_isolate_page and
kvm_mmu_free_page

One is used to remove the page from the cache under the mmu lock and the other
is used to free the page table outside of the mmu lock

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   21 ++---
 1 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 91d3069..2f8543c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,14 +1039,28 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
percpu_counter_add(kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from shadow page cache, after call it,
+ * we can not find this sp from the cache, and the shadow
+ * page table is still valid.
+ * It should be under the protection of mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp-spt));
hlist_del(sp-hash_link);
-   list_del(sp-link);
-   free_page((unsigned long)sp-spt);
if (!sp-role.direct)
free_page((unsigned long)sp-gfns);
+}
+
+/*
+ * Free the shadow page table and the sp, we can do it
+ * out of the protection of mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+   list_del(sp-link);
+   free_page((unsigned long)sp-spt);
kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -1678,6 +1692,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp-role.invalid || sp-root_count);
+   kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
 
-- 
1.7.5.4



[PATCH v4 10/18] KVM: MMU: remove bypass_guest_pf

2011-07-11 Thread Xiao Guangrong
The idea is from Avi:
| Maybe it's time to kill off bypass_guest_pf=1.  It's not as effective as
| it used to be, since unsync pages always use shadow_trap_nonpresent_pte,
| and since we convert between the two nonpresent_ptes during sync and unsync.

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 Documentation/kernel-parameters.txt |4 --
 arch/x86/include/asm/kvm_host.h |3 -
 arch/x86/kvm/mmu.c  |   83 ++-
 arch/x86/kvm/mmu_audit.c|   12 -
 arch/x86/kvm/paging_tmpl.h  |   51 +++--
 arch/x86/kvm/vmx.c  |   11 +
 arch/x86/kvm/x86.c  |1 -
 7 files changed, 33 insertions(+), 132 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index fd248a31..a06e4f1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
for all guests.
Default is 1 (enabled) if in 64bit or 32bit-PAE mode
 
-   kvm-intel.bypass_guest_pf=
-   [KVM,Intel] Disables bypassing of guest page faults
-   on Intel chips. Default is 1 (enabled)
-
kvm-intel.ept=  [KVM,Intel] Disable extended page tables
(virtualized MMU) support on capable Intel chips.
Default is 1 (enabled)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7b0834a..42e577d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,8 +266,6 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
-   void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
 struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -638,7 +636,6 @@ void kvm_mmu_module_exit(void);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2f8543c..5334b4e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -186,8 +186,6 @@ static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;/* mutual exclusive with 
nx_mask */
 static u64 __read_mostly shadow_user_mask;
@@ -199,13 +197,6 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL  (e - s + 1)) - 1)  s;
 }
 
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
-   shadow_trap_nonpresent_pte = trap_pte;
-   shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
@@ -229,8 +220,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-   return pte != shadow_trap_nonpresent_pte
-pte != shadow_notrap_nonpresent_pte;
+   return pte  PT_PRESENT_MASK;
 }
 
 static int is_large_pte(u64 pte)
@@ -777,9 +767,9 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte)
return 1;
 }
 
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-   if (set_spte_track_bits(sptep, new_spte))
+   if (set_spte_track_bits(sptep, 0ull))
rmap_remove(kvm, sptep);
 }
 
@@ -814,8 +804,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte  (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != 
(PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk(rmap_write_protect(large): spte %p %llx 
%lld\n, spte, *spte, gfn);
if (is_writable_pte(*spte)) {
-   drop_spte(kvm, spte,
- shadow_trap_nonpresent_pte);
+   drop_spte(kvm, spte);
--kvm-stat.lpages;
spte = NULL;
write_protected = 1;
@@ -836,7 +825,7 @@ 

[PATCH v4 11/18] KVM: MMU: filter out the mmio pfn from the fault pfn

2011-07-11 Thread Xiao Guangrong
If the page fault is caused by mmio, the gfn cannot be found in the memslots,
and 'bad_pfn' is returned on the gfn_to_hva path, so we can use 'bad_pfn' to
identify a mmio page fault.
And, to clarify the meaning of a mmio pfn, we return the fault page instead of
the bad page when the gfn is not allowed to be prefetched
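
In short, the distinction the series relies on is roughly the following (a
condensed sketch; the real check lands as handle_abnormal_pfn() in patch 12
below):

	if (is_invalid_pfn(pfn))	/* hwpoison or fault page: report the error */
		r = kvm_handle_bad_page(vcpu, gfn, pfn);
	else if (is_noslot_pfn(pfn))	/* no memslot backs the gfn: treat it as mmio */
		vcpu_cache_mmio_info(vcpu, gva, gfn, access);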

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c   |4 ++--
 include/linux/kvm_host.h |5 +
 virt/kvm/kvm_main.c  |   16 ++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5334b4e..96a7ed4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2085,8 +2085,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
 
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) {
-   get_page(bad_page);
-   return page_to_pfn(bad_page);
+   get_page(fault_page);
+   return page_to_pfn(fault_page);
}
 
hva = gfn_to_hva_memslot(slot, gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59..f61f2d7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -326,12 +326,17 @@ static inline struct kvm_memslots *kvm_memslots(struct 
kvm *kvm)
 static inline int is_error_hpa(hpa_t hpa) { return hpa  HPA_MSB; }
 
 extern struct page *bad_page;
+extern struct page *fault_page;
+
 extern pfn_t bad_pfn;
+extern pfn_t fault_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
 int is_fault_pfn(pfn_t pfn);
+int is_noslot_pfn(pfn_t pfn);
+int is_invalid_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
  struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11d2783..59406cf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,8 +101,8 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
-static struct page *fault_page;
-static pfn_t fault_pfn;
+struct page *fault_page;
+pfn_t fault_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
@@ -931,6 +931,18 @@ int is_fault_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_fault_pfn);
 
+int is_noslot_pfn(pfn_t pfn)
+{
+   return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_noslot_pfn);
+
+int is_invalid_pfn(pfn_t pfn)
+{
+   return pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_invalid_pfn);
+
 static inline unsigned long bad_hva(void)
 {
return PAGE_OFFSET;
-- 
1.7.5.4



[PATCH v4 12/18] KVM: MMU: abstract some functions to handle fault pfn

2011-07-11 Thread Xiao Guangrong
Introduce handle_abnormal_pfn to handle a fault pfn on the page fault path,
and mmu_invalid_pfn to handle a fault pfn on the prefetch path

This is preparatory work for mmio page fault support

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   47 ---
 arch/x86/kvm/paging_tmpl.h |   12 +-
 2 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 96a7ed4..1d4a2d9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2221,18 +2221,15 @@ static void kvm_send_hwpoison_signal(unsigned long 
address, struct task_struct *
send_sig_info(SIGBUS, info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
-  unsigned access, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 {
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu-kvm, gfn), current);
return 0;
-   } else if (is_fault_pfn(pfn))
-   return -EFAULT;
+   }
 
-   vcpu_cache_mmio_info(vcpu, gva, gfn, access);
-   return 1;
+   return -EFAULT;
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2277,6 +2274,33 @@ static void transparent_hugepage_adjust(struct kvm_vcpu 
*vcpu,
}
 }
 
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+   return unlikely(is_invalid_pfn(pfn) || is_noslot_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+   pfn_t pfn, unsigned access, int *ret_val)
+{
+   bool ret = true;
+
+   /* The pfn is invalid, report the error! */
+   if (unlikely(is_invalid_pfn(pfn))) {
+   *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+   goto exit;
+   }
+
+   if (unlikely(is_noslot_pfn(pfn))) {
+   vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+   *ret_val = 1;
+   goto exit;
+   }
+
+   ret = false;
+exit:
+   return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2311,9 +2335,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, 
int write, gfn_t gfn,
if (try_async_pf(vcpu, prefault, gfn, v, pfn, write, map_writable))
return 0;
 
-   /* mmio */
-   if (is_error_pfn(pfn))
-   return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
+   if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, r))
+   return r;
 
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2685,9 +2708,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t 
gpa, u32 error_code,
if (try_async_pf(vcpu, prefault, gfn, gpa, pfn, write, map_writable))
return 0;
 
-   /* mmio */
-   if (is_error_pfn(pfn))
-   return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
+   if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, r))
+   return r;
+
spin_lock(vcpu-kvm-mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a4565df..67998d3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -367,7 +367,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte);
pte_access = sp-role.access  FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte));
-   if (is_error_pfn(pfn)) {
+   if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return;
}
@@ -445,7 +445,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, 
struct guest_walker *gw,
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
  pte_access  ACC_WRITE_MASK);
-   if (is_error_pfn(pfn)) {
+   if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}
@@ -615,10 +615,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t 
addr, u32 error_code,
 map_writable))
return 0;
 
-   /* mmio */
-   if (is_error_pfn(pfn))
-   return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
- addr, walker.pte_access, walker.gfn, pfn);
+   if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+   walker.gfn, pfn, walker.pte_access, r))
+   return r;
+

[PATCH v4 13/18] KVM: MMU: introduce the rules to modify shadow page table

2011-07-11 Thread Xiao Guangrong
Introduce some interfaces to modify sptes as the linux kernel does:
- mmu_spte_clear_track_bits: sets the spte from present to nonpresent, and
  tracks the stat bits (accessed/dirty) of the spte
- mmu_spte_clear_no_track: the same as mmu_spte_clear_track_bits, except that
  it does not track the stat bits
- mmu_spte_set: sets the spte from nonpresent to present
- mmu_spte_update: only updates the stat bits

Now it is no longer allowed to set an spte from present to present; later we
can drop the atomic operation on X86_32 hosts, and this is preparatory work
for reading sptes on X86_32 hosts outside of the mmu lock
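
Roughly, the allowed spte transitions map onto these helpers as follows (a
summary sketch of the rules above, not extra code from the patch):

	/* nonpresent -> present: installing a brand new spte */
	mmu_spte_set(sptep, new_spte);

	/* present -> present, same pfn: e.g. dropping the writable bit */
	mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);

	/* present -> nonpresent, last level: accessed/dirty bits must be propagated */
	mmu_spte_clear_track_bits(sptep);

	/* present -> nonpresent, upper level: no pfn state to propagate */
	mmu_spte_clear_no_track(sptep);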

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |  103 +++-
 1 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1d4a2d9..982718f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -299,12 +299,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 
new_spte, u64 bit_mask)
return (old_spte  bit_mask)  !(new_spte  bit_mask);
 }
 
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+   WARN_ON(is_shadow_present_pte(*sptep));
+   __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changged.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
 {
u64 mask, old_spte = *sptep;
 
WARN_ON(!is_rmap_spte(new_spte));
 
+   if (!is_shadow_present_pte(old_spte))
+   return mmu_spte_set(sptep, new_spte);
+
new_spte |= old_spte  shadow_dirty_mask;
 
mask = shadow_accessed_mask;
@@ -325,6 +343,42 @@ static void update_spte(u64 *sptep, u64 new_spte)
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+   pfn_t pfn;
+   u64 old_spte = *sptep;
+
+   if (!spte_has_volatile_bits(old_spte))
+   __set_spte(sptep, 0ull);
+   else
+   old_spte = __xchg_spte(sptep, 0ull);
+
+   if (!is_rmap_spte(old_spte))
+   return 0;
+
+   pfn = spte_to_pfn(old_spte);
+   if (!shadow_accessed_mask || old_spte  shadow_accessed_mask)
+   kvm_set_pfn_accessed(pfn);
+   if (!shadow_dirty_mask || (old_spte  shadow_dirty_mask))
+   kvm_set_pfn_dirty(pfn);
+   return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+   __set_spte(sptep, 0ull);
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
  struct kmem_cache *base_cache, int min)
 {
@@ -746,30 +800,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pte_list_remove(spte, rmapp);
 }
 
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
-{
-   pfn_t pfn;
-   u64 old_spte = *sptep;
-
-   if (!spte_has_volatile_bits(old_spte))
-   __set_spte(sptep, new_spte);
-   else
-   old_spte = __xchg_spte(sptep, new_spte);
-
-   if (!is_rmap_spte(old_spte))
-   return 0;
-
-   pfn = spte_to_pfn(old_spte);
-   if (!shadow_accessed_mask || old_spte  shadow_accessed_mask)
-   kvm_set_pfn_accessed(pfn);
-   if (!shadow_dirty_mask || (old_spte  shadow_dirty_mask))
-   kvm_set_pfn_dirty(pfn);
-   return 1;
-}
-
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
-   if (set_spte_track_bits(sptep, 0ull))
+   if (mmu_spte_clear_track_bits(sptep))
rmap_remove(kvm, sptep);
 }
 
@@ -787,7 +820,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte  PT_PRESENT_MASK));
rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte);
if (is_writable_pte(*spte)) {
-   update_spte(spte, *spte  ~PT_WRITABLE_MASK);
+   mmu_spte_update(spte, *spte  ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -856,7 +889,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long 
*rmapp,
new_spte = ~PT_WRITABLE_MASK;
new_spte = ~SPTE_HOST_WRITEABLE;
new_spte = ~shadow_accessed_mask;
-   set_spte_track_bits(spte, 

[PATCH v4 14/18] KVM: MMU: do not need atomicly to set/clear spte

2011-07-11 Thread Xiao Guangrong
Now the spte only goes from nonpresent to present or from present to
nonpresent, so we can use some tricks to set/clear the spte non-atomically,
as the linux kernel does

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |   86 +++-
 1 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 982718f..a22b5fe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,26 +259,82 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte  PT32_DIR_PSE36_MASK)  shift;
 }
 
+#ifdef CONFIG_X86_64
 static void __set_spte(u64 *sptep, u64 spte)
 {
-   set_64bit(sptep, spte);
+   *sptep = spte;
 }
 
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 {
-#ifdef CONFIG_X86_64
-   return xchg(sptep, new_spte);
+   *sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+   return xchg(sptep, spte);
+}
 #else
-   u64 old_spte;
+union split_spte {
+   struct {
+   u32 spte_low;
+   u32 spte_high;
+   };
+   u64 spte;
+};
 
-   do {
-   old_spte = *sptep;
-   } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void __set_spte(u64 *sptep, u64 spte)
+{
+   union split_spte *ssptep, sspte;
 
-   return old_spte;
-#endif
+   ssptep = (union split_spte *)sptep;
+   sspte = (union split_spte)spte;
+
+   ssptep-spte_high = sspte.spte_high;
+
+   /*
+* If we map the spte from nonpresent to present, We should store
+* the high bits firstly, then set present bit, so cpu can not
+* fetch this spte while we are setting the spte.
+*/
+   smp_wmb();
+
+   ssptep-spte_low = sspte.spte_low;
 }
 
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+   union split_spte *ssptep, sspte;
+
+   ssptep = (union split_spte *)sptep;
+   sspte = (union split_spte)spte;
+
+   ssptep-spte_low = sspte.spte_low;
+
+   /*
+* If we map the spte from present to nonpresent, we should clear
+* present bit firstly to avoid vcpu fetch the old high bits.
+*/
+   smp_wmb();
+
+   ssptep-spte_high = sspte.spte_high;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+   union split_spte *ssptep, sspte, orig;
+
+   ssptep = (union split_spte *)sptep;
+   sspte = (union split_spte)spte;
+
+   /* xchg acts as a barrier before the setting of the high bits */
+   orig.spte_low = xchg(ssptep-spte_low, sspte.spte_low);
+   orig.spte_high = ssptep-spte_high = sspte.spte_high;
+
+   return orig.spte;
+}
+#endif
+
 static bool spte_has_volatile_bits(u64 spte)
 {
if (!shadow_accessed_mask)
@@ -330,9 +386,9 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte)
mask |= shadow_dirty_mask;
 
if (!spte_has_volatile_bits(old_spte) || (new_spte  mask) == mask)
-   __set_spte(sptep, new_spte);
+   __update_clear_spte_fast(sptep, new_spte);
else
-   old_spte = __xchg_spte(sptep, new_spte);
+   old_spte = __update_clear_spte_slow(sptep, new_spte);
 
if (!shadow_accessed_mask)
return;
@@ -354,9 +410,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
u64 old_spte = *sptep;
 
if (!spte_has_volatile_bits(old_spte))
-   __set_spte(sptep, 0ull);
+   __update_clear_spte_fast(sptep, 0ull);
else
-   old_spte = __xchg_spte(sptep, 0ull);
+   old_spte = __update_clear_spte_slow(sptep, 0ull);
 
if (!is_rmap_spte(old_spte))
return 0;
@@ -376,7 +432,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-   __set_spte(sptep, 0ull);
+   __update_clear_spte_fast(sptep, 0ull);
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-- 
1.7.5.4



[PATCH v4 16/18] KVM: MMU: reorganize struct kvm_shadow_walk_iterator

2011-07-11 Thread Xiao Guangrong
Reorganize it to make better use of the cache

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 374530a..4b1aa67 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,8 +172,8 @@ struct pte_list_desc {
 struct kvm_shadow_walk_iterator {
u64 addr;
hpa_t shadow_addr;
-   int level;
u64 *sptep;
+   int level;
unsigned index;
 };
 
-- 
1.7.5.4



[PATCH v4 15/18] KVM: MMU: lockless walking shadow page table

2011-07-11 Thread Xiao Guangrong
Use rcu to protect shadow page tables that are to be freed, so we can walk
them safely; this should run fast and is needed by the mmio page fault support
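
The commit side then chooses between immediate and deferred freeing roughly
like this (a reconstruction; the call_rcu() branch is visible in the patch 18
hunk further down, and kvm_mmu_isolate_pages() stands for batching
kvm_mmu_isolate_page() over the list):

	/* in kvm_mmu_commit_zap_page() */
	if (atomic_read(&kvm->arch.reader_counter)) {
		/*
		 * Lockless walkers may still dereference these pages,
		 * so only free them after an RCU grace period.
		 */
		kvm_mmu_isolate_pages(invalid_list);
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
		list_del_init(invalid_list);
		call_rcu(&sp->rcu, free_pages_rcu);
		return;
	}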

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/include/asm/kvm_host.h |8 +++
 arch/x86/kvm/mmu.c  |  132 ---
 2 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42e577d..87a868e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -233,6 +233,12 @@ struct kvm_mmu_page {
unsigned int unsync_children;
unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+   int clear_spte_count;
+#endif
+
+   struct rcu_head rcu;
 };
 
 struct kvm_pv_mmu_op_buffer {
@@ -477,6 +483,8 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;
 
+   atomic_t reader_counter;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a22b5fe..374530a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
 shadow_walk_okay((_walker));  \
 shadow_walk_next((_walker)))
 
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)\
+   for (shadow_walk_init((_walker), _vcpu, _addr);\
+shadow_walk_okay((_walker)) \
+   ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
+__shadow_walk_next((_walker), spte))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 {
return xchg(sptep, spte);
 }
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+   return ACCESS_ONCE(*sptep);
+}
 #else
 union split_spte {
struct {
@@ -283,6 +294,18 @@ union split_spte {
u64 spte;
 };
 
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+   struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+
+   if (is_shadow_present_pte(spte))
+   return;
+
+   /* Ensure the spte is completely set before we increase the count */
+   smp_wmb();
+   sp-clear_spte_count++;
+}
+
 static void __set_spte(u64 *sptep, u64 spte)
 {
union split_spte *ssptep, sspte;
@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
smp_wmb();
 
ssptep-spte_high = sspte.spte_high;
+   count_spte_clear(sptep, spte);
 }
 
 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/* xchg acts as a barrier before the setting of the high bits */
orig.spte_low = xchg(ssptep-spte_low, sspte.spte_low);
orig.spte_high = ssptep-spte_high = sspte.spte_high;
+   count_spte_clear(sptep, spte);
 
return orig.spte;
 }
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increase clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+   struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+   union split_spte spte, *orig = (union split_spte *)sptep;
+   int count;
+
+retry:
+   count = sp-clear_spte_count;
+   smp_rmb();
+
+   spte.spte_low = orig-spte_low;
+   smp_rmb();
+
+   spte.spte_high = orig-spte_high;
+   smp_rmb();
+
+   if (unlikely(spte.spte_low != orig-spte_low ||
+ count != sp-clear_spte_count))
+   goto retry;
+
+   return spte.spte;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
__update_clear_spte_fast(sptep, 0ull);
 }
 
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+   return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+   rcu_read_lock();
+   atomic_inc(vcpu-kvm-arch.reader_counter);
+
+   /* Increase the counter before walking shadow page table */
+   smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+   /* Decrease the counter after walking shadow page table finished */
+   smp_mb__before_atomic_dec();
+   atomic_dec(vcpu-kvm-arch.reader_counter);
+   rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
  struct kmem_cache *base_cache, int min)
 {
@@ -1597,17 +1674,23 @@ 

[PATCH v4 17/18] KVM: MMU: mmio page fault support

2011-07-11 Thread Xiao Guangrong
The idea is from Avi:

| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)

When the page fault is caused by mmio, we cache the info in the shadow page
table and also set the reserved bits in the shadow page table, so if the mmio
access happens again, we can quickly identify it and emulate it directly

Searching for a mmio gfn in the memslots is heavy since we need to walk all
memslots; this feature reduces that cost, and also avoids walking the guest
page table for the soft mmu.
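
Putting the pieces together, the fast path presumably reads along these lines
(a sketch; walk_shadow_page_get_mmio_spte() stands for the lockless walk built
on patch 15, the other helpers appear in the hunks below):

	u64 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);

	if (is_mmio_spte(spte)) {
		gfn_t gfn = get_mmio_spte_gfn(spte);
		unsigned access = get_mmio_spte_access(spte);

		if (direct)
			addr = 0;
		/* remember gva/gfn/access so emulation can skip the gva->gpa walk */
		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
		return 1;	/* caller emulates the mmio access directly */
	}

	return 0;	/* not a cached mmio spte: fall back to the slow path */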

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c |  192 ++--
 arch/x86/kvm/mmu.h |2 +
 arch/x86/kvm/paging_tmpl.h |   21 --
 arch/x86/kvm/vmx.c |   22 +-
 arch/x86/kvm/x86.c |   25 ++
 virt/kvm/kvm_main.c|7 ++
 6 files changed, 255 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4b1aa67..6748f1b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,6 +197,47 @@ static u64 __read_mostly shadow_x_mask;/* mutual 
exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+{
+   shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+   access = ACC_WRITE_MASK | ACC_USER_MASK;
+
+   mmu_spte_set(sptep, shadow_mmio_mask | access | gfn  PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+   return (spte  shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+   return (spte  ~shadow_mmio_mask)  PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+   return (spte  ~shadow_mmio_mask)  ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+   if (unlikely(is_noslot_pfn(pfn))) {
+   mark_mmio_spte(sptep, gfn, access);
+   return true;
+   }
+
+   return false;
+}
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -226,7 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-   return pte  PT_PRESENT_MASK;
+   return pte  PT_PRESENT_MASK  !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -285,6 +326,12 @@ static u64 __get_spte_lockless(u64 *sptep)
 {
return ACCESS_ONCE(*sptep);
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+   /* It is valid if the spte is zapped. */
+   return spte == 0ull;
+}
 #else
 union split_spte {
struct {
@@ -388,6 +435,23 @@ retry:
 
return spte.spte;
 }
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+   union split_spte sspte = (union split_spte)spte;
+   u32 high_mmio_mask = shadow_mmio_mask  32;
+
+   /* It is valid if the spte is zapped. */
+   if (spte == 0ull)
+   return true;
+
+   /* It is valid if the spte is being zapped. */
+   if (sspte.spte_low == 0ull 
+ sspte.spte_high  high_mmio_mask == high_mmio_mask)
+   return true;
+
+   return false;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -1745,7 +1809,8 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct 
kvm_mmu_page *sp,
child = page_header(pte  PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
-   }
+   } else if (is_mmio_spte(pte))
+   mmu_spte_clear_no_track(spte);
 
if (is_large_pte(pte))
--kvm-stat.lpages;
@@ -2120,6 +2185,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte, entry = *sptep;
int ret = 0;
 
+   if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+   return 0;
+
/*
 * We don't set the accessed bit, since we sometimes want to see
 * whether the guest actually used the pte (in order to detect
@@ -2255,6 +2323,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
*sptep,
kvm_mmu_flush_tlb(vcpu);
}
 
+   if (unlikely(is_mmio_spte(*sptep)  emulate))
+   *emulate = 1;
+
pgprintk(%s: setting spte %llx\n, __func__, *sptep);
pgprintk(instantiating %s PTE (%s) at %llx (%llx) addr %p\n,
 is_large_pte(*sptep)? 2MB : 4kB,
@@ -2481,7 +2552,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu 
*vcpu,
 
 static bool mmu_invalid_pfn(pfn_t pfn)
 {
-   return 

[PATCH v4 18/18] KVM: MMU: trace mmio page fault

2011-07-11 Thread Xiao Guangrong
Add tracepoints to trace mmio page fault

Signed-off-by: Xiao Guangrong xiaoguangr...@cn.fujitsu.com
---
 arch/x86/kvm/mmu.c  |5 
 arch/x86/kvm/mmutrace.h |   48 +++
 arch/x86/kvm/trace.h|   23 ++
 arch/x86/kvm/x86.c  |5 +++-
 4 files changed, 80 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6748f1b..d3d188e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -211,6 +211,7 @@ static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned 
access)
 {
access = ACC_WRITE_MASK | ACC_USER_MASK;
 
+   trace_mark_mmio_spte(sptep, gfn, access);
mmu_spte_set(sptep, shadow_mmio_mask | access | gfn  PAGE_SHIFT);
 }
 
@@ -1940,6 +1941,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_mmu_isolate_pages(invalid_list);
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
list_del_init(invalid_list);
+
+   trace_kvm_mmu_delay_free_pages(sp);
call_rcu(sp-rcu, free_pages_rcu);
return;
}
@@ -2938,6 +2941,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, 
u64 addr, bool direct)
 
if (direct)
addr = 0;
+
+   trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
return 1;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fd..eed67f3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
 );
 
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+   TP_PROTO(struct kvm_mmu_page *sp),
+
+   TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+   mark_mmio_spte,
+   TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+   TP_ARGS(sptep, gfn, access),
+
+   TP_STRUCT__entry(
+   __field(void *, sptep)
+   __field(gfn_t, gfn)
+   __field(unsigned, access)
+   ),
+
+   TP_fast_assign(
+   __entry-sptep = sptep;
+   __entry-gfn = gfn;
+   __entry-access = access;
+   ),
+
+   TP_printk(sptep:%p gfn %llx access %x, __entry-sptep, __entry-gfn,
+ __entry-access)
+);
+
+TRACE_EVENT(
+   handle_mmio_page_fault,
+   TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+   TP_ARGS(addr, gfn, access),
+
+   TP_STRUCT__entry(
+   __field(u64, addr)
+   __field(gfn_t, gfn)
+   __field(unsigned, access)
+   ),
+
+   TP_fast_assign(
+   __entry-addr = addr;
+   __entry-gfn = gfn;
+   __entry-access = access;
+   ),
+
+   TP_printk(addr:%llx gfn %llx access %x, __entry-addr, __entry-gfn,
+ __entry-access)
+);
+
 TRACE_EVENT(
kvm_mmu_audit,
TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 624f8cb..3ff898c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
 #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
 #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
 
+TRACE_EVENT(
+   vcpu_match_mmio,
+   TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+   TP_ARGS(gva, gpa, write, gpa_match),
+
+   TP_STRUCT__entry(
+   __field(gva_t, gva)
+   __field(gpa_t, gpa)
+   __field(bool, write)
+   __field(bool, gpa_match)
+   ),
+
+   TP_fast_assign(
+   __entry->gva = gva;
+   __entry->gpa = gpa;
+   __entry->write = write;
+   __entry->gpa_match = gpa_match
+   ),
+
+   TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84bfffb..77921cf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3955,6 +3955,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, 
unsigned long gva,
  vcpu->arch.access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
+   trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
 
@@ -3970,8 +3971,10 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, 
unsigned long gva,
if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
return 1;
 
-   if (vcpu_match_mmio_gpa(vcpu, *gpa))
+   if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+   

[PATCH v6 2/9] KVM-HDR Add constant to represent KVM MSRs enabled bit

2011-07-11 Thread Glauber Costa
This patch is simple; it is put in a different commit so it can be more
easily shared between guest and hypervisor. It just defines a named
constant to indicate the enable bit for KVM-specific MSRs.
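
For illustration, a minimal sketch of how such an enable bit is typically
used on both sides of the MSR interface (grounded in the later patches of
this series; the helper names below are made up):

	/* Guest side (sketch): publish an area and set bit 0 to enable it. */
	static inline u64 kvm_msr_val_for(unsigned long phys_addr)
	{
		return phys_addr | KVM_MSR_ENABLED;	/* bit 0 = enabled */
	}

	/* Host side (sketch): ignore the contents unless the bit is set. */
	static inline bool kvm_msr_is_enabled(u64 msr_val)
	{
		return msr_val & KVM_MSR_ENABLED;
	}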

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Tested-by: Eric B Munson emun...@mgebm.net
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Peter Zijlstra pet...@infradead.org
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 arch/x86/include/asm/kvm_para.h |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf7..d6cd79b 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -30,6 +30,7 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#define KVM_MSR_ENABLED 1
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
-- 
1.7.3.4



[PATCH v6 7/9] KVM-GST: KVM Steal time accounting

2011-07-11 Thread Glauber Costa
This patch accounts steal time in account_process_tick().
If one or more ticks are considered stolen in the current
accounting cycle, user/system accounting is skipped. Idle is fine,
since the hypervisor does not report steal time if the guest
is halted.

Accounting steal time from the core scheduler gives us the
advantage of direct access to the runqueue data. In a later
opportunity, it can be used to tweak cpu power and make
the scheduler aware of the time it lost.
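
In short, the per-tick path boils down to the following sketch (condensed
from steal_account_process_tick() in the hunk below; not a drop-in
replacement):

	/* Sketch: account whole stolen ticks before user/system time. */
	static bool account_stolen_ticks(void)
	{
		u64 steal, st;

		steal = paravirt_steal_clock(smp_processor_id());	/* ns since boot */
		steal -= this_rq()->prev_steal_time;			/* ns since last tick */

		st = steal_ticks(steal);				/* ns -> whole ticks */
		this_rq()->prev_steal_time += st * TICK_NSEC;		/* keep the remainder */

		account_steal_time(st);
		return st;	/* non-zero: caller skips user/system accounting */
	}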

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Acked-by: Peter Zijlstra pet...@infradead.org
Tested-by: Eric B Munson emun...@mgebm.net
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 kernel/sched.c |   41 +
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502..aa6c030 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,7 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
+#include <asm/paravirt.h>
 
#include "sched_cpupri.h"
#include "workqueue_sched.h"
@@ -528,6 +529,9 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+   u64 prev_steal_time;
+#endif
 
/* calc_load related fields */
unsigned long calc_load_update;
@@ -1953,6 +1957,18 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+   if (unlikely(steal > NSEC_PER_SEC))
+   return div_u64(steal, TICK_NSEC);
+
+   return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
s64 irq_delta;
@@ -3845,6 +3861,25 @@ void account_idle_time(cputime_t cputime)
cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+   if (static_branch(&paravirt_steal_enabled)) {
+   u64 steal, st = 0;
+
+   steal = paravirt_steal_clock(smp_processor_id());
+   steal -= this_rq()->prev_steal_time;
+
+   st = steal_ticks(steal);
+   this_rq()->prev_steal_time += st * TICK_NSEC;
+
+   account_steal_time(st);
+   return st;
+   }
+#endif
+   return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3876,6 +3911,9 @@ static void irqtime_account_process_tick(struct 
task_struct *p, int user_tick,
cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+   if (steal_account_process_tick())
+   return;
+
if (irqtime_account_hi_update()) {
cpustat->irq = cputime64_add(cpustat->irq, tmp);
} else if (irqtime_account_si_update()) {
@@ -3929,6 +3967,9 @@ void account_process_tick(struct task_struct *p, int 
user_tick)
return;
}
 
+   if (steal_account_process_tick())
+   return;
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-- 
1.7.3.4



[PATCH v6 4/9] KVM-HV: KVM Steal time implementation

2011-07-11 Thread Glauber Costa
To implement steal time, we need the hypervisor to pass the guest
information about how much time was spent running other processes
outside the VM, while the vcpu had meaningful work to do - halt
time does not count.

This information is acquired through the run_delay field of the
delayacct/schedstats infrastructure, which counts time spent in a
runqueue but not running.

Steal time is per-cpu information, so the traditional MSR-based
infrastructure is used. A new MSR, MSR_KVM_STEAL_TIME, holds the
address of the memory area containing information about steal time.

This patch contains the hypervisor part of the steal time infrastructure,
and can be backported independently of the guest portion.
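
Conceptually, the host samples the vcpu task's run_delay around guest entry
and publishes the delta into the guest-registered area. A simplified sketch
of that bookkeeping (the real code below splits this into
accumulate_steal_time()/record_steal_time() and goes through the cached
guest read/write helpers):

	/* Sketch: host-side steal time bookkeeping for one vcpu. */
	static void update_steal_time(struct kvm_vcpu *vcpu)
	{
		u64 now = current->sched_info.run_delay;	/* ns runnable but not running */
		u64 delta = now - vcpu->arch.st.last_steal;

		vcpu->arch.st.last_steal = now;

		/* Publish: bump the accumulated steal and keep the version even. */
		vcpu->arch.st.steal.steal += delta;
		vcpu->arch.st.steal.version += 2;
	}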

Signed-off-by: Glauber Costa glom...@redhat.com
Tested-by: Eric B Munson emun...@mgebm.net
CC: Rik van Riel r...@redhat.com
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Peter Zijlstra pet...@infradead.org
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 arch/x86/include/asm/kvm_host.h |9 +
 arch/x86/include/asm/kvm_para.h |4 ++
 arch/x86/kvm/Kconfig|1 +
 arch/x86/kvm/x86.c  |   74 +-
 include/linux/kvm_host.h|1 +
 5 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da6bbee..59086a7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,6 +389,15 @@ struct kvm_vcpu_arch {
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+
+   struct {
+   u64 msr_val;
+   u64 last_steal;
+   u64 accum_steal;
+   struct gfn_to_hva_cache stime;
+   struct kvm_steal_time steal;
+   } st;
+
u64 last_guest_tsc;
u64 last_kernel_ns;
u64 last_tsc_nsec;
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 65f8bb9..c484ba8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -45,6 +45,10 @@ struct kvm_steal_time {
__u32 pad[12];
 };
 
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
 #define KVM_MAX_MMU_OP_BATCH   32
 
 #define KVM_ASYNC_PF_ENABLED   (1  0)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f6364..99c3f05 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
+   select TASK_DELAY_ACCT
---help---
  Support hosting fully virtualized guest machines using hardware
  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7167717..6282f6c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -808,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN8
+#define KVM_SAVE_MSRS_BEGIN9
 static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-   HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+   HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1491,6 +1491,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
}
 }
 
+static void accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+   u64 delta;
+
+   if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+   return;
+
+   delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+   vcpu->arch.st.last_steal = current->sched_info.run_delay;
+   vcpu->arch.st.accum_steal = delta;
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+   return;
+
+   if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+   return;
+
+   vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+   vcpu->arch.st.steal.version += 2;
+   vcpu->arch.st.accum_steal = 0;
+
+   kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
switch (msr) {
@@ -1573,6 +1602,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, 
u64 data)
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
+   

[PATCH v6 9/9] KVM-GST: KVM Steal time registration

2011-07-11 Thread Glauber Costa
This patch implements the kvm bits of the steal time infrastructure.
The most important part of it is the steal time clock. It is a
continuous clock that shows the accumulated amount of steal time
since vcpu creation. It is supposed to survive cpu offlining/onlining.
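
Surviving offlining/onlining works because the guest re-registers its
per-cpu area every time a cpu comes up and clears the registration on the
way down, while the hypervisor keeps the accumulated value per vcpu. A
sketch of that life cycle (the wrapper names here are placeholders; the
real hooks are kvm_register_steal_time()/kvm_disable_steal_time() below):

	/* Sketch: cpu online path: (re)register the per-cpu steal time area. */
	static void steal_time_cpu_online(void)
	{
		struct kvm_steal_time *st = &per_cpu(steal_time, smp_processor_id());

		memset(st, 0, sizeof(*st));
		wrmsrl(MSR_KVM_STEAL_TIME, __pa(st) | KVM_MSR_ENABLED);
	}

	/* Sketch: cpu offline path: enable bit cleared, hypervisor stops writing. */
	static void steal_time_cpu_offline(void)
	{
		wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
	}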

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Tested-by: Eric B Munson emun...@mgebm.net
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Peter Zijlstra pet...@infradead.org
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 Documentation/kernel-parameters.txt |4 ++
 arch/x86/include/asm/kvm_para.h |1 +
 arch/x86/kernel/kvm.c   |   73 +++
 arch/x86/kernel/kvmclock.c  |2 +
 4 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index fd248a31..a722574 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1737,6 +1737,10 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
no-kvmapf   [X86,KVM] Disable paravirtualized asynchronous page
fault handling.
 
+   no-steal-acc[X86,KVM] Disable paravirtualized steal time accounting.
+   steal time is computed, but won't influence scheduler
+   behaviour
+
nolapic [X86-32,APIC] Do not enable or use the local APIC.
 
nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c484ba8..35d732d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -94,6 +94,7 @@ struct kvm_vcpu_pv_apf_data {
 
 extern void kvmclock_init(void);
 extern int kvm_register_clock(char *txt);
+extern void kvm_disable_steal_time(void);
 
 
 /* This instruction is vmcall.  On non-VT architectures, it will generate a
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0..58331c2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
 
early_param("no-kvmapf", parse_no_kvmapf);
 
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+   steal_acc = 0;
+   return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
 struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+static void kvm_register_steal_time(void)
+{
+   int cpu = smp_processor_id();
+   struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+   if (!has_steal_clock)
+   return;
+
+   memset(st, 0, sizeof(*st));
+
+   wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+   printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+   cpu, __pa(st));
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
printk(KERN_INFO "KVM setup async PF for cpu %d\n",
   smp_processor_id());
}
+
+   if (has_steal_clock)
+   kvm_register_steal_time();
 }
 
 static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
 };
 
+static u64 kvm_steal_clock(int cpu)
+{
+   u64 steal;
+   struct kvm_steal_time *src;
+   int version;
+
+   src = &per_cpu(steal_time, cpu);
+   do {
+   version = src->version;
+   rmb();
+   steal = src->steal;
+   rmb();
+   } while ((version & 1) || (version != src->version));
+
+   return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+   if (!has_steal_clock)
+   return;
+
+   wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 
 static void kvm_guest_cpu_offline(void *dummy)
 {
+   kvm_disable_steal_time();
kvm_pv_disable_apf(NULL);
apf_task_wake_all();
 }
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
 
+   if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+   has_steal_clock = 1;
+   

[PATCH v6 5/9] KVM-GST: Add a pv_ops stub for steal time

2011-07-11 Thread Glauber Costa
This patch adds a function pointer in one of the many paravirt_ops
structs, to allow guests to register a steal time function. Besides
a steal time function, we also declare two jump_labels. They will be
used to allow the steal time code to be easily bypassed when not
in use.
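
A rough sketch of how this is meant to be consumed: a guest port points the
pv op at its own clock, and hot paths stay cheap because they sit behind the
jump label (how the labels get flipped on is left to the guest patches and
is an assumption here):

	/* Guest setup (sketch): install a real steal clock implementation. */
	pv_time_ops.steal_clock = kvm_steal_clock;

	/* Call site (sketch): a NOP-patched branch when steal time is unused. */
	if (static_branch(&paravirt_steal_enabled)) {
		u64 steal = paravirt_steal_clock(smp_processor_id());
		/* ... feed it into the accounting code ... */
	}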

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Tested-by: Eric B Munson emun...@mgebm.net
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Peter Zijlstra pet...@infradead.org
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 arch/x86/include/asm/paravirt.h   |9 +
 arch/x86/include/asm/paravirt_types.h |1 +
 arch/x86/kernel/paravirt.c|9 +
 3 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8..a7d2db9 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 8288509..2c76521 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
 
 struct pv_time_ops {
unsigned long long (*sched_clock)(void);
+   unsigned long long (*steal_clock)(int cpu);
unsigned long (*get_tsc_khz)(void);
 };
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1ae..613a793 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
__native_flush_tlb_single(addr);
 }
 
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+   return 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
 
 struct pv_time_ops pv_time_ops = {
.sched_clock = native_sched_clock,
+   .steal_clock = native_steal_clock,
 };
 
 struct pv_irq_ops pv_irq_ops = {
-- 
1.7.3.4



[PATCH v6 8/9] KVM-GST: adjust scheduler cpu power

2011-07-11 Thread Glauber Costa
This patch makes update_rq_clock() aware of steal time.
The mechanism of operation is not different from irq_time,
and follows the same principles. This lives in a CONFIG
option itself, and can be compiled out independently of
the rest of steal time reporting. The effect of disabling it
is that the scheduler will still report steal time (that cannot be
disabled), but won't use this information for cpu power adjustments.

Every time update_rq_clock_task() is invoked, we query information
about how much time was stolen since the last call, and feed it into
sched_rt_avg_update().

Although steal time reporting in account_process_tick() keeps
track of the last time we read the steal clock, in prev_steal_time,
this patch does it independently, using another field,
prev_steal_time_rq. This is because otherwise, information about time
accounted in account_process_tick() would never reach us in update_rq_clock().
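
The clock adjustment itself can be summarized in a few lines: whatever was
stolen since the last update is clamped to the elapsed delta, subtracted
from clock_task, and fed to the rt average so cpu power shrinks accordingly
(a condensed sketch of the update_rq_clock_task() hunk below):

	/* Sketch: discount stolen time from the task clock and from cpu power. */
	s64 steal = paravirt_steal_clock(cpu_of(rq)) - rq->prev_steal_time_rq;

	if (steal > delta)
		steal = delta;			/* never discount more than elapsed time */

	steal = steal_ticks(steal) * TICK_NSEC;	/* round down to whole ticks */
	rq->prev_steal_time_rq += steal;

	delta -= steal;
	rq->clock_task += delta;
	sched_rt_avg_update(rq, steal);		/* lower cpu power by the stolen part */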

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Acked-by: Peter Zijlstra pet...@infradead.org
Tested-by: Eric B Munson emun...@mgebm.net
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 arch/x86/Kconfig|   12 
 kernel/sched.c  |   47 +--
 kernel/sched_features.h |4 ++--
 3 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da34972..b26f312 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -512,6 +512,18 @@ menuconfig PARAVIRT_GUEST
 
 if PARAVIRT_GUEST
 
+config PARAVIRT_TIME_ACCOUNTING
+   bool Paravirtual steal time accounting
+   select PARAVIRT
+   default n
+   ---help---
+ Select this option to enable fine granularity task steal time 
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
 source arch/x86/xen/Kconfig
 
 config KVM_CLOCK
diff --git a/kernel/sched.c b/kernel/sched.c
index aa6c030..8d57196 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -532,6 +532,9 @@ struct rq {
 #ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
 #endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+   u64 prev_steal_time_rq;
+#endif
 
/* calc_load related fields */
unsigned long calc_load_update;
@@ -1971,8 +1974,14 @@ static inline u64 steal_ticks(u64 steal)
 
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-   s64 irq_delta;
-
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || 
defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+   s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
/*
@@ -1995,12 +2004,35 @@ static void update_rq_clock_task(struct rq *rq, s64 
delta)
 
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+   if (static_branch((&paravirt_steal_rq_enabled))) {
+   u64 st;
+
+   steal = paravirt_steal_clock(cpu_of(rq));
+   steal -= rq->prev_steal_time_rq;
+
+   if (unlikely(steal > delta))
+   steal = delta;
+
+   st = steal_ticks(steal);
+   steal = st * TICK_NSEC;
+
+   rq->prev_steal_time_rq += steal;
+
+   delta -= steal;
+   }
+#endif
+
rq->clock_task += delta;
 
-   if (irq_delta && sched_feat(NONIRQ_POWER))
-   sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || 
defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+   if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+   sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2035,12 +2067,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime(0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-   rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
#include "sched_idletask.c"
#include "sched_fair.c"
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f73..ca3b025 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
 
 /*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
  */
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
 
 /*
  * Queue remote wakeups on the target CPU and process them
-- 
1.7.3.4


[PATCH v6 0/9] Steal time for KVM

2011-07-11 Thread Glauber Costa
Hi guys,

All ACKs being collected, I hope this can be merged.
Only patch 4/9 differs from the last submission. It merges
Marcelo's suggestions, so I didn't transpose the other ACKs to it.
All the rest is kept the same.

Enjoy

Glauber Costa (8):
  KVM-HDR Add constant to represent KVM MSRs enabled bit
  KVM-HDR: KVM Steal time implementation
  KVM-HV: KVM Steal time implementation
  KVM-GST: Add a pv_ops stub for steal time
  add jump labels for ia64 paravirt
  KVM-GST: KVM Steal time accounting
  KVM-GST: adjust scheduler cpu power
  KVM-GST: KVM Steal time registration

Gleb Natapov (1):
  introduce kvm_read_guest_cached

 Documentation/kernel-parameters.txt   |4 ++
 Documentation/virtual/kvm/msr.txt |   35 +
 arch/ia64/include/asm/paravirt.h  |4 ++
 arch/ia64/kernel/paravirt.c   |2 +
 arch/x86/Kconfig  |   12 +
 arch/x86/include/asm/kvm_host.h   |9 +++
 arch/x86/include/asm/kvm_para.h   |   15 ++
 arch/x86/include/asm/paravirt.h   |9 +++
 arch/x86/include/asm/paravirt_types.h |1 +
 arch/x86/kernel/kvm.c |   73 +++
 arch/x86/kernel/kvmclock.c|2 +
 arch/x86/kernel/paravirt.c|9 +++
 arch/x86/kvm/Kconfig  |1 +
 arch/x86/kvm/x86.c|   74 +++-
 include/linux/kvm_host.h  |3 +
 kernel/sched.c|   88 +
 kernel/sched_features.h   |4 +-
 virt/kvm/kvm_main.c   |   20 +++
 18 files changed, 351 insertions(+), 14 deletions(-)

-- 
1.7.3.4



[PATCH v6 1/9] introduce kvm_read_guest_cached

2011-07-11 Thread Glauber Costa
From: Gleb Natapov g...@redhat.com

Introduce a kvm_read_guest_cached() function in addition to the write one
we already have.

[ by glauber: export function signature in kvm header ]
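
The intended usage pattern mirrors the existing write-side helper:
initialize the gfn_to_hva_cache once for a guest physical address, then read
through it on the hot path. A sketch (the steal-time patches later in this
series are the expected first user; the snippet is illustrative only):

	/* Sketch: one-time cache setup for a guest physical address... */
	struct gfn_to_hva_cache ghc;
	struct kvm_steal_time st;

	kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa);	/* translate and remember the hva */

	/* ...then cheap cached reads afterwards. */
	if (kvm_read_guest_cached(kvm, &ghc, &st, sizeof(st)))
		return;					/* -EFAULT on a bad mapping */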

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Tested-by: Eric Munson emun...@mgebm.net
---
 include/linux/kvm_host.h |2 ++
 virt/kvm/kvm_main.c  |   20 
 2 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59..f7df0a3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -381,6 +381,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void 
*data, int offset,
 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
  unsigned long len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+  void *data, unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11d2783..d5ef9eb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1418,6 +1418,26 @@ int kvm_write_guest_cached(struct kvm *kvm, struct 
gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+  void *data, unsigned long len)
+{
+   struct kvm_memslots *slots = kvm_memslots(kvm);
+   int r;
+
+   if (slots->generation != ghc->generation)
+   kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
+
+   if (kvm_is_error_hva(ghc->hva))
+   return -EFAULT;
+
+   r = __copy_from_user(data, (void __user *)ghc->hva, len);
+   if (r)
+   return -EFAULT;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
 {
return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
-- 
1.7.3.4



[PATCH v6 3/9] KVM-HDR: KVM Steal time implementation

2011-07-11 Thread Glauber Costa
To implement steal time, we need the hypervisor to pass the guest information
about how much time was spent running other processes outside the VM.
This is per-vcpu, and using the kvmclock structure for that is an abuse
we decided not to make.

In this patchset, I am introducing a new MSR, MSR_KVM_STEAL_TIME, that
holds the address of the memory area containing information about steal time.

This patch contains the headers for it. I am keeping it separate to facilitate
backports to people who want to backport the kernel part but not the
hypervisor, or the other way around.
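
The version field documented below behaves like a seqcount: the hypervisor
makes it odd while an update is in flight and even once the data is stable,
so a guest reads the area roughly like this (a sketch of the protocol, not
code from this patch):

	/* Sketch: seqcount-style read of the steal time area. */
	static u64 read_steal(struct kvm_steal_time *st)
	{
		u32 version;
		u64 steal;

		do {
			version = st->version;
			rmb();		/* read version before data */
			steal = st->steal;
			rmb();		/* read data before re-checking version */
		} while ((version & 1) || (version != st->version));

		return steal;
	}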

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Rik van Riel r...@redhat.com
Tested-by: Eric B Munson emun...@mgebm.net
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Peter Zijlstra pet...@infradead.org
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
---
 Documentation/virtual/kvm/msr.txt |   35 +++
 arch/x86/include/asm/kvm_para.h   |9 +
 2 files changed, 44 insertions(+), 0 deletions(-)

diff --git a/Documentation/virtual/kvm/msr.txt 
b/Documentation/virtual/kvm/msr.txt
index d079aed..38db3f8 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -185,3 +185,38 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02
 
Currently type 2 APF will be always delivered on the same vcpu as
type 1 was, but guest should not rely on that.
+
+MSR_KVM_STEAL_TIME: 0x4b564d03
+
+   data: 64-byte alignment physical address of a memory area which must be
+   in guest RAM, plus an enable bit in bit 0. This memory is expected to
+   hold a copy of the following structure:
+
+   struct kvm_steal_time {
+   __u64 steal;
+   __u32 version;
+   __u32 flags;
+   __u32 pad[12];
+   }
+
+   whose data will be filled in by the hypervisor periodically. Only one
+   write, or registration, is needed for each VCPU. The interval between
+   updates of this structure is arbitrary and implementation-dependent.
+   The hypervisor may update this structure at any time it sees fit until
+   anything with bit0 == 0 is written to it. Guest is required to make sure
+   this structure is initialized to zero.
+
+   Fields have the following meanings:
+
+   version: a sequence counter. In other words, guest has to check
+   this field before and after grabbing time information and make 
+   sure they are both equal and even. An odd version indicates an
+   in-progress update.
+
+   flags: At this point, always zero. May be used to indicate
+   changes in this structure in the future.
+
+   steal: the amount of time in which this vCPU did not run, in
+   nanoseconds. Time during which the vcpu is idle, will not be
+   reported as steal time.
+
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index d6cd79b..65f8bb9 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2       3
 #define KVM_FEATURE_ASYNC_PF   4
+#define KVM_FEATURE_STEAL_TIME 5
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -35,6 +36,14 @@
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME  0x4b564d03
+
+struct kvm_steal_time {
+   __u64 steal;
+   __u32 version;
+   __u32 flags;
+   __u32 pad[12];
+};
 
 #define KVM_MAX_MMU_OP_BATCH   32
 
-- 
1.7.3.4



[PATCH v6 6/9] add jump labels for ia64 paravirt

2011-07-11 Thread Glauber Costa
Since a later patch will use jump labels inside CONFIG_PARAVIRT code,
IA64 would fail to compile if they are not provided. This patch
provides those jump labels for the IA64 architecture.

Signed-off-by: Glauber Costa glom...@redhat.com
Acked-by: Isaku Yamahata yamah...@valinux.co.jp
Acked-by: Rik van Riel r...@redhat.com
CC: Tony Luck tony.l...@intel.com
CC: Eddie Dong eddie.d...@intel.com
CC: Jeremy Fitzhardinge jeremy.fitzhardi...@citrix.com
CC: Peter Zijlstra pet...@infradead.org
CC: Avi Kivity a...@redhat.com
CC: Anthony Liguori aligu...@us.ibm.com
CC: Eric B Munson emun...@mgebm.net
---
 arch/ia64/include/asm/paravirt.h |4 
 arch/ia64/kernel/paravirt.c  |2 ++
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 2eb0a98..32551d3 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -281,6 +281,10 @@ paravirt_init_missing_ticks_accounting(int cpu)
pv_time_ops.init_missing_ticks_accounting(cpu);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
 static inline int
 paravirt_do_steal_accounting(unsigned long *new_itm)
 {
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index a21d7bb..1008682 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -634,6 +634,8 @@ struct pv_irq_ops pv_irq_ops = {
  * pv_time_ops
  * time operations
  */
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
 
 static int
 ia64_native_do_steal_accounting(unsigned long *new_itm)
-- 
1.7.3.4



[PATCH] kvm tools: Wait for VCPU to reboot before returning

2011-07-11 Thread Sasha Levin
Wait until all VCPU threads have received the reboot signal and have
exited.

This prevents occasional error messages when exiting kvm tools with the
Ctrl-a + x combo.

Reported-by: Asias He asias.he...@gmail.com
Signed-off-by: Sasha Levin levinsasha...@gmail.com
---
 tools/kvm/kvm-cpu.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/tools/kvm/kvm-cpu.c b/tools/kvm/kvm-cpu.c
index 2f5d23c..5825ac9 100644
--- a/tools/kvm/kvm-cpu.c
+++ b/tools/kvm/kvm-cpu.c
@@ -422,6 +422,7 @@ static void kvm_cpu__handle_coalesced_mmio(struct kvm_cpu 
*cpu)
 void kvm_cpu__reboot(void)
 {
pthread_kill(kvm_cpus[0]->thread, SIGKVMEXIT);
+   pthread_join(kvm_cpus[0]->thread, NULL);
 }
 
 int kvm_cpu__start(struct kvm_cpu *cpu)
-- 
1.7.6



Re: [PATCH 1/3] perf: add context field to perf_event

2011-07-11 Thread Will Deacon
On Mon, Jul 04, 2011 at 03:36:57PM +0100, Frederic Weisbecker wrote:
> On Mon, Jul 04, 2011 at 05:10:20PM +0300, Avi Kivity wrote:
> > On 07/04/2011 04:58 PM, Frederic Weisbecker wrote:
> > > Another thing I would like to do in the even longer term is to not use
> > > perf anymore for ptrace breakpoints, because that involves a heavy
> > > dependency and few people are happy with that. Instead we should just
> > > have a generic hook into the sched_switch() and handle pure ptrace
> > > breakpoints there. The central breakpoint API would still be there to
> > > reserve/schedule breakpoint resources between ptrace and perf.
> >
> > 'struct preempt_notifier' may be the hook you're looking for.
>
> Yeah looks like a perfect fit as it's per task.

I had a quick look at this and I think the preempt_notifier stuff needs
slightly extending so that we can register a notifier for a task other than
current [e.g. the child of current on which we are installing breakpoints].

If the task in question is running, it looks like this will introduce a race
condition between notifier registration and rescheduling. For the purposes
of ptrace this shouldn't be a problem as the child will be stopped, but
others might also want to make use of the new functionality.
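
For reference, the existing API is already per task, but registration only
works for current, which is where the race comes from. A sketch of what
registration looks like today (standard preempt notifier API, shown only to
frame the discussion):

	#include <linux/preempt.h>

	static void bp_sched_in(struct preempt_notifier *pn, int cpu)
	{
		/* (re)install this task's breakpoints on the cpu */
	}

	static void bp_sched_out(struct preempt_notifier *pn, struct task_struct *next)
	{
		/* save/clear the breakpoints before switching away */
	}

	static struct preempt_ops bp_preempt_ops = {
		.sched_in	= bp_sched_in,
		.sched_out	= bp_sched_out,
	};

	static struct preempt_notifier bp_notifier;

	static void bp_register_self(void)
	{
		preempt_notifier_init(&bp_notifier, &bp_preempt_ops);
		preempt_notifier_register(&bp_notifier);	/* current only */
	}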

Any ideas on how this could be achieved, or am I better off just restricting
this to children that are being traced?

Cheers,

Will


Re: [Qemu-devel] [RFC v3 31/56] ac97: convert to memory API

2011-07-11 Thread malc
On Mon, 11 Jul 2011, Avi Kivity wrote:

> On 07/11/2011 04:42 AM, Anthony Liguori wrote:
> > On 07/10/2011 03:33 PM, malc wrote:
> > > On Sun, 10 Jul 2011, Avi Kivity wrote:
> > >
> > > > fixes BAR sizing as well.
> > >
> > > I find this patch disgusting, the read and write handlers in particular.
> >
> > Shouldn't it be possible to do something like:
> >
> > typedef struct OldMemoryRegionOps {
> >     MemoryRegionOps parent_ops;
> >     CPUReadMemoryFunc *readfn[3];
> >     CPUWriteMemoryFunc *writefn[3];
> >     void *opaque;
> > } OldMemoryRegionOps;
> >
> > That should allow old-style implementations to be converted without
> > introducing trampoline functions everywhere.
>
> Here's a new version:

This one looks acceptable[1]; the original submission said
"fixes BAR sizing as well." What was wrong with it?

[..snip..] 

P.S. Sans minor inconsistency with trailing commas.

-- 
mailto:av1...@comtv.ru


[PATCH RFC] vhost: address fixme in vhost TX zero-copy support

2011-07-11 Thread Michael S. Tsirkin
So the following should do it, on top of Shirley's patch, I think. I'm
not quite sure about using vq->upend_idx - vq->done_idx to check the
number of outstanding DMAs, Shirley, what do you think?
Untested.

I'm also thinking about making the use of this conditional
on a module parameter, off by default, to reduce
stability risk while still enabling more people to
test the feature.
Thoughts?
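
Gating this behind a module parameter could be as small as the following
sketch (not part of the patch; the parameter and helper names are made up):

	/* Sketch: opt-in switch for the experimental zero-copy TX path. */
	static int experimental_zcopytx;
	module_param(experimental_zcopytx, int, 0444);
	MODULE_PARM_DESC(experimental_zcopytx, "Enable experimental zero-copy TX (default: off)");

	/* ...and only set up the ubuf machinery when it is requested: */
	if (experimental_zcopytx)
		vhost_enable_zcopy(vq);	/* hypothetical helper */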

Signed-off-by: Michael S. Tsirkin m...@redhat.com

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7de0c6e..cf8deb3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -156,8 +156,7 @@ static void handle_tx(struct vhost_net *net)
 
for (;;) {
/* Release DMAs done buffers first */
-   if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
-   vhost_zerocopy_signal_used(vq);
+   vhost_zerocopy_signal_used(vq);
 
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -175,7 +174,7 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* If more outstanding DMAs, queue the work */
-   if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
+   if (vq->upend_idx - vq->done_idx > VHOST_MAX_PEND) {
tx_poll_start(net, sock);
set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
break;
@@ -214,12 +213,12 @@ static void handle_tx(struct vhost_net *net)
 
vq->heads[vq->upend_idx].len = len;
ubuf->callback = vhost_zerocopy_callback;
-   ubuf->arg = vq;
+   ubuf->arg = vq->ubufs;
ubuf->desc = vq->upend_idx;
msg.msg_control = ubuf;
msg.msg_controllen = sizeof(ubuf);
+   kref_get(&vq->ubufs->kref);
}
-   atomic_inc(&vq->refcnt);
vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
@@ -646,6 +645,7 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
 {
struct socket *sock, *oldsock;
struct vhost_virtqueue *vq;
+   struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
int r;
 
mutex_lock(&n->dev.mutex);
@@ -675,6 +675,13 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
oldsock = rcu_dereference_protected(vq->private_data,
lockdep_is_held(&vq->mutex));
if (sock != oldsock) {
+   ubufs = vhost_ubuf_alloc(vq, sock);
+   if (IS_ERR(ubufs)) {
+   r = PTR_ERR(ubufs);
+   goto err_ubufs;
+   }
+   oldubufs = vq->ubufs;
+   vq->ubufs = ubufs;
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
vhost_net_enable_vq(n, vq);
@@ -682,6 +689,9 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
 
mutex_unlock(&vq->mutex);
 
+   if (oldubufs)
+   vhost_ubuf_put_and_wait(oldubufs);
+
if (oldsock) {
vhost_net_flush_vq(n, index);
fput(oldsock->file);
@@ -690,6 +700,8 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
mutex_unlock(&n->dev.mutex);
return 0;
 
+err_ubufs:
+   fput(sock);
 err_vq:
mutex_unlock(&vq->mutex);
 err:
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index db242b1..81b1dd7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -181,7 +181,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->log_ctx = NULL;
vq->upend_idx = 0;
vq->done_idx = 0;
-   atomic_set(&vq->refcnt, 0);
+   vq->ubufs = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -401,7 +401,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
  * of used idx. Once lower device DMA done contiguously, we will signal KVM
  * guest used idx.
  */
-void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
 {
int i, j = 0;
 
@@ -414,10 +414,9 @@ void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
} else
break;
}
-   if (j) {
+   if (j)
vq->done_idx = i;
-   atomic_sub(j, &vq->refcnt);
-   }
+   return j;
 }
 
 /* Caller should have device mutex */
@@ -430,9 +429,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
vhost_poll_stop(&dev->vqs[i].poll);
  

Re: [PATCH RFC] vhost: address fixme in vhost TX zero-copy support

2011-07-11 Thread Shirley Ma
On Tue, 2011-07-12 at 01:04 +0300, Michael S. Tsirkin wrote:
> So the following should do it, on top of Shirley's patch, I think. I'm
> not quite sure about using vq->upend_idx - vq->done_idx to check the
> number of outstanding DMAs, Shirley, what do you think?

Yes, you can use this to track # outstanding DMAs.

> Untested.
>
> I'm also thinking about making the use of this conditional
> on a module parameter, off by default, to reduce
> stability risk while still enabling more people to
> test the feature.
> Thoughts?

Agreed.

 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 7de0c6e..cf8deb3 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -156,8 +156,7 @@ static void handle_tx(struct vhost_net *net)
 
 for (;;) {
 /* Release DMAs done buffers first */
 -   if (atomic_read(vq-refcnt)  VHOST_MAX_PEND)
 -   vhost_zerocopy_signal_used(vq);
 +   vhost_zerocopy_signal_used(vq);
 
 head = vhost_get_vq_desc(net-dev, vq, vq-iov,
  ARRAY_SIZE(vq-iov),
 @@ -175,7 +174,7 @@ static void handle_tx(struct vhost_net *net)
 break;
 }
 /* If more outstanding DMAs, queue the work */
 -   if (atomic_read(vq-refcnt)  VHOST_MAX_PEND)
 {
 +   if (vq-upend_idx - vq-done_idx 
 VHOST_MAX_PEND) {
 tx_poll_start(net, sock);
 set_bit(SOCK_ASYNC_NOSPACE,
 sock-flags);
 break;
 @@ -214,12 +213,12 @@ static void handle_tx(struct vhost_net *net)
 
 vq-heads[vq-upend_idx].len = len;
 ubuf-callback =
 vhost_zerocopy_callback;
 -   ubuf-arg = vq;
 +   ubuf-arg = vq-ubufs;
 ubuf-desc = vq-upend_idx;
 msg.msg_control = ubuf;
 msg.msg_controllen = sizeof(ubuf);
 +   kref_get(vq-ubufs-kref);
 }
 -   atomic_inc(vq-refcnt);
 vq-upend_idx = (vq-upend_idx + 1) %
 UIO_MAXIOV;
 }
 /* TODO: Check specific error and bomb out unless
 ENOBUFS? */
 @@ -646,6 +645,7 @@ static long vhost_net_set_backend(struct vhost_net
 *n, unsigned index, int fd)
  {
 struct socket *sock, *oldsock;
 struct vhost_virtqueue *vq;
 +   struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
 int r;
 
 mutex_lock(n-dev.mutex);
 @@ -675,6 +675,13 @@ static long vhost_net_set_backend(struct
 vhost_net *n, unsigned index, int fd)
 oldsock = rcu_dereference_protected(vq-private_data,
 
 lockdep_is_held(vq-mutex));
 if (sock != oldsock) {
 +   ubufs = vhost_ubuf_alloc(vq, sock);
 +   if (IS_ERR(ubufs)) {
 +   r = PTR_ERR(ubufs);
 +   goto err_ubufs;
 +   }
 +   oldubufs = vq-ubufs;
 +   vq-ubufs = ubufs;
 vhost_net_disable_vq(n, vq);
 rcu_assign_pointer(vq-private_data, sock);
 vhost_net_enable_vq(n, vq);
 @@ -682,6 +689,9 @@ static long vhost_net_set_backend(struct vhost_net
 *n, unsigned index, int fd)
 
 mutex_unlock(vq-mutex);
 
 +   if (oldbufs)
 +   vhost_ubuf_put_and_wait(oldbufs);
 +
 if (oldsock) {
 vhost_net_flush_vq(n, index);
 fput(oldsock-file);
 @@ -690,6 +700,8 @@ static long vhost_net_set_backend(struct vhost_net
 *n, unsigned index, int fd)
 mutex_unlock(n-dev.mutex);
 return 0;
 
 +err_ubufs:
 +   fput(sock);
  err_vq:
 mutex_unlock(vq-mutex);
  err:
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index db242b1..81b1dd7 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -181,7 +181,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 vq-log_ctx = NULL;
 vq-upend_idx = 0;
 vq-done_idx = 0;
 -   atomic_set(vq-refcnt, 0);
 +   vq-ubufs = NULL;
  }
 
  static int vhost_worker(void *data)
 @@ -401,7 +401,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
   * of used idx. Once lower device DMA done contiguously, we will
 signal KVM
   * guest used idx.
   */
 -void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
 +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
  {
 int i, j = 0;
 
 @@ -414,10 +414,9 @@ void vhost_zerocopy_signal_used(struct
 vhost_virtqueue *vq)
 } else
 break;
 }
 -   if (j) {
 +   if (j)
 vq-done_idx = i;
 -   atomic_sub(j, vq-refcnt);
 -   }
 +