[PATCH 1/5] trace: Add trace-events file for declaring trace events
This patch introduces the trace-events file where trace events can be declared like so: qemu_malloc(size_t size) "size %zu" qemu_free(void *ptr) "ptr %p" These trace event declarations are processed by a new tool called tracetool to generate code for the trace events. Trace event declarations are independent of the backend tracing system (LTTng User Space Tracing, kernel markers, DTrace). The default "nop" backend generates empty trace event functions. Therefore trace events are disabled by default. The trace-events file serves two purposes: 1. Adding trace events is easy. It is not necessary to understand the details of a backend tracing system. The trace-events file is a single location where trace events can be declared without code duplication. 2. QEMU is not tightly coupled to one particular backend tracing system. In order to support tracing across QEMU host platforms and to anticipate new backend tracing systems that are currently maturing, it is important to be flexible and not tied to one system. Signed-off-by: Stefan Hajnoczi --- .gitignore |2 + Makefile| 17 +-- Makefile.objs |5 ++ Makefile.target |1 + configure | 19 +++ trace-events| 24 tracetool | 162 +++ 7 files changed, 226 insertions(+), 4 deletions(-) create mode 100644 trace-events create mode 100755 tracetool diff --git a/.gitignore b/.gitignore index fdfe2f0..4644557 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ config-devices.* config-all-devices.* config-host.* config-target.* +trace.h +trace.c *-softmmu *-darwin-user *-linux-user diff --git a/Makefile b/Makefile index 306a1a4..ff57845 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for QEMU. -GENERATED_HEADERS = config-host.h +GENERATED_HEADERS = config-host.h trace.h ifneq ($(wildcard config-host.mak),) # Put the all: rule here so that config-host.mak can contain dependencies. 
@@ -130,16 +130,24 @@ bt-host.o: QEMU_CFLAGS += $(BLUEZ_CFLAGS) iov.o: iov.c iov.h +trace.h: trace-events + $(call quiet-command,sh $(SRC_PATH)/tracetool --$(TRACE_BACKEND) -h < $< > $@," GEN $@") + +trace.c: trace-events + $(call quiet-command,sh $(SRC_PATH)/tracetool --$(TRACE_BACKEND) -c < $< > $@," GEN $@") + +trace.o: trace.c + ## qemu-img.o: qemu-img-cmds.h qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o: $(GENERATED_HEADERS) -qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y) +qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) -qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y) +qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) -qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y) +qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(call quiet-command,sh $(SRC_PATH)/hxtool -h < $< > $@," GEN $@") @@ -157,6 +165,7 @@ clean: rm -f *.o *.d *.a $(TOOLS) TAGS cscope.* *.pod *~ */*~ rm -f slirp/*.o slirp/*.d audio/*.o audio/*.d block/*.o block/*.d net/*.o net/*.d rm -f qemu-img-cmds.h + rm -f trace.c trace.h $(MAKE) -C tests clean for d in $(ALL_SUBDIRS) libhw32 libhw64 libuser libdis libdis-user; do \ if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \ diff --git a/Makefile.objs b/Makefile.objs index acbaf22..9bbdf6f 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -248,6 +248,11 @@ libdis-$(CONFIG_S390_DIS) += s390-dis.o libdis-$(CONFIG_SH4_DIS) += sh4-dis.o libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o +## +# trace + +trace-obj-y = trace.o + vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS) vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS) diff --git a/Makefile.target b/Makefile.target index a22484e..4e63c02 100644 --- a/Makefile.target +++ b/Makefile.target @@ -294,6 +294,7 @@ $(obj-y) 
$(obj-$(TARGET_BASE_ARCH)-y): $(GENERATED_HEADERS) obj-y += $(addprefix ../, $(common-obj-y)) obj-y += $(addprefix ../libdis/, $(libdis-y)) +obj-y += $(addprefix ../, $(trace-obj-y)) obj-y += $(libobj-y) obj-y += $(addprefix $(HWDIR)/, $(hw-obj-y)) diff --git a/configure b/configure index 3cd2c5f..5e66f3a 100755 --- a/configure +++ b/configure @@ -299,6 +299,7 @@ pkgversion="" check_utests="no" user_pie="no" zero_malloc="" +trace_backend="nop" # OS specific if check_define __linux__ ; then @@ -494,6 +495,8 @@ for opt do ;; --target-list=*) target_list="$optarg" ;; + --trace-backend=*) trace_backend="$optarg" + ;; --enable-gprof) gprof="yes" ;; --static) @@ -826,6 +829,7 @
[PATCH 2/5] trace: Add simple built-in tracing backend
This patch adds a simple tracer which produces binary trace files and is built into QEMU. The main purpose of this patch is to show how new tracing backends can be added to tracetool. To try out the simple backend: ./configure --trace-backend=simple make After running QEMU you can pretty-print the trace: ./tracetool --simple --py events.py # first time only ./simpletrace.py /tmp/trace.log Signed-off-by: Stefan Hajnoczi --- This is the same trivial tracer that I posted previously. .gitignore |2 + Makefile.objs |3 + configure |2 +- simpletrace.c | 64 simpletrace.py | 38 + tracetool | 127 ++- 6 files changed, 232 insertions(+), 4 deletions(-) create mode 100644 simpletrace.c create mode 100755 simpletrace.py diff --git a/.gitignore b/.gitignore index 4644557..68fb21d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ config-host.* config-target.* trace.h trace.c +events.py *-softmmu *-darwin-user *-linux-user @@ -39,6 +40,7 @@ qemu-monitor.texi *.log *.pdf *.pg +*.pyc *.toc *.tp *.vr diff --git a/Makefile.objs b/Makefile.objs index 9bbdf6f..d870767 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -252,6 +252,9 @@ libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o # trace trace-obj-y = trace.o +ifeq ($(TRACE_BACKEND),simple) +trace-obj-y += simpletrace.o +endif vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS) diff --git a/configure b/configure index 5e66f3a..d599879 100755 --- a/configure +++ b/configure @@ -829,7 +829,7 @@ echo " --enable-docsenable documentation build" echo " --disable-docs disable documentation build" echo " --disable-vhost-net disable vhost-net acceleration support" echo " --enable-vhost-net enable vhost-net acceleration support" -echo " --trace-backend=BTrace backend nop" +echo " --trace-backend=BTrace backend nop simple" echo "" echo "NOTE: The object files are built at the place where configure is launched" exit 1 diff --git a/simpletrace.c b/simpletrace.c new file mode 100644 index 000..2fec4d3 --- /dev/null +++ b/simpletrace.c @@ -0,0 +1,64 @@ +#include 
+#include +#include "trace.h" + +typedef struct { +unsigned long event; +unsigned long x1; +unsigned long x2; +unsigned long x3; +unsigned long x4; +unsigned long x5; +} TraceRecord; + +enum { +TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord), +}; + +static TraceRecord trace_buf[TRACE_BUF_LEN]; +static unsigned int trace_idx; +static FILE *trace_fp; + +static void trace(TraceEvent event, unsigned long x1, + unsigned long x2, unsigned long x3, + unsigned long x4, unsigned long x5) { +TraceRecord *rec = &trace_buf[trace_idx]; +rec->event = event; +rec->x1 = x1; +rec->x2 = x2; +rec->x3 = x3; +rec->x4 = x4; +rec->x5 = x5; + +if (++trace_idx == TRACE_BUF_LEN) { +trace_idx = 0; + +if (!trace_fp) { +trace_fp = fopen("/tmp/trace.log", "w"); +} +if (trace_fp) { +size_t result = fwrite(trace_buf, sizeof trace_buf, 1, trace_fp); +result = result; +} +} +} + +void trace1(TraceEvent event, unsigned long x1) { +trace(event, x1, 0, 0, 0, 0); +} + +void trace2(TraceEvent event, unsigned long x1, unsigned long x2) { +trace(event, x1, x2, 0, 0, 0); +} + +void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3) { +trace(event, x1, x2, x3, 0, 0); +} + +void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4) { +trace(event, x1, x2, x3, x4, 0); +} + +void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5) { +trace(event, x1, x2, x3, x4, x5); +} diff --git a/simpletrace.py b/simpletrace.py new file mode 100755 index 000..70609cf --- /dev/null +++ b/simpletrace.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +import sys +import struct + +try: +from events import events +except ImportError: +sys.stderr.write('''Unable to import trace events from current working directory. 
Please run: +tracetool --simple --py events.py\n''') +sys.exit(1) + +trace_fmt = 'LL' +trace_len = struct.calcsize(trace_fmt) + +def read_record(fobj): +s = fobj.read(trace_len) +if len(s) != trace_len: +return None +return struct.unpack(trace_fmt, s) + +def format_record(rec): +event = events[rec[0]] +fields = [event[0]] +for i in xrange(1, len(event)): +fields.append('%s=0x%x' % (event[i], rec[i])) +return ' '.join(fields) + +if len(sys.argv) != 2: +sys.stderr.write('usage: %s \n' % sys.argv[0]) +sys.exit(1) + +f = open(sys.argv[1], 'rb') +while True: +rec = read_record(f) +if rec is None: +break + +print format_record(rec) diff --git a/tracetool b/tracetool
[RFC 0/5] Tracing backends
The following patches against qemu.git allow static trace events to be declared in QEMU. Trace events use a lightweight syntax and are independent of the backend tracing system (e.g. LTTng UST). Supported backends are: * my trivial tracer ("simple") * LTTng Userspace Tracer ("ust") * no tracer ("nop", the default) The ./configure option to choose a backend is --trace-backend=<backend>. Main point of this patchset: adding new trace events is easy and we can switch between backends without modifying the code. Prerna: Would you like to add your tracing system as a backend? This would be similar to my patches to add "simple" and "ust" backend support. Jan: Adding kernel marker backend support should be straightforward if you are interested. These patches are also available at: http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/tracing -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] trace: Trace virtio-blk, multiwrite, and paio_submit
This patch adds trace events that make it possible to observe virtio-blk. Signed-off-by: Stefan Hajnoczi --- block.c|7 +++ hw/virtio-blk.c|7 +++ posix-aio-compat.c |2 ++ trace-events | 14 ++ 4 files changed, 30 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index bfe46e3..86fe7f5 100644 --- a/block.c +++ b/block.c @@ -23,6 +23,7 @@ */ #include "config-host.h" #include "qemu-common.h" +#include "trace.h" #include "monitor.h" #include "block_int.h" #include "module.h" @@ -1913,6 +1914,8 @@ static void multiwrite_cb(void *opaque, int ret) { MultiwriteCB *mcb = opaque; +trace_multiwrite_cb(mcb, ret); + if (ret < 0 && !mcb->error) { mcb->error = ret; multiwrite_user_cb(mcb); @@ -2044,6 +2047,8 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) // Check for mergable requests num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); +trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + // Run the aio requests for (i = 0; i < num_reqs; i++) { acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, @@ -2054,9 +2059,11 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) // submitted yet. Otherwise we'll wait for the submitted AIOs to // complete and report the error in the callback. 
if (mcb->num_requests == 0) { +trace_bdrv_aio_multiwrite_earlyfail(mcb); reqs[i].error = -EIO; goto fail; } else { +trace_bdrv_aio_multiwrite_latefail(mcb, i); mcb->num_requests++; multiwrite_cb(mcb, -EIO); break; diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index b05d15e..ef384e0 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -13,6 +13,7 @@ #include #include +#include "trace.h" #include "virtio-blk.h" #include "block_int.h" #ifdef __linux__ @@ -50,6 +51,8 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) { VirtIOBlock *s = req->dev; +trace_virtio_blk_req_complete(req, status); + req->in->status = status; virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in)); virtio_notify(&s->vdev, s->vq); @@ -87,6 +90,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret) { VirtIOBlockReq *req = opaque; +trace_virtio_blk_rw_complete(req, ret); + if (ret) { int is_read = !(req->out->type & VIRTIO_BLK_T_OUT); if (virtio_blk_handle_rw_error(req, -ret, is_read)) @@ -251,6 +256,8 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req) static void virtio_blk_handle_write(BlockRequest *blkreq, int *num_writes, VirtIOBlockReq *req, BlockDriverState **old_bs) { +trace_virtio_blk_handle_write(req, req->out->sector, req->qiov.size / 512); + if (req->out->sector & req->dev->sector_mask) { virtio_blk_rw_complete(req, -EIO); return; diff --git a/posix-aio-compat.c b/posix-aio-compat.c index b43c531..c2200fe 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -25,6 +25,7 @@ #include "qemu-queue.h" #include "osdep.h" #include "qemu-common.h" +#include "trace.h" #include "block_int.h" #include "block/raw-posix-aio.h" @@ -583,6 +584,7 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, acb->next = posix_aio_state->first_aio; posix_aio_state->first_aio = acb; +trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); qemu_paio_submit(acb); return &acb->common; } diff --git a/trace-events b/trace-events index 
a93ea29..4d96b8e 100644 --- a/trace-events +++ b/trace-events @@ -32,3 +32,17 @@ qemu_free(void *ptr) "ptr %p" qemu_memalign(size_t alignment, size_t size) "alignment %zu size %zu" qemu_valloc(size_t size) "size %zu" qemu_vfree(void *ptr) "ptr %p" + +# block.c +multiwrite_cb(void *mcb, int ret) "mcb %p ret %d" +bdrv_aio_multiwrite(void *mcb, int num_callbacks, int num_reqs) "mcb %p num_callbacks %d num_reqs %d" +bdrv_aio_multiwrite_earlyfail(void *mcb) "mcb %p" +bdrv_aio_multiwrite_latefail(void *mcb, int i) "mcb %p i %d" + +# hw/virtio-blk.c +virtio_blk_req_complete(void *req, int status) "req %p status %d" +virtio_blk_rw_complete(void *req, int ret) "req %p ret %d" +virtio_blk_handle_write(void *req, unsigned long sector, unsigned long nsectors) "req %p sector %lu nsectors %lu" + +# posix-aio-compat.c +paio_submit(void *acb, void *opaque, unsigned long sector_num, unsigned long nb_sectors, unsigned long type) "acb %p opaque %p sector_num %lu nb_sectors %lu type %lu" -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] trace: Trace qemu_malloc() and qemu_vmalloc()
It is often useful to instrument memory management functions in order to find leaks or performance problems. This patch adds trace events for the memory allocation primitives. Signed-off-by: Stefan Hajnoczi --- An example of adding trace events. osdep.c |9 + qemu-malloc.c |4 trace-events | 10 ++ 3 files changed, 23 insertions(+), 0 deletions(-) diff --git a/osdep.c b/osdep.c index abbc8a2..8e4b8ea 100644 --- a/osdep.c +++ b/osdep.c @@ -50,6 +50,7 @@ #endif #include "qemu-common.h" +#include "trace.h" #include "sysemu.h" #include "qemu_socket.h" @@ -71,6 +72,8 @@ static void *oom_check(void *ptr) #if defined(_WIN32) void *qemu_memalign(size_t alignment, size_t size) { +trace_qemu_memalign(alignment, size); + if (!size) { abort(); } @@ -79,6 +82,8 @@ void *qemu_memalign(size_t alignment, size_t size) void *qemu_vmalloc(size_t size) { +trace_qemu_vmalloc(size); + /* FIXME: this is not exactly optimal solution since VirtualAlloc has 64Kb granularity, but at least it guarantees us that the memory is page aligned. */ @@ -90,6 +95,7 @@ void *qemu_vmalloc(size_t size) void qemu_vfree(void *ptr) { +trace_qemu_vfree(ptr); VirtualFree(ptr, 0, MEM_RELEASE); } @@ -97,6 +103,8 @@ void qemu_vfree(void *ptr) void *qemu_memalign(size_t alignment, size_t size) { +trace_qemu_memalign(alignment, size); + #if defined(_POSIX_C_SOURCE) && !defined(__sun__) int ret; void *ptr; @@ -122,6 +130,7 @@ void *qemu_vmalloc(size_t size) void qemu_vfree(void *ptr) { +trace_qemu_vfree(ptr); free(ptr); } diff --git a/qemu-malloc.c b/qemu-malloc.c index 6cdc5de..69fc3cf 100644 --- a/qemu-malloc.c +++ b/qemu-malloc.c @@ -22,6 +22,7 @@ * THE SOFTWARE. 
*/ #include "qemu-common.h" +#include "trace.h" #include static void *oom_check(void *ptr) @@ -39,6 +40,7 @@ void *get_mmap_addr(unsigned long size) void qemu_free(void *ptr) { +trace_qemu_free(ptr); free(ptr); } @@ -53,6 +55,7 @@ static int allow_zero_malloc(void) void *qemu_malloc(size_t size) { +trace_qemu_malloc(size); if (!size && !allow_zero_malloc()) { abort(); } @@ -61,6 +64,7 @@ void *qemu_malloc(size_t size) void *qemu_realloc(void *ptr, size_t size) { +trace_qemu_realloc(ptr, size); if (!size && !allow_zero_malloc()) { abort(); } diff --git a/trace-events b/trace-events index a37d3cc..a93ea29 100644 --- a/trace-events +++ b/trace-events @@ -22,3 +22,13 @@ # system may not have the necessary headers included. # # The should be a sprintf()-compatible format string. + +# qemu-malloc.c +qemu_malloc(size_t size) "size %zu" +qemu_realloc(void *ptr, size_t size) "ptr %p size %zu" +qemu_free(void *ptr) "ptr %p" + +# osdep.c +qemu_memalign(size_t alignment, size_t size) "alignment %zu size %zu" +qemu_valloc(size_t size) "size %zu" +qemu_vfree(void *ptr) "ptr %p" -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] trace: Add LTTng Userspace Tracer backend
This patch adds LTTng Userspace Tracer (UST) backend support. The UST system requires no kernel support but libust and liburcu must be installed. $ ./configure --trace-backend=ust $ make Start the UST daemon: $ ustd & List available tracepoints and enable some: $ ustctl --list-markers $(pgrep qemu) [...] {PID: 5458, channel/marker: ust/paio_submit, state: 0, fmt: "acb %p opaque %p sector_num %lu nb_sectors %lu type %lu" 0x4b32ba} $ ustctl --enable-marker "ust/paio_submit" $(pgrep qemu) Run the trace: $ ustctl --create-trace $(pgrep qemu) $ ustctl --start-trace $(pgrep qemu) [...] $ ustctl --stop-trace $(pgrep qemu) $ ustctl --destroy-trace $(pgrep qemu) Trace results can be viewed using lttv-gui. More information about UST: http://lttng.org/ust Signed-off-by: Stefan Hajnoczi --- I wrote this as part of trying out UST. Although UST is promising, the usability is poor at the moment. The dependencies include the lttv trace viewer which I had to build from source (and it required a makefile tweak to build). Luckily libust, liburcu, and ust-bin are packaged on my distro. Error messages are periodically printed by the UST code when running QEMU. I haven't investigated but this may be due to signals interrupting UST's thread in poll(). Finally, the UST header files include some userspace ported kernel infrastructure and pollute the namespace. I had to add some #undefs to get QEMU to build after including UST headers. I don't see LTTng UST as a default option at the moment. Hopefully this will change in the future. 
configure |5 +++- tracetool | 77 +++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/configure b/configure index d599879..307dbcb 100755 --- a/configure +++ b/configure @@ -829,7 +829,7 @@ echo " --enable-docsenable documentation build" echo " --disable-docs disable documentation build" echo " --disable-vhost-net disable vhost-net acceleration support" echo " --enable-vhost-net enable vhost-net acceleration support" -echo " --trace-backend=BTrace backend nop simple" +echo " --trace-backend=BTrace backend nop simple ust" echo "" echo "NOTE: The object files are built at the place where configure is launched" exit 1 @@ -2302,6 +2302,9 @@ bsd) esac echo "TRACE_BACKEND=$trace_backend" >> $config_host_mak +if test "$trace_backend" = "ust"; then + LIBS="-lust $LIBS" +fi tools= if test `expr "$target_list" : ".*softmmu.*"` != 0 ; then diff --git a/tracetool b/tracetool index bcd163e..72beb20 100755 --- a/tracetool +++ b/tracetool @@ -3,12 +3,13 @@ usage() { cat >&2 <" +} + +linetoh_ust() +{ +local name args argnames +name=$(get_name "$1") +args=$(get_args "$1") +argnames=$(get_argnames "$1") + +cat < +#include "trace.h" +EOF +} + +linetoc_ust() +{ +local name args argnames fmt +name=$(get_name "$1") +args=$(get_args "$1") +argnames=$(get_argnames "$1") +fmt=$(get_fmt "$1") + +cat
Re: repeatable hang with loop mount and heavy IO in guest (now in host - not KVM then..)
On 05/23/2010 01:10 AM, Jim Paris wrote: Antoine Martin wrote: On 02/27/2010 12:38 AM, Antoine Martin wrote: 1 0 0 98 0 1| 0 0 | 66B 354B| 0 0 | 3011 1 1 0 98 0 0| 0 0 | 66B 354B| 0 0 | 2911 > From that point onwards, nothing will happen. The host has disk IO to spare... So what is it waiting for?? Moved to an AMD64 host. No effect. Disabled swap before running the test. No effect. Moved the guest to a fully up-to-date FC12 server (2.6.31.6-145.fc12.x86_64), no effect. I have narrowed it down to the guest's filesystem used for backing the disk image which is loop mounted: although it was not completely full (and had enough inodes), freeing some space on it prevents the system from misbehaving. FYI: the disk image was clean and was fscked before each test. kvm had been updated to 0.12.3 The weird thing is that the same filesystem works fine (no system hang) if used directly from the host, it is only misbehaving via kvm... So I am not dismissing the possibility that kvm may be at least partly to blame, or that it is exposing a filesystem bug (race?) not normally encountered. (I have backed up the full 32GB virtual disk in case someone suggests further investigation) Well, well. I've just hit the exact same bug on another *host* (not a guest), running stock Fedora 12. So this isn't a kvm bug after all. Definitely a loop+ext(4?) bug. Looks like you need a pretty big loop mounted partition to trigger it. (bigger than available ram?) This is what triggered it on a quad amd system with 8Gb of ram, software raid-1 partition: mount -o loop 2GB.dd source dd if=/dev/zero of=8GB.dd bs=1048576 count=8192 mkfs.ext4 -f 8GB.dd mount -o loop 8GB.dd dest rsync -rplogtD source/* dest/ umount source umount dest ^ this is where it hangs, I then tried to issue a 'sync' from another terminal, which also hung. It took more than 10 minutes to settle itself, during that time one CPU was stuck in wait state. 
This sounds like: https://bugzilla.kernel.org/show_bug.cgi?id=15906 https://bugzilla.redhat.com/show_bug.cgi?id=588930 Indeed it does. Let's hope this makes it to -stable fast. Antoine -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: repeatable hang with loop mount and heavy IO in guest (now in host - not KVM then..)
Antoine Martin wrote: > On 02/27/2010 12:38 AM, Antoine Martin wrote: > >>> 1 0 0 98 0 1| 0 0 | 66B 354B| 0 0 | 3011 > >>> 1 1 0 98 0 0| 0 0 | 66B 354B| 0 0 | 2911 > >>>From that point onwards, nothing will happen. > >>>The host has disk IO to spare... So what is it waiting for?? > >>Moved to an AMD64 host. No effect. > >>Disabled swap before running the test. No effect. > >>Moved the guest to a fully up-to-date FC12 server > >>(2.6.31.6-145.fc12.x86_64), no effect. > >I have narrowed it down to the guest's filesystem used for backing > >the disk image which is loop mounted: although it was not > >completely full (and had enough inodes), freeing some space on it > >prevents the system from misbehaving. > > > >FYI: the disk image was clean and was fscked before each test. kvm > >had been updated to 0.12.3 > >The weird thing is that the same filesystem works fine (no system > >hang) if used directly from the host, it is only misbehaving via > >kvm... > > > >So I am not dismissing the possibility that kvm may be at least > >partly to blame, or that it is exposing a filesystem bug (race?) > >not normally encountered. > >(I have backed up the full 32GB virtual disk in case someone > >suggests further investigation) > Well, well. I've just hit the exact same bug on another *host* (not > a guest), running stock Fedora 12. > So this isn't a kvm bug after all. Definitely a loop+ext(4?) bug. > Looks like you need a pretty big loop mounted partition to trigger > it. (bigger than available ram?) > > This is what triggered it on a quad amd system with 8Gb of ram, > software raid-1 partition: > mount -o loop 2GB.dd source > dd if=/dev/zero of=8GB.dd bs=1048576 count=8192 > mkfs.ext4 -f 8GB.dd > mount -o loop 8GB.dd dest > rsync -rplogtD source/* dest/ > umount source > umount dest > ^ this is where it hangs, I then tried to issue a 'sync' from > another terminal, which also hung. > It took more than 10 minutes to settle itself, during that time one > CPU was stuck in wait state. 
This sounds like: https://bugzilla.kernel.org/show_bug.cgi?id=15906 https://bugzilla.redhat.com/show_bug.cgi?id=588930 -jim -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: ixgbe: macvlan on PF/VF when SRIOV is enabled
>-Original Message- >From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] >On Behalf Of Shirley Ma >Sent: Friday, May 21, 2010 1:31 PM >To: Kirsher, Jeffrey T >Cc: da...@davemloft.net; kvm@vger.kernel.org; net...@vger.kernel.org; >e1000-de...@lists.sourceforge.net >Subject: ixgbe: macvlan on PF/VF when SRIOV is enabled > >Hello Jeff, > >macvlan doesn't work on PF when SRIOV is enabled. Creating macvlan has >been successful, but ping (icmp request) goes to VF interface not >PF/macvlan even arp entry is correct. I patched ixgbe driver, and >macvlan/PF has worked with the patch. But I am not sure whether it is >right since I don't have the HW spec. What I did for ixgbe driver was: > >1. PF's rar index is 0, VMDQ index is adapter->num_vfs; >2. VF's rar is based on rar_used_count and mc_addr_in_rar_count, VMDQ >index is ; >3. PF's secondary addresses is PF's rar index + i, VMDQ index is >adapter->num_vfs. As of 2.6.34 the ixgbe driver does not support multiple queues for macvlan. Support for multiple queues for macvlan will come in a subsequent release. > > >Before I submit the patch, I want to understand the right index >assignment for both rar index and VMDQ index, when SRIOV enabled: >1. VMDQ index for PF is adapter->num_vfs, or 0? rar index is 0? >2. PF's secondary address rar index is based on >rar_used_count/mc_addr_in_rar_count? >2. VF's VMDQ index is based on vf number? >3. VF's rar index is vf + 1, or should be based on rar_used_count? > >I am also working on macvlan on VF. The question here is whether macvlan >on VF should work or not? Looks like ixgbevf secondary addresses are not >in receiver address filter, so macvlan on VF doesn't work. The VF driver does not support macvlan. Future releases may but there are no immediate plans to support it. - Greg Rose Intel Corp. Lan Access Division
Re: Gentoo guest with smp: emerge freeze while recompile world
> > 4 S root 3458 3457 0 80 0 - 4454 wait 13:00 pts/000:00:00 > > -/bin/bash > > 4 S root 3462 3458 0 75 -5 - 45171 poll_s 13:00 pts/000:00:34 > > /usr/bin/python2.6 /usr/bin/emerge -e world > > 4 S root 3613 1 0 80 0 - 14014 wait 13:01 tty1 00:00:00 > > /bin/login -- > > 4 S root 3953 3613 0 80 0 - 4429 n_tty_ 13:01 tty1 00:00:00 -bash > > 0 S root 6614 3462 0 75 -5 - 972 wait 14:26 pts/000:00:00 > > [dev-util/pkgconfig-0.23] sandbox "/usr/lib64/portage/bin/ebuild.sh" > > compile > > 4 S root 6615 6614 0 75 -5 - 6362 wait 14:26 pts/000:00:00 > > /bin/bash /usr/lib64/portage/bin/ebuild.sh compile > > 5 S root 6646 6615 0 75 -5 - 6745 wait 14:26 pts/000:00:00 > > /bin/bash /usr/lib64/portage/bin/ebuild.sh compile > > 4 S root 13235 6646 0 75 -5 - 3651 wait 14:27 pts/000:00:00 > > make -j8 > > 4 S root 13238 13235 0 75 -5 - 3652 wait 14:27 pts/000:00:00 > > make all-recursive > > 4 S root 13239 13238 0 75 -5 - 5956 wait 14:27 pts/000:00:00 > > /bin/sh -c set fnord $MAKEFLAGS; amf=$2; \?dot_seen=no; \?target=`echo > > all-recursive | sed s/-recursive//`; \?list= > > 5 S root 13243 13239 0 75 -5 - 5956 wait 14:27 pts/000:00:00 > > /bin/sh -c set fnord $MAKEFLAGS; amf=$2; \?dot_seen=no; \?target=`echo > > all-recursive | sed s/-recursive//`; \?list= > > 4 S root 13244 13243 0 75 -5 - 3686 wait 14:27 pts/000:00:00 > > make all > > 4 S root 13358 13244 0 75 -5 - 3684 wait 14:27 pts/000:00:00 > > make all-recursive > > 4 S root 13359 13358 0 75 -5 - 5956 wait 14:27 pts/000:00:00 > > /bin/sh -c set fnord $MAKEFLAGS; amf=$2; \?dot_seen=no; \?target=`echo > > all-recursive | sed s/-recursive//`; \?list= > > 5 S root 16546 13359 0 75 -5 - 5956 wait 14:28 pts/000:00:00 > > /bin/sh -c set fnord $MAKEFLAGS; amf=$2; \?dot_seen=no; \?target=`echo > > all-recursive | sed s/-recursive//`; \?list= > > 4 S root 16547 16546 0 75 -5 - 3652 wait 14:28 pts/000:00:00 > > make all > > 4 S root 16548 16547 0 75 -5 - 3652 n_tty_ 14:28 pts/000:00:00 > > make all-am > > 4 S root 16599 3258 0 80 
0 - 17937 poll_s 15:07 ?00:00:00 > > sshd: r...@pts/2 > > 4 S root 16602 16599 0 80 0 - 4429 wait 15:07 pts/200:00:00 -bash > > 4 R root 16611 16602 0 80 0 - 3698 - 15:08 pts/200:00:00 ps -elf > > 1 S root 31506 2 0 80 0 - 0 bdi_wr 14:25 ?00:00:00 > > [flush-253:0] > > > > All in wait? > > > > Maybe a block driver problem? Are you using virtio? > I just tried an lvm block drive without virtio; the problem persists with kernel >=2.6.32 Best regards, Riccardo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Detecting Guest Shutdown
Hi all, is it possible to detect a guest shutdown? I want to stop a service if my windows guest is shut down and force a sync of the disks - because it could be possible that the user switches off the system afterwards and he has no possibility to tell linux to shut down safely, because he only sees the shut down screen of windows. Best regards, Erik -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: raw disks no longer work in latest kvm (kvm-88 was fine)
On 05/22/2010 06:17 PM, Michael Tokarev wrote: 22.05.2010 14:44, Antoine Martin wrote: Bump. Now that qemu is less likely to eat my data, " *[Qemu-devel] [PATCH 4/8] block: fix sector comparism in*" http://marc.info/?l=qemu-devel&m=127436114712437 I thought I would try using the raw 1.5TB partition again with KVM, still no go. Hm. I don't have so much diskspace (my largest is 750Gb, whole disk), but I created 1.5Tb sparse lvm volume. It appears to work for me, even 32bit version of qemu-kvm-0.12.4 (with the mentioned patch applied). I am still having to use: #undef CONFIG_PREADV Host and guest kernel version is 2.6.34, headers 2.6.33, glibc 2.10.1-r1 qemu-kvm 0.12.4 + patch above. eglibc-2.10.2-6, kernel #2.6.34.0-amd64, kernel headers 2.6.32-11~bpo50+1 (debian) Who do I need to bug? glibc? kvm? are you running 32bit userspace and 64bit kernel by a chance? If yes that's a kernel prob, see http://thread.gmane.org/gmane.linux.kernel.aio.general/2891 (the fix will be in 2.6.35 hopefully, now it's in Andrew Morton's tree). If not, well, I don't know ;) I'm not: 64-bit host and 64-bit guest. Thanks anyway. Antoine /mjt -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: raw disks no longer work in latest kvm (kvm-88 was fine)
22.05.2010 14:44, Antoine Martin wrote: Bump. Now that qemu is less likely to eat my data, " *[Qemu-devel] [PATCH 4/8] block: fix sector comparism in*" http://marc.info/?l=qemu-devel&m=127436114712437 I thought I would try using the raw 1.5TB partition again with KVM, still no go. Hm. I don't have so much diskspace (my largest is 750Gb, whole disk), but I created 1.5Tb sparse lvm volume. It appears to work for me, even 32bit version of qemu-kvm-0.12.4 (with the mentioned patch applied). I am still having to use: #undef CONFIG_PREADV Host and guest kernel version is 2.6.34, headers 2.6.33, glibc 2.10.1-r1 qemu-kvm 0.12.4 + patch above. eglibc-2.10.2-6, kernel #2.6.34.0-amd64, kernel headers 2.6.32-11~bpo50+1 (debian) Who do I need to bug? glibc? kvm? are you running 32bit userspace and 64bit kernel by a chance? If yes that's a kernel prob, see http://thread.gmane.org/gmane.linux.kernel.aio.general/2891 (the fix will be in 2.6.35 hopefully, now it's in Andrew Morton's tree). If not, well, I don't know ;) /mjt -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: raw disks no longer work in latest kvm (kvm-88 was fine)
Bump. Now that qemu is less likely to eat my data, " *[Qemu-devel] [PATCH 4/8] block: fix sector comparism in*" http://marc.info/?l=qemu-devel&m=127436114712437 I thought I would try using the raw 1.5TB partition again with KVM, still no go. I am still having to use: #undef CONFIG_PREADV Host and guest kernel version is 2.6.34, headers 2.6.33, glibc 2.10.1-r1 qemu-kvm 0.12.4 + patch above. Who do I need to bug? glibc? kvm? Thanks Antoine On 04/09/2010 05:00 AM, Antoine Martin wrote: Antoine Martin wrote: On 03/08/2010 02:35 AM, Avi Kivity wrote: On 03/07/2010 09:25 PM, Antoine Martin wrote: On 03/08/2010 02:17 AM, Avi Kivity wrote: On 03/07/2010 09:13 PM, Antoine Martin wrote: What version of glibc do you have installed? Latest stable: sys-devel/gcc-4.3.4 sys-libs/glibc-2.10.1-r1 $ git show glibc-2.10~108 | head commit e109c6124fe121618e42ba882e2a0af6e97b8efc Author: Ulrich Drepper Date: Fri Apr 3 19:57:16 2009 + * misc/Makefile (routines): Add preadv, preadv64, pwritev, pwritev64. * misc/Versions: Export preadv, preadv64, pwritev, pwritev64 for GLIBC_2.10. * misc/sys/uio.h: Declare preadv, preadv64, pwritev, pwritev64. * sysdeps/unix/sysv/linux/kernel-features.h: Add entries for preadv You might get away with rebuilding glibc against the 2.6.33 headers. The latest kernel headers available in gentoo (and they're masked unstable): sys-kernel/linux-headers-2.6.32 So I think I will just keep using Christoph's patch until .33 hits portage. Unless there's any reason not to? I would rather keep my system "clean". I can try it though, if that helps you clear things up? preadv/pwritev was actually introduced in 2.6.30. Perhaps you last built glibc before that? If so, a rebuild may be all that's necessary. To be certain, I've rebuilt qemu-kvm against: linux-headers-2.6.33 + glibc-2.10.1-r1 (both freshly built) And still no go! I'm still having to use the patch which disables preadv unconditionally... 
Better late than never, here's the relevant part of the strace (for the unpatched case where it fails): stat("./fs", {st_mode=S_IFBLK|0660, st_rdev=makedev(8, 41), ...}) = 0 open("./fs", O_RDWR|O_DIRECT|O_CLOEXEC) = 12 lseek(12, 0, SEEK_END) = 1321851815424 [pid 31266] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31266] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31266] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31266] lseek(12, 0, SEEK_SET) = 0 [pid 31266] read(12, "\240\246E\32\r\21\367c\212\316Xn\177e'\310}\234\1\273`\371\266\247\r\1nj\332\32\221\26"..., 512) = 512 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, "iQ\35 \271O\203vj\ve[Ni}\355\263\272\4#yMo\266.\341\21\340Y5\204\20"..., 4096, 1321851805696) = 4096 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 
1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31271] pread(12, [pid 31267] lseek(12, 0, SEEK_END) = 1321851815424 [pid 31273] pread(12, [pid 31267] lseek(12, 0, S