于 2014/1/6 21:31, Peter Lieven 写道: > On 06.01.2014 11:08, Wenchao Xia wrote: >> 于 2014/1/6 2:02, Peter Lieven 写道: >>> vnc_update_client currently scans the dirty bitmap of each client >>> bitwise which is a very costly operation if only few bits are dirty. >>> vnc_refresh_server_surface does almost the same. >>> this patch optimizes both by utilizing the heavily optimized >>> function find_next_bit to find the offset of the next dirty >>> bit in the dirty bitmaps. >>> >>> The following artifical test (just the bitmap operation part) running >>> vnc_update_client 65536 times on a 2560x2048 surface illustrates the >>> performance difference: >>> >>> All bits clean - vnc_update_client_new: 0.07 secs >>> vnc_update_client_old: 10.98 secs >>> >>> All bits dirty - vnc_update_client_new: 11.26 secs >>> vnc_update_client_old: 20.19 secs >>> >>> Few bits dirty - vnc_update_client_new: 0.08 secs >>> vnc_update_client_old: 10.98 secs >>> >>> The case for all bits dirty is still rather slow, this >>> is due to the implementation of find_and_clear_dirty_height. >>> This will be addresses in a separate patch. >>> >>> Signed-off-by: Peter Lieven <p...@kamp.de> >>> --- >>> ui/vnc.c | 154 >>> +++++++++++++++++++++++++++++++++----------------------------- >>> ui/vnc.h | 4 ++ >>> 2 files changed, 87 insertions(+), 71 deletions(-) >>> >>> diff --git a/ui/vnc.c b/ui/vnc.c >>> index 1d2aa1a..6a0c03e 100644 >>> --- a/ui/vnc.c >>> +++ b/ui/vnc.c >>> @@ -572,6 +572,14 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y) >>> ptr += x * VNC_SERVER_FB_BYTES; >>> return ptr; >>> } >>> +/* this sets only the visible pixels of a dirty bitmap */ >>> +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\ >>> + int y;\ >>> + memset(bitmap, 0x00, sizeof(bitmap));\ >>> + for (y = 0; y < h; y++) {\ >>> + bitmap_set(bitmap[y], 0, w / VNC_DIRTY_PIXELS_PER_BIT);\ >> Will it be a problem when vnc's width % VNC_DIRTY_PIXELS_PER_BIT != 0? >> Although it is a rare case, but I think it is better round it up, since >> "v" and "VNC_DIRTY_PIXELS_PER_BIT" are variables. A macro computing it >> would be nice: > Good point, I will use DIV_ROUND_UP here. >> >> #define VNC_DIRTY_BITS_FROM_WIDTH(w) (w + VNC_DIRTY_PIXELS_PER_BIT - 1/ >> VNC_DIRTY_PIXELS_PER_BIT) >> #define VNC_DIRTY_BITS (VNC_DIRTY_BITS_FROM_WIDTH(VNC_MAX_WIDTH) >> >> then here: >> bitmap_set(bitmap[y], 0, VNC_DIRTY_BITS_FROM_WIDTH(w)); >> >> Or simply warn or coredump when v % VNC_DIRTY_PIXELS_PER_BIT != 0. >> >> Also, in vnc.h: >> /* VNC_MAX_WIDTH must be a multiple of 16. */ >> #define VNC_MAX_WIDTH 2560 >> #define VNC_MAX_HEIGHT 2048 >> >> Maybe it should be updated as: >> /* VNC_MAX_WIDTH must be a multiple of VNC_DIRTY_PIXELS_PER_BIT. */ > correct. will fix as well. >> >>> + } \ >>> + } >>> >>> static void vnc_dpy_switch(DisplayChangeListener *dcl, >>> DisplaySurface *surface) >>> @@ -597,7 +605,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, >>> qemu_pixman_image_unref(vd->guest.fb); >>> vd->guest.fb = pixman_image_ref(surface->image); >>> vd->guest.format = surface->format; >>> - memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty)); >>> + VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty, >>> + surface_width(vd->ds), >>> + surface_height(vd->ds)); >>> >>> QTAILQ_FOREACH(vs, &vd->clients, next) { >>> vnc_colordepth(vs); >>> @@ -605,7 +615,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, >>> if (vs->vd->cursor) { >>> vnc_cursor_define(vs); >>> } >>> - memset(vs->dirty, 0xFF, sizeof(vs->dirty)); >>> + VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty, >>> + surface_width(vd->ds), >>> + surface_height(vd->ds)); >>> } >>> } >>> >>> @@ -889,10 +901,9 @@ static int vnc_update_client(VncState *vs, int >>> has_dirty) >>> VncDisplay *vd = vs->vd; >>> VncJob *job; >>> int y; >>> - int width, height; >>> + int height; >>> int n = 0; >>> >>> - >>> if (vs->output.offset && !vs->audio_cap && !vs->force_update) >>> /* kernel send buffers are full -> drop frames to throttle */ >>> return 0; >>> @@ -908,39 +919,27 @@ static int vnc_update_client(VncState *vs, int >>> has_dirty) >>> */ >>> job = vnc_job_new(vs); >>> >>> - width = MIN(pixman_image_get_width(vd->server), vs->client_width); >>> height = MIN(pixman_image_get_height(vd->server), >>> vs->client_height); >>> >>> - for (y = 0; y < height; y++) { >>> - int x; >>> - int last_x = -1; >>> - for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { >>> - if (test_and_clear_bit(x, vs->dirty[y])) { >>> - if (last_x == -1) { >>> - last_x = x; >>> - } >>> - } else { >>> - if (last_x != -1) { >>> - int h = find_and_clear_dirty_height(vs, y, last_x, >>> x, >>> - height); >>> - >>> - n += vnc_job_add_rect(job, >>> - last_x * >>> VNC_DIRTY_PIXELS_PER_BIT, >>> - y, >>> - (x - last_x) * >>> - VNC_DIRTY_PIXELS_PER_BIT, >>> - h); >>> - } >>> - last_x = -1; >>> - } >>> - } >>> - if (last_x != -1) { >>> - int h = find_and_clear_dirty_height(vs, y, last_x, x, >>> height); >>> - n += vnc_job_add_rect(job, last_x * >>> VNC_DIRTY_PIXELS_PER_BIT, >>> - y, >>> - (x - last_x) * >>> VNC_DIRTY_PIXELS_PER_BIT, >>> - h); >>> + y = 0; >>> + for (;;) { >>> + int x, h; >>> + unsigned long x2; >>> + unsigned long offset = find_next_bit((unsigned long *) >>> &vs->dirty, >>> + height * >>> VNC_DIRTY_BPL(vs), >>> + y * VNC_DIRTY_BPL(vs)); >>> + if (offset == height * VNC_DIRTY_BPL(vs)) { >>> + /* no more dirty bits */ >>> + break; >>> } >>> + y = offset / VNC_DIRTY_BPL(vs); >>> + x = offset % VNC_DIRTY_BPL(vs); >>> + x2 = find_next_zero_bit((unsigned long *) &vs->dirty[y], >>> + VNC_DIRTY_BPL(vs), x); >>> + bitmap_clear(vs->dirty[y], x, x2 - x); >>> + h = find_and_clear_dirty_height(vs, y, x, x2, height); >>> + n += vnc_job_add_rect(job, x * VNC_DIRTY_PIXELS_PER_BIT, y, >>> + (x2 - x) * VNC_DIRTY_PIXELS_PER_BIT, h); >>> } >> Minor comments: >> VNC_DIRTY_BPL(vs) is accessing memory by pointer, should we use a >> variable instead of VNC_DIRTY_BPL(vs) in every place, in case of >> compiler didn't optimize it for us? > I am pretty sure that sizeof is evaluated at compile time or do you have other > evidence?
You are right, it is sizeof((x)->dirty), I missed the bracket before. >> >>> vnc_job_push(job); >>> @@ -2678,8 +2677,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd) >>> int width = pixman_image_get_width(vd->guest.fb); >>> int height = pixman_image_get_height(vd->guest.fb); >>> int y; >>> - uint8_t *guest_row; >>> - uint8_t *server_row; >>> + uint8_t *guest_row0 = NULL, *server_row0; >> Any reason that rename those variable? > Its actually a pointer to row0 and not to any specific row. This is why > I renamed it. I see, thanks for the explanation. >> >>> + int guest_stride = 0, server_stride; >>> int cmp_bytes; >>> VncState *vs; >>> int has_dirty = 0; >>> @@ -2704,44 +2703,57 @@ static int vnc_refresh_server_surface(VncDisplay >>> *vd) >>> if (vd->guest.format != VNC_SERVER_FB_FORMAT) { >>> int width = pixman_image_get_width(vd->server); >>> tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width); >>> - } >>> - guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb); >>> - server_row = (uint8_t *)pixman_image_get_data(vd->server); >>> - for (y = 0; y < height; y++) { >>> - if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) { >>> - int x; >>> - uint8_t *guest_ptr; >>> - uint8_t *server_ptr; >>> - >>> - if (vd->guest.format != VNC_SERVER_FB_FORMAT) { >>> - qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, >>> y); >>> - guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf); >>> - } else { >>> - guest_ptr = guest_row; >>> - } >>> - server_ptr = server_row; >>> + } else { >>> + guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb); >>> + guest_stride = pixman_image_get_stride(vd->guest.fb); >>> + } >>> + server_row0 = (uint8_t *)pixman_image_get_data(vd->server); >>> + server_stride = pixman_image_get_stride(vd->server); >>> + >>> + y = 0; >>> + for (;;) { >>> + int x; >>> + uint8_t *guest_ptr, *server_ptr; >>> + unsigned long offset = find_next_bit((unsigned long *) >>> &vd->guest.dirty, >>> + height * >>> VNC_DIRTY_BPL(&vd->guest), >>> + y * >>> VNC_DIRTY_BPL(&vd->guest)); >>> + if (offset == height * VNC_DIRTY_BPL(&vd->guest)) { >>> + /* no more dirty bits */ >>> + break; >>> + } >>> + y = offset / VNC_DIRTY_BPL(&vd->guest); >>> >>> - for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width; >>> - x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes, >>> - server_ptr += cmp_bytes) { >>> - if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT), >>> - vd->guest.dirty[y])) { >>> - continue; >>> - } >>> - if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) { >>> - continue; >>> - } >>> - memcpy(server_ptr, guest_ptr, cmp_bytes); >>> - if (!vd->non_adaptive) >>> - vnc_rect_updated(vd, x, y, &tv); >>> - QTAILQ_FOREACH(vs, &vd->clients, next) { >>> - set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]); >>> - } >>> - has_dirty++; >>> + server_ptr = server_row0 + y * server_stride; >>> + >>> + if (vd->guest.format != VNC_SERVER_FB_FORMAT) { >>> + qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y); >>> + guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf); >>> + } else { >>> + guest_ptr = guest_row0 + y * guest_stride; >>> + } >>> + >>> + for (x = offset % VNC_DIRTY_BPL(&vd->guest); >>> + x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width; >>> + x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes, >>> + server_ptr += cmp_bytes) { >>> + if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT), >>> + vd->guest.dirty[y])) { >>> + continue; >>> + } >>> + if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) { >>> + continue; >>> + } >>> + memcpy(server_ptr, guest_ptr, cmp_bytes); >>> + if (!vd->non_adaptive) { >>> + vnc_rect_updated(vd, x, y, &tv); >>> } >>> + QTAILQ_FOREACH(vs, &vd->clients, next) { >>> + set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]); >>> + } >>> + has_dirty++; >>> } >>> - guest_row += pixman_image_get_stride(vd->guest.fb); >>> - server_row += pixman_image_get_stride(vd->server); >>> + >>> + y++; >>> } >>> qemu_pixman_image_unref(tmpbuf); >>> return has_dirty; >>> diff --git a/ui/vnc.h b/ui/vnc.h >>> index 4a8f33c..07e1f59 100644 >>> --- a/ui/vnc.h >>> +++ b/ui/vnc.h >>> @@ -88,6 +88,10 @@ typedef void VncSendHextileTile(VncState *vs, >>> /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */ >>> #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) >>> >>> +/* VNC_DIRTY_BPL (BPL = bits per line) might be greater than >>> + * VNC_DIRTY_BITS due to alignment */ >>> +#define VNC_DIRTY_BPL(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * >>> BITS_PER_BYTE) >>> + >>> #define VNC_STAT_RECT 64 >>> #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT) >>> #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT) >>> > > >