On 06.01.2014 11:08, Wenchao Xia wrote:
> δΊ 2014/1/6 2:02, Peter Lieven ει:
>> vnc_update_client currently scans the dirty bitmap of each client
>> bitwise which is a very costly operation if only few bits are dirty.
>> vnc_refresh_server_surface does almost the same.
>> this patch optimizes both by utilizing the heavily optimized
>> function find_next_bit to find the offset of the next dirty
>> bit in the dirty bitmaps.
>>
>> The following artifical test (just the bitmap operation part) running
>> vnc_update_client 65536 times on a 2560x2048 surface illustrates the
>> performance difference:
>>
>> All bits clean - vnc_update_client_new: 0.07 secs
>> vnc_update_client_old: 10.98 secs
>>
>> All bits dirty - vnc_update_client_new: 11.26 secs
>> vnc_update_client_old: 20.19 secs
>>
>> Few bits dirty - vnc_update_client_new: 0.08 secs
>> vnc_update_client_old: 10.98 secs
>>
>> The case for all bits dirty is still rather slow, this
>> is due to the implementation of find_and_clear_dirty_height.
>> This will be addresses in a separate patch.
>>
>> Signed-off-by: Peter Lieven <p...@kamp.de>
>> ---
>> ui/vnc.c | 154
>> +++++++++++++++++++++++++++++++++-----------------------------
>> ui/vnc.h | 4 ++
>> 2 files changed, 87 insertions(+), 71 deletions(-)
>>
>> diff --git a/ui/vnc.c b/ui/vnc.c
>> index 1d2aa1a..6a0c03e 100644
>> --- a/ui/vnc.c
>> +++ b/ui/vnc.c
>> @@ -572,6 +572,14 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
>> ptr += x * VNC_SERVER_FB_BYTES;
>> return ptr;
>> }
>> +/* this sets only the visible pixels of a dirty bitmap */
>> +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
>> + int y;\
>> + memset(bitmap, 0x00, sizeof(bitmap));\
>> + for (y = 0; y < h; y++) {\
>> + bitmap_set(bitmap[y], 0, w / VNC_DIRTY_PIXELS_PER_BIT);\
> Will it be a problem when vnc's width % VNC_DIRTY_PIXELS_PER_BIT != 0?
> Although it is a rare case, but I think it is better round it up, since
> "v" and "VNC_DIRTY_PIXELS_PER_BIT" are variables. A macro computing it
> would be nice:
Good point, I will use DIV_ROUND_UP here.
>
> #define VNC_DIRTY_BITS_FROM_WIDTH(w) (w + VNC_DIRTY_PIXELS_PER_BIT - 1/
> VNC_DIRTY_PIXELS_PER_BIT)
> #define VNC_DIRTY_BITS (VNC_DIRTY_BITS_FROM_WIDTH(VNC_MAX_WIDTH)
>
> then here:
> bitmap_set(bitmap[y], 0, VNC_DIRTY_BITS_FROM_WIDTH(w));
>
> Or simply warn or coredump when v % VNC_DIRTY_PIXELS_PER_BIT != 0.
>
> Also, in vnc.h:
> /* VNC_MAX_WIDTH must be a multiple of 16. */
> #define VNC_MAX_WIDTH 2560
> #define VNC_MAX_HEIGHT 2048
>
> Maybe it should be updated as:
> /* VNC_MAX_WIDTH must be a multiple of VNC_DIRTY_PIXELS_PER_BIT. */
correct. will fix as well.
>
>> + } \
>> + }
>>
>> static void vnc_dpy_switch(DisplayChangeListener *dcl,
>> DisplaySurface *surface)
>> @@ -597,7 +605,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>> qemu_pixman_image_unref(vd->guest.fb);
>> vd->guest.fb = pixman_image_ref(surface->image);
>> vd->guest.format = surface->format;
>> - memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty));
>> + VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty,
>> + surface_width(vd->ds),
>> + surface_height(vd->ds));
>>
>> QTAILQ_FOREACH(vs, &vd->clients, next) {
>> vnc_colordepth(vs);
>> @@ -605,7 +615,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>> if (vs->vd->cursor) {
>> vnc_cursor_define(vs);
>> }
>> - memset(vs->dirty, 0xFF, sizeof(vs->dirty));
>> + VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty,
>> + surface_width(vd->ds),
>> + surface_height(vd->ds));
>> }
>> }
>>
>> @@ -889,10 +901,9 @@ static int vnc_update_client(VncState *vs, int
>> has_dirty)
>> VncDisplay *vd = vs->vd;
>> VncJob *job;
>> int y;
>> - int width, height;
>> + int height;
>> int n = 0;
>>
>> -
>> if (vs->output.offset && !vs->audio_cap && !vs->force_update)
>> /* kernel send buffers are full -> drop frames to throttle */
>> return 0;
>> @@ -908,39 +919,27 @@ static int vnc_update_client(VncState *vs, int
>> has_dirty)
>> */
>> job = vnc_job_new(vs);
>>
>> - width = MIN(pixman_image_get_width(vd->server), vs->client_width);
>> height = MIN(pixman_image_get_height(vd->server),
>> vs->client_height);
>>
>> - for (y = 0; y < height; y++) {
>> - int x;
>> - int last_x = -1;
>> - for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
>> - if (test_and_clear_bit(x, vs->dirty[y])) {
>> - if (last_x == -1) {
>> - last_x = x;
>> - }
>> - } else {
>> - if (last_x != -1) {
>> - int h = find_and_clear_dirty_height(vs, y, last_x,
>> x,
>> - height);
>> -
>> - n += vnc_job_add_rect(job,
>> - last_x *
>> VNC_DIRTY_PIXELS_PER_BIT,
>> - y,
>> - (x - last_x) *
>> - VNC_DIRTY_PIXELS_PER_BIT,
>> - h);
>> - }
>> - last_x = -1;
>> - }
>> - }
>> - if (last_x != -1) {
>> - int h = find_and_clear_dirty_height(vs, y, last_x, x,
>> height);
>> - n += vnc_job_add_rect(job, last_x *
>> VNC_DIRTY_PIXELS_PER_BIT,
>> - y,
>> - (x - last_x) *
>> VNC_DIRTY_PIXELS_PER_BIT,
>> - h);
>> + y = 0;
>> + for (;;) {
>> + int x, h;
>> + unsigned long x2;
>> + unsigned long offset = find_next_bit((unsigned long *)
>> &vs->dirty,
>> + height * VNC_DIRTY_BPL(vs),
>> + y * VNC_DIRTY_BPL(vs));
>> + if (offset == height * VNC_DIRTY_BPL(vs)) {
>> + /* no more dirty bits */
>> + break;
>> }
>> + y = offset / VNC_DIRTY_BPL(vs);
>> + x = offset % VNC_DIRTY_BPL(vs);
>> + x2 = find_next_zero_bit((unsigned long *) &vs->dirty[y],
>> + VNC_DIRTY_BPL(vs), x);
>> + bitmap_clear(vs->dirty[y], x, x2 - x);
>> + h = find_and_clear_dirty_height(vs, y, x, x2, height);
>> + n += vnc_job_add_rect(job, x * VNC_DIRTY_PIXELS_PER_BIT, y,
>> + (x2 - x) * VNC_DIRTY_PIXELS_PER_BIT, h);
>> }
> Minor comments:
> VNC_DIRTY_BPL(vs) is accessing memory by pointer, should we use a
> variable instead of VNC_DIRTY_BPL(vs) in every place, in case of
> compiler didn't optimize it for us?
I am pretty sure that sizeof is evaluated at compile time or do you have other
evidence?
>
>> vnc_job_push(job);
>> @@ -2678,8 +2677,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>> int width = pixman_image_get_width(vd->guest.fb);
>> int height = pixman_image_get_height(vd->guest.fb);
>> int y;
>> - uint8_t *guest_row;
>> - uint8_t *server_row;
>> + uint8_t *guest_row0 = NULL, *server_row0;
> Any reason that rename those variable?
Its actually a pointer to row0 and not to any specific row. This is why
I renamed it.
>
>> + int guest_stride = 0, server_stride;
>> int cmp_bytes;
>> VncState *vs;
>> int has_dirty = 0;
>> @@ -2704,44 +2703,57 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>> if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>> int width = pixman_image_get_width(vd->server);
>> tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width);
>> - }
>> - guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb);
>> - server_row = (uint8_t *)pixman_image_get_data(vd->server);
>> - for (y = 0; y < height; y++) {
>> - if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) {
>> - int x;
>> - uint8_t *guest_ptr;
>> - uint8_t *server_ptr;
>> -
>> - if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>> - qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
>> - guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
>> - } else {
>> - guest_ptr = guest_row;
>> - }
>> - server_ptr = server_row;
>> + } else {
>> + guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb);
>> + guest_stride = pixman_image_get_stride(vd->guest.fb);
>> + }
>> + server_row0 = (uint8_t *)pixman_image_get_data(vd->server);
>> + server_stride = pixman_image_get_stride(vd->server);
>> +
>> + y = 0;
>> + for (;;) {
>> + int x;
>> + uint8_t *guest_ptr, *server_ptr;
>> + unsigned long offset = find_next_bit((unsigned long *)
>> &vd->guest.dirty,
>> + height *
>> VNC_DIRTY_BPL(&vd->guest),
>> + y * VNC_DIRTY_BPL(&vd->guest));
>> + if (offset == height * VNC_DIRTY_BPL(&vd->guest)) {
>> + /* no more dirty bits */
>> + break;
>> + }
>> + y = offset / VNC_DIRTY_BPL(&vd->guest);
>>
>> - for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
>> - x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
>> - server_ptr += cmp_bytes) {
>> - if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
>> - vd->guest.dirty[y])) {
>> - continue;
>> - }
>> - if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
>> - continue;
>> - }
>> - memcpy(server_ptr, guest_ptr, cmp_bytes);
>> - if (!vd->non_adaptive)
>> - vnc_rect_updated(vd, x, y, &tv);
>> - QTAILQ_FOREACH(vs, &vd->clients, next) {
>> - set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
>> - }
>> - has_dirty++;
>> + server_ptr = server_row0 + y * server_stride;
>> +
>> + if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>> + qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
>> + guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
>> + } else {
>> + guest_ptr = guest_row0 + y * guest_stride;
>> + }
>> +
>> + for (x = offset % VNC_DIRTY_BPL(&vd->guest);
>> + x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
>> + x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
>> + server_ptr += cmp_bytes) {
>> + if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
>> + vd->guest.dirty[y])) {
>> + continue;
>> + }
>> + if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
>> + continue;
>> + }
>> + memcpy(server_ptr, guest_ptr, cmp_bytes);
>> + if (!vd->non_adaptive) {
>> + vnc_rect_updated(vd, x, y, &tv);
>> }
>> + QTAILQ_FOREACH(vs, &vd->clients, next) {
>> + set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
>> + }
>> + has_dirty++;
>> }
>> - guest_row += pixman_image_get_stride(vd->guest.fb);
>> - server_row += pixman_image_get_stride(vd->server);
>> +
>> + y++;
>> }
>> qemu_pixman_image_unref(tmpbuf);
>> return has_dirty;
>> diff --git a/ui/vnc.h b/ui/vnc.h
>> index 4a8f33c..07e1f59 100644
>> --- a/ui/vnc.h
>> +++ b/ui/vnc.h
>> @@ -88,6 +88,10 @@ typedef void VncSendHextileTile(VncState *vs,
>> /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
>> #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)
>>
>> +/* VNC_DIRTY_BPL (BPL = bits per line) might be greater than
>> + * VNC_DIRTY_BITS due to alignment */
>> +#define VNC_DIRTY_BPL(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT *
>> BITS_PER_BYTE)
>> +
>> #define VNC_STAT_RECT 64
>> #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT)
>> #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT)
>>