tcg: Use interval tree for user-only page tracking

Alex Bennée Thu, 27 Oct 2022 11:13:26 -0700


Richard Henderson <richard.hender...@linaro.org> writes:


> On 10/26/22 23:36, Alex Bennée wrote:
>> Richard Henderson <richard.hender...@linaro.org> writes:
>> 
>>> Finish weaning user-only away from PageDesc.
>>>
>>> Using an interval tree to track page permissions means that
>>> we can represent very large regions efficiently.
>>>
>>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/290
>>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/967
>>> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1214
>>> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
>>> ---
>>>   accel/tcg/internal.h           |   4 +-
>>>   accel/tcg/tb-maint.c           |  20 +-
>>>   accel/tcg/user-exec.c          | 614 ++++++++++++++++++++++-----------
>>>   tests/tcg/multiarch/test-vma.c |  22 ++
>>>   4 files changed, 451 insertions(+), 209 deletions(-)
>>>   create mode 100644 tests/tcg/multiarch/test-vma.c
>>>
>>> diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
>>> index 250f0daac9..c7e157d1cd 100644
>>> --- a/accel/tcg/internal.h
>>> +++ b/accel/tcg/internal.h
>>> @@ -24,9 +24,7 @@
>>>   #endif
>>>     typedef struct PageDesc {
>>> -#ifdef CONFIG_USER_ONLY
>>> -    unsigned long flags;
>>> -#else
>>> +#ifndef CONFIG_USER_ONLY
>>>       QemuSpin lock;
>>>       /* list of TBs intersecting this ram page */
>>>       uintptr_t first_tb;
>>> diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
>>> index 14e8e47a6a..694440cb4a 100644
>>> --- a/accel/tcg/tb-maint.c
>>> +++ b/accel/tcg/tb-maint.c
>>> @@ -68,15 +68,23 @@ static void page_flush_tb(void)
>> <snip>
>>>     int page_get_flags(target_ulong address)
>>>   {
>>> -    PageDesc *p;
>>> +    PageFlagsNode *p = pageflags_find(address, address);
>>>   -    p = page_find(address >> TARGET_PAGE_BITS);
>>> +    /*
>>> +     * See util/interval-tree.c re lockless lookups: no false positives but
>>> +     * there are false negatives.  If we find nothing, retry with the mmap
>>> +     * lock acquired.
>>> +     */
>>>       if (!p) {
>>> -        return 0;
>>> +        if (have_mmap_lock()) {
>>> +            return 0;
>>> +        }
>>> +        mmap_lock();
>>> +        p = pageflags_find(address, address);
>>> +        mmap_unlock();
>>> +        if (!p) {
>>> +            return 0;
>>> +        }
>>>       }
>>>       return p->flags;
>> To avoid the brain twisting following locks and multiple return legs
>> how about this:
>>    int page_get_flags(target_ulong address)
>>    {
>>        PageFlagsNode *p = pageflags_find(address, address);
>>        /*
>>         * See util/interval-tree.c re lockless lookups: no false positives 
>> but
>>         * there are false negatives.  If we had the lock and found
>>         * nothing we are done, otherwise retry with the mmap lock acquired.
>>         */
>>        if (have_mmap_lock()) {
>>            return p ? p->flags : 0;
>>        }
>>        mmap_lock();
>>        p = pageflags_find(address, address);
>>        mmap_unlock();
>>        return p ? p->flags : 0;
>>    }
>
> I'm unwilling to put an expensive test like a function call
> (have_mmap_lock) before an inexpensive test like pointer != NULL.

Is it really that more expensive?

> I don't see what's so brain twisting about the code as is.  The lock
> tightly surrounds a single statement, with a couple of pointer tests.

Sure, I guess I'm just trying to avoid having so many returns out of
the code at various levels of nesting. The page_get_target_data code is
harder to follow. What about:

int page_get_flags(target_ulong address)
{
    PageFlagsNode *p = pageflags_find(address, address);

    /*
     * See util/interval-tree.c re lockless lookups: no false positives but
     * there are false negatives.  If we had the lock and found
     * nothing we are done, otherwise retry with the mmap lock acquired.
     */
    if (p) {
        return p->flags;
    } else if (have_mmap_lock()) {
        return 0;
    }

    mmap_lock();
    p = pageflags_find(address, address);
    mmap_unlock();

    return p ? p->flags : 0;
}

and:

static IntervalTreeNode * new_target_data_locked(target_ulong region)
{
    IntervalTreeNode *n;
    TargetPageDataNode *t;

    t = g_new0(TargetPageDataNode, 1);
    n = &t->itree;
    n->start = region;
    n->last = region | ~TBD_MASK;
    interval_tree_insert(n, &targetdata_root);

    return n;
}

static inline void * get_target_data(IntervalTreeNode *n,
                                     target_ulong region, target_ulong page)
{
    TargetPageDataNode *t;
    t = container_of(n, TargetPageDataNode, itree);
    return t->data[(page - region) >> TARGET_PAGE_BITS];
}

void *page_get_target_data(target_ulong address)
{
    IntervalTreeNode *n;
    target_ulong page, region;

    page = address & TARGET_PAGE_MASK;
    region = address & TBD_MASK;

    n = interval_tree_iter_first(&targetdata_root, page, page);
    if (n) {
        return get_target_data(n, region, page);
    }

    /*
     * See util/interval-tree.c re lockless lookups: no false positives but
     * there are false negatives.  If we find nothing but had the lock
     * then we need a new node, otherwise try again under lock and
     * potentially allocate a new node.
     */
    if (have_mmap_lock()) {
        n = new_target_data_locked(region);
        return get_target_data(n, region, page);
    } else {
        mmap_lock();
        n = interval_tree_iter_first(&targetdata_root, page, page);

        if (!n) {
            n = new_target_data_locked(region);
        }
        mmap_unlock();
    }

    return get_target_data(n, region, page);
}

>
>>> +/*
>>> + * Test very large vma allocations.
>>> + * The qemu out-of-memory condition was within the mmap syscall itself.
>>> + * If the syscall actually returns with MAP_FAILED, the test succeeded.
>>> + */
>>> +#include <sys/mman.h>
>>> +
>>> +int main()
>>> +{
>>> +    int n = sizeof(size_t) == 4 ? 32 : 45;
>>> +
>>> +    for (int i = 28; i < n; i++) {
>>> +        size_t l = (size_t)1 << i;
>>> +        void *p = mmap(0, l, PROT_NONE,
>>> +                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
>>> +        if (p == MAP_FAILED) {
>>> +            break;
>>> +        }
>>> +        munmap(p, l);
>>> +    }
>>> +    return 0;
>>> +}
>> So is the failure mode here we actually seg or bus out?
>
> SEGV or KILL (via oom) depending on the state of the system. If the
> host is *really* beefy, it may even complete but with an unreasonable
> timeout.
>
> r~


-- 
Alex Bennée

Re: [PATCH 22/24] accel/tcg: Use interval tree for user-only page tracking

Reply via email to