On Tue, May 14, 2013 at 5:34 PM, Paolo Bonzini <pbonz...@redhat.com> wrote:
> Il 14/05/2013 07:47, liu ping fan ha scritto:
>> On Mon, May 13, 2013 at 5:31 PM, Paolo Bonzini <pbonz...@redhat.com> wrote:
>>> Il 13/05/2013 05:21, Liu Ping Fan ha scritto:
>>>> From: Liu Ping Fan <pingf...@linux.vnet.ibm.com>
>>>>
>>>> Each address space listener has PhysPageMap *cur_map, *next_map,
>>>> the switch from cur_map to next_map complete the RCU style. The
>>>> mem_commit() do the switch, and it is against reader but AddressSpace's
>>>> lock or later RCU mechanism (around address_space_translate() ).
>>>>
>>>> Signed-off-by: Liu Ping Fan <pingf...@linux.vnet.ibm.com>
>>>> ---
>>>>  exec.c                         |   36 +++++++++++++++++++++++++++---------
>>>>  include/exec/memory-internal.h |   11 ++++++++++-
>>>>  2 files changed, 37 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/exec.c b/exec.c
>>>> index bb4e540..e5871d6 100644
>>>> --- a/exec.c
>>>> +++ b/exec.c
>>>> @@ -186,24 +186,26 @@ static void phys_page_set(AddressSpaceDispatch *d,
>>>>                            hwaddr index, hwaddr nb,
>>>>                            uint16_t leaf)
>>>>  {
>>>> +    PhysPageMap *map = d->next_map;
>>>>      /* Wildly overreserve - it doesn't matter much. */
>>>>      phys_map_node_reserve(3 * P_L2_LEVELS);
>>>>
>>>> -    phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
>>>> +    phys_page_set_level(&map->root, &index, &nb, leaf, P_L2_LEVELS - 1);
>>>>  }
>>>>
>>>>  static PhysSection *phys_section_find(AddressSpaceDispatch *d,
>>>>                                        hwaddr index)
>>>>  {
>>>> -    PhysPageEntry lp = d->phys_map;
>>>>      PhysPageEntry *p;
>>>> -    PhysSection *phys_sections = cur_pgtbl->phys_sections;
>>>> -    Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
>>>> +    PhysPageEntry lp = d->cur_map->root;
>>>> +    PhysPageTable *pgtbl = d->cur_map->pgtbl;
>>>> +    PhysSection *phys_sections = pgtbl->phys_sections;
>>>> +    Node *phys_map_nodes = pgtbl->phys_map_nodes;
>>>>      int i;
>>>>
>>>>      for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
>>>>          if (lp.ptr == PHYS_MAP_NODE_NIL) {
>>>> -            return &phys_sections[cur_pgtbl->phys_section_unassigned];
>>>> +            return &phys_sections[pgtbl->phys_section_unassigned];
>>>>          }
>>>>          p = phys_map_nodes[lp.ptr];
>>>>          lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
>>>> @@ -234,7 +236,7 @@ MemoryRegionSection 
>>>> *address_space_translate(AddressSpace *as, hwaddr addr,
>>>>      IOMMUTLBEntry iotlb;
>>>>      MemoryRegionSection *section;
>>>>      hwaddr len = *plen;
>>>> -
>>>> +    PhysPageTable *pgtbl = cur_pgtbl;
>>>
>>> d->cur_map->pgtbl.
>>>
>>>>      for (;;) {
>>>>          section = address_space_lookup_region(as, addr);
>>>>
>>>> @@ -254,7 +256,7 @@ MemoryRegionSection 
>>>> *address_space_translate(AddressSpace *as, hwaddr addr,
>>>>                  | (addr & iotlb.addr_mask));
>>>>          len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
>>>>          if (!iotlb.perm[is_write]) {
>>>> -            section = 
>>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
>>>> +            section = 
>>>> &pgtbl->phys_sections[pgtbl->phys_section_unassigned].section;
>>>>              break;
>>>>          }
>>>>
>>>> @@ -1703,7 +1705,21 @@ static void mem_begin(MemoryListener *listener)
>>>>  {
>>>>      AddressSpaceDispatch *d = container_of(listener, 
>>>> AddressSpaceDispatch, listener);
>>>>
>>>> -    d->phys_map.ptr = PHYS_MAP_NODE_NIL;
>>>> +    d->next_map = g_new0(PhysPageMap, 1);
>>>> +    d->next_map->pgtbl = next_pgtbl;
>>>> +}
>>>> +
>>>> +static void mem_commit(MemoryListener *listener)
>>>> +{
>>>> +    AddressSpaceDispatch *d = container_of(listener, 
>>>> AddressSpaceDispatch, listener);
>>>> +    PhysPageMap *m = d->cur_map;
>>>> +
>>>> +    d->cur_map = d->next_map;
>>>> +    /* Fixme, Currently, we rely on biglock or address-space lock against
>>>> +    * reader. So here, we can safely drop it.
>>>> +    * After RCU, should change to call_rcu()
>>>> +    */
>>>> +    g_free(m);
>>>>  }
>>>>
>>>>  static void core_begin(MemoryListener *listener)
>>>> @@ -1771,11 +1787,12 @@ void address_space_init_dispatch(AddressSpace *as)
>>>>  {
>>>>      AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
>>>>
>>>> -    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 
>>>> 0 };
>>>> +    d->cur_map = g_new0(PhysPageMap, 1);
>>>>      d->listener = (MemoryListener) {
>>>>          .begin = mem_begin,
>>>>          .region_add = mem_add,
>>>>          .region_nop = mem_add,
>>>> +        .commit = mem_commit,
>>>>          .priority = 0,
>>>>      };
>>>>      as->dispatch = d;
>>>> @@ -1787,6 +1804,7 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>>>      AddressSpaceDispatch *d = as->dispatch;
>>>>
>>>>      memory_listener_unregister(&d->listener);
>>>> +    g_free(d->cur_map);
>>>>      g_free(d);
>>>>      as->dispatch = NULL;
>>>>  }
>>>> diff --git a/include/exec/memory-internal.h 
>>>> b/include/exec/memory-internal.h
>>>> index 1b156fd..0dfe260 100644
>>>> --- a/include/exec/memory-internal.h
>>>> +++ b/include/exec/memory-internal.h
>>>> @@ -30,13 +30,22 @@ struct PhysPageEntry {
>>>>      uint16_t ptr : 15;
>>>>  };
>>>>
>>>> +struct PhysPageTable;
>>>> +typedef struct PhysPageMap PhysPageMap;
>>>> +
>>>> +struct PhysPageMap {
>>>> +    PhysPageEntry root;
>>>> +    struct PhysPageTable *pgtbl;
>>>
>>> cur_pgtbl should be introduced in patch 1 already.
>>>
>> PhysPageMap is a member of each AddressSpaceDispatch.  And we achieve
>> RCU based on PhysPageMap *cur_map=next_map.
>
> But assignment of cur_map is not atomic.  That's what I was trying to
> achieve with the do...while loop below.
>
It is atomic. The reason why I introduce PhysPageMap* is to ensure the
atomic assignment in RCU, since root and pgtbl can not be changed
atomically.

>> While cur_pgtbl/next_pgtbl are shared by all AddressSpaceDispatch.
>
> You cannot share them, because the root of some AddressSpaceDispatch
> structures will still refer to the old pagetable.  You need to attach

All the AddressSpaceDispatch's updaters are overlapped in
mem_commit(), the shared cur_pgtbl will be reclaimed only after all
the AddressSpaceDispatch have been reclaimed. So at that time, no
"root of some AddressSpaceDispatch  structures will still refer to the
old pagetable"

> the root of each ASD to the right pagetable, and the simplest way to do
> it is to put cur_pgtbl/next_pgtbl in the ASD.  And this problem exists
> in the first patch already, so that's where you have to put it.
>
I will implement them in each ASD, since it is more coherent with RCU
model (not coalescing updaters ).
>>>> +};
>>>> +
>>>>  typedef struct AddressSpaceDispatch AddressSpaceDispatch;
>>>>
>>>>  struct AddressSpaceDispatch {
>>>>      /* This is a multi-level map on the physical address space.
>>>>       * The bottom level has pointers to MemoryRegionSections.
>>>>       */
>>>> -    PhysPageEntry phys_map;
>>>> +    PhysPageMap *cur_map;
>>>> +    PhysPageMap *next_map;
>>>
>>> Pointers are quite expensive here.  With RCU we can fetch a consistent
>>> root/table pair like this:
>>>
>>>     rcu_read_lock();
>>>     do {
>>>         pgtbl = d->cur_pgtbl;
>>>         smp_rmb();
>>>         root = d->cur_root;
>>>
>>>         /* RCU ensures that d->cur_pgtbl remains alive, thus it cannot
>>>          * be recycled while this loop is running.  If
>>>          * d->cur_pgtbl == pgtbl, the root is the right one for this
>>>          * pgtable.
>>>          */
>>>         smp_rmb();
>>>     } while (d->cur_pgtbl == pgtbl);
>
> Ouch, != of course.
>
>>>     ...
>>>     rcu_read_unlock();
>>>
>> It seems to break the semantic of rcu_dereference() and rcu_assign().
>
> It doesn't.  In fact it is even stronger, I'm using a "full" rmb instead
> of read_barrier_depends.
>
rcu_dereference()/rcu_assign() ensure the switch from prev to next
version, based on atomic-ops. I think your method _does_ work based on
read+check+barrier skill, but it is not the standard RCU method, and
export some concept (barrier) outside RCU.

>> If pointers are expensive, how about this:
>> if (unlikely(d->prev_map!=d->cur_map)) {
>>     d->root = d->cur_map->root;
>>     d->pgtbl = d->cur_map->root;
>>     d->prev_map = d->cur_map;
>> }
>> So usually, we use cache value.
>
rcu_read_lock();
map = rcu_derefenrence(d->cur_map)
if (unlikely(d->prev_map!=map) {
    d->root = map->root;
    d->pgtbl = map->pgtbl;
}
......
rcu_read_unlock();

Then it can avoid ABA problem.

Thanks and regards,
Pingfan

> Doesn't work, it has ABA problem.  In my solution above, RCU avoids ABA
> because the read and the check are under the same RCU critical section.
>  In your solution, the read and the check are under different RCU
> critical sections, so it doesn't work.
>
> Paolo
>
>> Thanks and regards,
>> Pingfan
>>> Remember to have a matching smp_wmb() in mem_commit, and to write ->root
>>> first:
>>>
>>>     old_pgtbl = d->cur_pgtbl;
>>>     smp_wmb();
>>>     d->cur_root = d->next_root;
>>>
>>>     /* Write the root before updating the page table.  */
>>>     smp_wmb();
>>>     d->cur_pgtbl = d->next_pgtbl;
>>>
>>>     /* Write cur_pgtbl before possibly destroying the old one.  */
>>>     smp_mb();
>>>     page_table_unref(old_pgtbl); /* uses call_rcu if --refcount == 0 */
>>>
>>> If you are renaming fields, please do it as the first step.
>>>
>>> Paolo
>>>
>>>>      MemoryListener listener;
>>>>  };
>>>>
>>>>
>>>
>>
>>
>

Reply via email to