Re: [PATCH RFC 0/4] static key support for error injection functions

2024-06-02 Thread Wei Yang
>https://lore.kernel.org/all/3j5d3p22ssv7xoaghzraa7crcfih3h2qqjlhmjppbp6f42pg2t@kg7qoicog5ye/
>
>Signed-off-by: Vlastimil Babka 
>---
>Vlastimil Babka (4):
>  fault-inject: add support for static keys around fault injection sites
>  error-injection: support static keys around injectable functions
>  mm, slab: add static key for should_failslab()
>  mm, page_alloc: add static key for should_fail_alloc_page()
>
> include/asm-generic/error-injection.h | 13 ++-
> include/asm-generic/vmlinux.lds.h |  2 +-
> include/linux/error-injection.h   |  9 +---
> include/linux/fault-inject.h  |  7 +-
> kernel/fail_function.c| 22 +++---
> lib/error-inject.c|  6 -
> lib/fault-inject.c| 43 ++-
> mm/fail_page_alloc.c  |  3 ++-
> mm/failslab.c |  2 +-
> mm/internal.h |  2 ++
> mm/page_alloc.c   | 11 ++---
> mm/slab.h |  3 +++
> mm/slub.c | 10 +---
> 13 files changed, 114 insertions(+), 19 deletions(-)
>---
>base-commit: 1613e604df0cd359cf2a7fbd9be7a0bcfacfabd0
>change-id: 20240530-fault-injection-statickeys-66b7222e91b7
>
>Best regards,
>-- 
>Vlastimil Babka 
>

-- 
Wei Yang
Help you, Help me



Re: [PATCH v1 09/29] virtio-mem: don't always trigger the workqueue when offlining memory

2020-10-19 Thread Wei Yang
On Mon, Oct 19, 2020 at 11:04:40AM +0200, David Hildenbrand wrote:
>On 18.10.20 05:57, Wei Yang wrote:
>> On Fri, Oct 16, 2020 at 11:18:39AM +0200, David Hildenbrand wrote:
>>> On 16.10.20 06:03, Wei Yang wrote:
>>>> On Mon, Oct 12, 2020 at 02:53:03PM +0200, David Hildenbrand wrote:
>>>>> Let's trigger from offlining code when we're not allowed to touch online
>> 
>> Here "touch" means "unplug"? If so, maybe s/touch/unplug/ would be more easy
>> to understand.
>
>Yes, much better.
>
>[...]
>
>> I am trying to get a better understanding of the logic of virtio_mem_retry().
>> 
>> The current logic seems clear to me. There are four places that trigger it:
>> 
>> * notify_offline
>> * notify_online
>> * timer_expired
>> * config_changed
>> 
>> In this patch, we try to optimize the first case, notify_offline.
>
>Yes.
>
>> 
>> Now, we always trigger a retry when one of our memory blocks gets offlined.
>> Per my understanding, this logic is correct but misses one case (or, more
>> precisely, does not handle one case in a timely manner). The case this patch
>> wants to improve is virtio_mem_mb_remove(), if my understanding is correct.
>> 
>
>Yes, that's one part of it. Read below.
>
>>    virtio_mem_run_wq()
>>        virtio_mem_unplug_request()
>>            virtio_mem_mb_unplug_any_sb_offline()
>>                virtio_mem_mb_remove() --- 1
>>            virtio_mem_mb_unplug_any_sb_online()
>>                virtio_mem_mb_offline_and_remove() --- 2
>> 
>> The above are the two functions this patch adjusts. For 2), it offlines the
>> memory block and thus triggers virtio_mem_retry() as before. But for 1), the
>> memory block is already offline, so virtio_mem_retry() would not be triggered
>> originally. This is the case we want to improve in this patch: instead of
>> waiting for the timer to expire, we trigger a retry immediately after
>> unplugging/removing an offlined memory block.
>> 
>> And with this change, the patch still adjusts the original
>> virtio_mem_notify_offline() path to only trigger virtio_mem_retry() when
>> unplug_online is false. (This means the offline event was initiated from user
>> space instead of by an unplug request.)
>> 
>> If my above analysis is correct, I have one small suggestion for this patch.
>> Instead of adjusting the current notify_offline handling, how about just
>> triggering the retry in virtio_mem_mb_remove()? Per my understanding, we only
>> want an immediate retry when unplugging an already-offline memory block.
>
>I probably should have added the following to the patch description:
>
>"This is a preparation for Big Block Mode (BBM), whereby we can see some
>temporary offlining of memory blocks without actually making progress"
>
>Imagine you have a Big Block that spans two Linux memory blocks. Assume
>the first Linux memory block has no unmovable data on it.
>
>Assume you call offline_and_remove_memory()
>
>1. Try to offline the first block. Works, notifiers triggered.
>virtio_mem_retry().

After this patch, virtio_mem_retry() is removed here.

>2. Try to offline the second block. Does not work.
>3. Re-online first block.
>4. Exit to main loop, exit workqueue.

Since offline_and_remove_memory() doesn't succeed, virtio_mem_retry() is not
triggered.

>5. Retry immediately (due to virtio_mem_retry()), go to 1.

So we won't have an endless loop.

>
>So, you'll keep retrying forever. Found while debugging that exact issue :)
>

If this is the case, my suggestion is to record it in the changelog.
Otherwise, we may lose track of this corner case, which is important to this change.

>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 29/29] virtio-mem: Big Block Mode (BBM) - safe memory hotunplug

2020-10-19 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:23PM +0200, David Hildenbrand wrote:
>Let's add a safe mechanism to unplug memory, avoiding long/endless loops
>when trying to offline memory - similar to in SBM.
>
>Fake-offline all memory (via alloc_contig_range()) before trying to
>offline+remove it. Use this mode as default, but allow to enable the other
>mode explicitly (which could give better memory hotunplug guarantees in
>some environments).
>
>The "unsafe" mode can be enabled e.g., via virtio_mem.bbm_safe_unplug=0
>on the cmdline.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: Wei Yang 
>Cc: Andrew Morton 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 29/29] virtio-mem: Big Block Mode (BBM) - safe memory hotunplug

2020-10-19 Thread Wei Yang
On Mon, Oct 19, 2020 at 10:50:39AM +0200, David Hildenbrand wrote:
>On 19.10.20 09:54, Wei Yang wrote:
>> On Mon, Oct 12, 2020 at 02:53:23PM +0200, David Hildenbrand wrote:
>>> Let's add a safe mechanism to unplug memory, avoiding long/endless loops
>>> when trying to offline memory - similar to in SBM.
>>>
>>> Fake-offline all memory (via alloc_contig_range()) before trying to
>>> offline+remove it. Use this mode as default, but allow to enable the other
>>> mode explicitly (which could give better memory hotunplug guarantees in
>> 
>> I don't get how the unsafe mode would give better guarantees?
>
>It's primarily only relevant when there is a lot of concurrent action
>going on while unplugging memory. Using alloc_contig_range() on
>ZONE_MOVABLE can fail more easily than memory offlining.
>
>alloc_contig_range() doesn't try as hard as memory offlining code to
>isolate memory. There are known issues with temporary page pinning
>(e.g., when a process dies) and the PCP. (mostly discovered via CMA
>allocations)
>
>See the TODO I add in patch #14.
>
>[...]
>>>
>>> +   if (bbm_safe_unplug) {
>>> +   /*
>>> +* Start by fake-offlining all memory. Once we marked the device
>>> +* block as fake-offline, all newly onlined memory will
>>> +* automatically be kept fake-offline. Protect from concurrent
>>> +* onlining/offlining until we have a consistent state.
>>> +*/
>>> +   mutex_lock(&vm->hotplug_mutex);
>>> +   virtio_mem_bbm_set_bb_state(vm, bb_id,
>>> +   VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
>>> +
>> 
>> State is set here.
>> 
>>> +   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>>> +   page = pfn_to_online_page(pfn);
>>> +   if (!page)
>>> +   continue;
>>> +
>>> +   rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
>>> +   if (rc) {
>>> +   end_pfn = pfn;
>>> +   goto rollback_safe_unplug;
>>> +   }
>>> +   }
>>> +   mutex_unlock(&vm->hotplug_mutex);
>>> +   }
>>> +
>>> rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
>>> -   if (rc)
>>> +   if (rc) {
>>> +   if (bbm_safe_unplug) {
>>> +   mutex_lock(&vm->hotplug_mutex);
>>> +   goto rollback_safe_unplug;
>>> +   }
>>> return rc;
>>> +   }
>>>
>>> rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
>>> if (rc)
>> 
>> And changed to PLUGGED or UNUSED based on rc.
>
>Right, after offlining+remove succeeded. So no longer added to Linux.
>
>The final state depends on the success of the unplug request towards the
>hypervisor.
>
>> 
>>> @@ -1987,6 +2069,17 @@ static int 
>>> virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
>>> virtio_mem_bbm_set_bb_state(vm, bb_id,
>>> VIRTIO_MEM_BBM_BB_UNUSED);
>>> return rc;
>>> +
>>> +rollback_safe_unplug:
>>> +   for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>>> +   page = pfn_to_online_page(pfn);
>>> +   if (!page)
>>> +   continue;
>>> +   virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
>>> +   }
>>> +   virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
>> 
>> And changed to ADDED if failed.
>
>Right, back to the initial state when entering this function.
>
>> 
>>> +   mutex_unlock(&vm->hotplug_mutex);
>>> +   return rc;
>>> }
>> 
>> So in which case is the BBM state FAKE_OFFLINE during
>> virtio_mem_bbm_notify_going_offline() and
>> virtio_mem_bbm_notify_cancel_offline()?
>
>Exactly, so we can do our magic with fake-offline pages and our
>virtio_mem_bbm_offline_and_remove_bb() can actually succeed.

Ah, my fault. The exact code flow is this:

virtio_mem_bbm_offline_remove_and_unplug_bb()
    virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
    virtio_mem_fake_offline(pfn, PAGES_PER_SECTION)
    virtio_mem_bbm_offline_and_remove_bb(vm, bb_id)
        offline and trigger memory notification  --- 1)

The notification is necessary at 1) to release the refcount, which is grabbed
during fake offline.
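
For reference, a condensed sketch of that pairing (helper names are taken from
the quoted patches; the body below is simplified for illustration and is not
the exact driver code):

/*
 * Simplified sketch: pages taken out of the buddy via
 * virtio_mem_fake_offline() end up with a refcount of 1. When the
 * MEM_GOING_OFFLINE notifier later fires for a FAKE_OFFLINE big block,
 * that reference is dropped so mm-core can actually offline the range
 * (this mirrors virtio_mem_fake_offline_going_offline() from patch #13).
 */
static void fake_offline_going_offline_sketch(unsigned long pfn,
					      unsigned long nr_pages)
{
	unsigned long i;

	/* add the unplugged pages back to the managed page counters ... */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
	/* ... and drop our reference so offlining can proceed */
	for (i = 0; i < nr_pages; i++)
		if (WARN_ON(!page_ref_dec_and_test(pfn_to_page(pfn + i))))
			dump_page(pfn_to_page(pfn + i), "page still referenced");
}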

>
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 29/29] virtio-mem: Big Block Mode (BBM) - safe memory hotunplug

2020-10-19 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:23PM +0200, David Hildenbrand wrote:
>Let's add a safe mechanism to unplug memory, avoiding long/endless loops
>when trying to offline memory - similar to in SBM.
>
>Fake-offline all memory (via alloc_contig_range()) before trying to
>offline+remove it. Use this mode as default, but allow to enable the other
>mode explicitly (which could give better memory hotunplug guarantees in

I don't get how the unsafe mode would give better guarantees?

>some environments).
>
>The "unsafe" mode can be enabled e.g., via virtio_mem.bbm_safe_unplug=0
>on the cmdline.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: Wei Yang 
>Cc: Andrew Morton 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 97 -
> 1 file changed, 95 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 6bcd0acbff32..09f11489be6f 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -37,6 +37,11 @@ module_param(bbm_block_size, ulong, 0444);
> MODULE_PARM_DESC(bbm_block_size,
>"Big Block size in bytes. Default is 0 (auto-detection).");
> 
>+static bool bbm_safe_unplug = true;
>+module_param(bbm_safe_unplug, bool, 0444);
>+MODULE_PARM_DESC(bbm_safe_unplug,
>+   "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
>+
> /*
>  * virtio-mem currently supports the following modes of operation:
>  *
>@@ -87,6 +92,8 @@ enum virtio_mem_bbm_bb_state {
>   VIRTIO_MEM_BBM_BB_PLUGGED,
>   /* Plugged and added to Linux. */
>   VIRTIO_MEM_BBM_BB_ADDED,
>+  /* All online parts are fake-offline, ready to remove. */
>+  VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
>   VIRTIO_MEM_BBM_BB_COUNT
> };
> 
>@@ -889,6 +896,32 @@ static void virtio_mem_sbm_notify_cancel_offline(struct 
>virtio_mem *vm,
>   }
> }
> 
>+static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
>+  unsigned long bb_id,
>+  unsigned long pfn,
>+  unsigned long nr_pages)
>+{
>+  /*
>+   * When marked as "fake-offline", all online memory of this device block
>+   * is allocated by us. Otherwise, we don't have any memory allocated.
>+   */
>+  if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
>+  VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
>+  return;
>+  virtio_mem_fake_offline_going_offline(pfn, nr_pages);
>+}
>+
>+static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
>+   unsigned long bb_id,
>+   unsigned long pfn,
>+   unsigned long nr_pages)
>+{
>+  if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
>+  VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
>+  return;
>+  virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
>+}
>+
> /*
>  * This callback will either be called synchronously from add_memory() or
>  * asynchronously (e.g., triggered via user space). We have to be careful
>@@ -949,6 +982,10 @@ static int virtio_mem_memory_notifier_cb(struct 
>notifier_block *nb,
>   vm->hotplug_active = true;
>   if (vm->in_sbm)
>   virtio_mem_sbm_notify_going_offline(vm, id);
>+  else
>+  virtio_mem_bbm_notify_going_offline(vm, id,
>+  mhp->start_pfn,
>+  mhp->nr_pages);
>   break;
>   case MEM_GOING_ONLINE:
>   mutex_lock(&vm->hotplug_mutex);
>@@ -999,6 +1036,10 @@ static int virtio_mem_memory_notifier_cb(struct 
>notifier_block *nb,
>   break;
>   if (vm->in_sbm)
>   virtio_mem_sbm_notify_cancel_offline(vm, id);
>+  else
>+  virtio_mem_bbm_notify_cancel_offline(vm, id,
>+   mhp->start_pfn,
>+   mhp->nr_pages);
>   vm->hotplug_active = false;
>   mutex_unlock(&vm->hotplug_mutex);
>   break;
>@@ -1189,7 +1230,13 @@ static void virtio_mem_online_page_cb(struct page 
>*page, unsigned int order)
>   do_online = virtio_mem_sbm_te

Re: [PATCH v1 28/29] virtio-mem: Big Block Mode (BBM) - basic memory hotunplug

2020-10-18 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:22PM +0200, David Hildenbrand wrote:
>Let's try to unplug completely offline big blocks first. Then, (if
>enabled via unplug_offline) try to offline and remove whole big blocks.
>
>No locking necessary - we can deal with concurrent onlining/offlining
>just fine.
>
>Note1: This is sub-optimal and might be dangerous in some environments: we
>could end up in an infinite loop when offlining (e.g., long-term pinnings),
>similar as with DIMMs. We'll introduce safe memory hotunplug via
>fake-offlining next, and use this basic mode only when explicitly enabled.
>
>Note2: Without ZONE_MOVABLE, memory unplug will be extremely unreliable
>with bigger block sizes.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: Wei Yang 
>Cc: Andrew Morton 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 156 +++-
> 1 file changed, 155 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 94cf44b15cbf..6bcd0acbff32 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -388,6 +388,12 @@ static int 
>virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
>_bb_id++) \
>   if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
> 
>+#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
>+  for (_bb_id = vm->bbm.next_bb_id - 1; \
>+   _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
>+   _bb_id--) \
>+  if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
>+
> /*
>  * Set the state of a memory block, taking care of the state counter.
>  */
>@@ -685,6 +691,18 @@ static int virtio_mem_sbm_remove_mb(struct virtio_mem 
>*vm, unsigned long mb_id)
>   return virtio_mem_remove_memory(vm, addr, size);
> }
> 
>+/*
>+ * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks 
>covered
>+ * by the big block.
>+ */
>+static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long 
>bb_id)
>+{
>+  const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
>+  const uint64_t size = vm->bbm.bb_size;
>+
>+  return virtio_mem_remove_memory(vm, addr, size);
>+}
>+
> /*
>  * Try offlining and removing memory from Linux.
>  *
>@@ -731,6 +749,19 @@ static int virtio_mem_sbm_offline_and_remove_mb(struct 
>virtio_mem *vm,
>   return virtio_mem_offline_and_remove_memory(vm, addr, size);
> }
> 
>+/*
>+ * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a
>+ * all Linux memory blocks covered by the big block.
>+ */
>+static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
>+  unsigned long bb_id)
>+{
>+  const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
>+  const uint64_t size = vm->bbm.bb_size;
>+
>+  return virtio_mem_offline_and_remove_memory(vm, addr, size);
>+}
>+
> /*
>  * Trigger the workqueue so the device can perform its magic.
>  */
>@@ -1928,6 +1959,129 @@ static int virtio_mem_sbm_unplug_request(struct 
>virtio_mem *vm, uint64_t diff)
>   return rc;
> }
> 
>+/*
>+ * Try to offline and remove a big block from Linux and unplug it. Will fail
>+ * with -EBUSY if some memory is busy and cannot get unplugged.
>+ *
>+ * Will modify the state of the memory block. Might temporarily drop the
>+ * hotplug_mutex.
>+ */
>+static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
>+ unsigned long bb_id)
>+{
>+  int rc;
>+
>+  if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
>+   VIRTIO_MEM_BBM_BB_ADDED))
>+  return -EINVAL;
>+
>+  rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
>+  if (rc)
>+  return rc;
>+
>+  rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
>+  if (rc)
>+  virtio_mem_bbm_set_bb_state(vm, bb_id,
>+  VIRTIO_MEM_BBM_BB_PLUGGED);
>+  else
>+  virtio_mem_bbm_set_bb_state(vm, bb_id,
>+  VIRTIO_MEM_BBM_BB_UNUSED);
>+  return rc;
>+}
>+
>+/*
>+ * Try to remove a big block from Linux and unplug it. Will fail with
>+ * -EBUSY if some memory is online.
>+ *
>+ * Will modify the state of the memory block.
>+ */
>+static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm,
>+   

Re: [PATCH v1 27/29] mm/memory_hotplug: extend offline_and_remove_memory() to handle more than one memory block

2020-10-18 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:21PM +0200, David Hildenbrand wrote:
>virtio-mem soon wants to use offline_and_remove_memory() memory that
>exceeds a single Linux memory block (memory_block_size_bytes()). Let's
>remove that restriction.
>
>Let's remember the old state and try to restore that if anything goes
>wrong. While re-onlining can, in general, fail, it's highly unlikely to
>happen (usually only when a notifier fails to allocate memory, and these
>are rather rare).
>
>This will be used by virtio-mem to offline+remove memory ranges that are
>bigger than a single memory block - for example, with a device block
>size of 1 GiB (e.g., gigantic pages in the hypervisor) and a Linux memory
>block size of 128MB.
>
>While we could compress the state into 2 bit, using 8 bit is much
>easier.
>
>This handling is similar, but different to acpi_scan_try_to_offline():
>
>a) We don't try to offline twice. I am not sure if this CONFIG_MEMCG
>optimization is still relevant - it should only apply to ZONE_NORMAL
>(where we have no guarantees). If relevant, we can always add it.
>
>b) acpi_scan_try_to_offline() simply onlines all memory in case
>something goes wrong. It doesn't restore previous online type. Let's do
>that, so we won't overwrite what e.g., user space configured.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: Wei Yang 
>Cc: Andrew Morton 
>Signed-off-by: David Hildenbrand 

Looks good to me.

Reviewed-by: Wei Yang 

>---
> mm/memory_hotplug.c | 105 +---
> 1 file changed, 89 insertions(+), 16 deletions(-)
>
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index b44d4c7ba73b..217080ca93e5 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -1806,39 +1806,112 @@ int remove_memory(int nid, u64 start, u64 size)
> }
> EXPORT_SYMBOL_GPL(remove_memory);
> 
>+static int try_offline_memory_block(struct memory_block *mem, void *arg)
>+{
>+  uint8_t online_type = MMOP_ONLINE_KERNEL;
>+  uint8_t **online_types = arg;
>+  struct page *page;
>+  int rc;
>+
>+  /*
>+   * Sense the online_type via the zone of the memory block. Offlining
>+   * with multiple zones within one memory block will be rejected
>+   * by offlining code ... so we don't care about that.
>+   */
>+  page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
>+  if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
>+  online_type = MMOP_ONLINE_MOVABLE;
>+
>+  rc = device_offline(&mem->dev);
>+  /*
>+   * Default is MMOP_OFFLINE - change it only if offlining succeeded,
>+   * so try_reonline_memory_block() can do the right thing.
>+   */
>+  if (!rc)
>+  **online_types = online_type;
>+
>+  (*online_types)++;
>+  /* Ignore if already offline. */
>+  return rc < 0 ? rc : 0;
>+}
>+
>+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
>+{
>+  uint8_t **online_types = arg;
>+  int rc;
>+
>+  if (**online_types != MMOP_OFFLINE) {
>+  mem->online_type = **online_types;
>+  rc = device_online(&mem->dev);
>+  if (rc < 0)
>+  pr_warn("%s: Failed to re-online memory: %d",
>+  __func__, rc);
>+  }
>+
>+  /* Continue processing all remaining memory blocks. */
>+  (*online_types)++;
>+  return 0;
>+}
>+
> /*
>- * Try to offline and remove a memory block. Might take a long time to
>- * finish in case memory is still in use. Primarily useful for memory devices
>- * that logically unplugged all memory (so it's no longer in use) and want to
>- * offline + remove the memory block.
>+ * Try to offline and remove memory. Might take a long time to finish in case
>+ * memory is still in use. Primarily useful for memory devices that logically
>+ * unplugged all memory (so it's no longer in use) and want to offline + 
>remove
>+ * that memory.
>  */
> int offline_and_remove_memory(int nid, u64 start, u64 size)
> {
>-  struct memory_block *mem;
>-  int rc = -EINVAL;
>+  const unsigned long mb_count = size / memory_block_size_bytes();
>+  uint8_t *online_types, *tmp;
>+  int rc;
> 
>   if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
>-  size != memory_block_size_bytes())
>-  return rc;
>+  !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
>+  return -EINVAL;
>+
>+  /*
>+   * We'll remember the old online type of each memory block, so we can

Re: [PATCH v1 25/29] virtio-mem: Big Block Mode (BBM) memory hotplug

2020-10-18 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:19PM +0200, David Hildenbrand wrote:
>Currently, we do not support device block sizes that exceed the Linux
>memory block size. For example, having a device block size of 1 GiB (e.g.,
>gigantic pages in the hypervisor) won't work with 128 MiB Linux memory
>blocks.
>
>Let's implement Big Block Mode (BBM), whereby we add/remove at least
>one Linux memory block at a time. With a 1 GiB device block size, a Big
>Block (BB) will cover 8 Linux memory blocks.
>
>We'll keep registering the online_page_callback machinery, it will be used
>for safe memory hotunplug in BBM next.
>
>Note: BBM is properly prepared for variable-sized Linux memory
>blocks that we might see in the future. So we won't care how many Linux
>memory blocks a big block actually spans, and how the memory notifier is
>called.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: Wei Yang 
>Cc: Andrew Morton 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 484 ++--
> 1 file changed, 402 insertions(+), 82 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index e68d0d99590c..4d396ef98a92 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -30,12 +30,18 @@ MODULE_PARM_DESC(unplug_online, "Try to unplug online 
>memory");
> /*
>  * virtio-mem currently supports the following modes of operation:
>  *
>- * * Sub Block Mode (SBM): A Linux memory block spans 1..X subblocks (SB). The
>+ * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
>  *   size of a Sub Block (SB) is determined based on the device block size, 
> the
>  *   pageblock size, and the maximum allocation granularity of the buddy.
>  *   Subblocks within a Linux memory block might either be plugged or 
> unplugged.
>  *   Memory is added/removed to Linux MM in Linux memory block granularity.
>  *
>+ * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
>+ *   Memory is added/removed to Linux MM in Big Block granularity.
>+ *
>+ * The mode is determined automatically based on the Linux memory block size
>+ * and the device block size.
>+ *
>  * User space / core MM (auto onlining) is responsible for onlining added
>  * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
>  * always onlined separately, and all memory within a Linux memory block is
>@@ -61,6 +67,19 @@ enum virtio_mem_sbm_mb_state {
>   VIRTIO_MEM_SBM_MB_COUNT
> };
> 
>+/*
>+ * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
>+ */
>+enum virtio_mem_bbm_bb_state {
>+  /* Unplugged, not added to Linux. Can be reused later. */
>+  VIRTIO_MEM_BBM_BB_UNUSED = 0,
>+  /* Plugged, not added to Linux. Error on add_memory(). */
>+  VIRTIO_MEM_BBM_BB_PLUGGED,
>+  /* Plugged and added to Linux. */
>+  VIRTIO_MEM_BBM_BB_ADDED,
>+  VIRTIO_MEM_BBM_BB_COUNT
>+};
>+
> struct virtio_mem {
>   struct virtio_device *vdev;
> 
>@@ -113,6 +132,9 @@ struct virtio_mem {
>   atomic64_t offline_size;
>   uint64_t offline_threshold;
> 
>+  /* If set, the driver is in SBM, otherwise in BBM. */
>+  bool in_sbm;
>+
>   struct {
>   /* Id of the first memory block of this device. */
>   unsigned long first_mb_id;
>@@ -151,9 +173,27 @@ struct virtio_mem {
>   unsigned long *sb_states;
>   } sbm;
> 
>+  struct {
>+  /* Id of the first big block of this device. */
>+  unsigned long first_bb_id;
>+  /* Id of the last usable big block of this device. */
>+  unsigned long last_usable_bb_id;
>+  /* Id of the next device bock to prepare when needed. */
>+  unsigned long next_bb_id;
>+
>+  /* Summary of all big block states. */
>+  unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
>+
>+  /* One byte state per big block. See sbm.mb_states. */
>+  uint8_t *bb_states;
>+
>+  /* The block size used for (un)plugged, adding/removing. */
>+  uint64_t bb_size;
>+  } bbm;
>+
>   /*
>-   * Mutex that protects the sbm.mb_count, sbm.mb_states, and
>-   * sbm.sb_states.
>+   * Mutex that protects the sbm.mb_count, sbm.mb_states,
>+   * sbm.sb_states, bbm.bb_count, and bbm.bb_states
>*
>* When this lock is held the pointers can't change, ONLINE and
>* OFFLINE blocks can't change the state and no subblocks will get
>@@ -247,6 +2

Re: [PATCH v1 21/29] virtio-mem: memory notifier callbacks are specific to Sub Block Mode (SBM)

2020-10-18 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:15PM +0200, David Hildenbrand wrote:
>Let's rename accordingly.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 29 +++--
> 1 file changed, 15 insertions(+), 14 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 3a772714fec9..d06c8760b337 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -589,8 +589,8 @@ static bool virtio_mem_contains_range(struct virtio_mem 
>*vm, uint64_t start,
>   return start >= vm->addr && start + size <= vm->addr + vm->region_size;
> }
> 
>-static int virtio_mem_notify_going_online(struct virtio_mem *vm,
>-unsigned long mb_id)
>+static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
>+unsigned long mb_id)

Looking at this patch together with "virtio-mem: Big Block Mode (BBM) memory
hotplug", I think the code is a little "complex".

The final logic of virtio_mem_memory_notifier_cb() looks like this:

virtio_mem_memory_notifier_cb()
    if (vm->in_sbm)
        notify_xxx()
    if (vm->in_sbm)
        notify_xxx()

Can we adjust this like

virtio_mem_memory_notifier_cb()
    notify_xxx()
    if (vm->in_sbm)
        return
    notify_xxx()
    if (vm->in_sbm)
        return

This style looks a little better to me.
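
One way to read this suggestion (just a sketch of the shape I have in mind,
not tested code; the sbm/bbm helpers are the ones introduced by the patches)
is a per-event wrapper, so the notifier callback itself stays flat:

/* Hypothetical per-event wrapper that dispatches on the mode once: */
static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
					    unsigned long id,
					    unsigned long start_pfn,
					    unsigned long nr_pages)
{
	if (vm->in_sbm) {
		virtio_mem_sbm_notify_going_offline(vm, id);
		return;
	}
	virtio_mem_bbm_notify_going_offline(vm, id, start_pfn, nr_pages);
}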


-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 00/29] virtio-mem: Big Block Mode (BBM)

2020-10-18 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:54PM +0200, David Hildenbrand wrote:
>virtio-mem currently only supports device block sizes that span at most
>a single Linux memory block. For example, gigantic pages in the hypervisor
>result on x86-64 in a device block size of 1 GiB - when the Linux memory
>block size is 128 MiB, we cannot support such devices (we fail loading the
>driver). Of course, we want to support any device block size in any Linux
>VM.
>
>Bigger device block sizes will become especially important once supporting
>VFIO in QEMU - each device block has to be mapped separately, and the
>maximum number of mappings for VFIO is 64k. So we usually want blocks in
>the gigabyte range when wanting to grow the VM big.
>
>This series:
>- Performs some cleanups
>- Factors out existing Sub Block Mode (SBM)
>- Implements memory hot(un)plug in Big Block Mode (BBM)
>
>I need one core-mm change, to make offline_and_remove_memory() eat bigger
>chunks.
>
>This series is based on "next-20201009" and can be found at:
>   g...@gitlab.com:virtio-mem/linux.git virtio-mem-dbm-v1
>

I am trying to apply this patch set, but found I can't 'git fetch' this
repo. Is there another repo from which I could apply this patch set?

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 20/29] virtio-mem: nb_sb_per_mb and subblock_size are specific to Sub Block Mode (SBM)

2020-10-18 Thread Wei Yang
On Fri, Oct 16, 2020 at 03:17:06PM +0200, David Hildenbrand wrote:
>On 16.10.20 10:53, Wei Yang wrote:
>> On Mon, Oct 12, 2020 at 02:53:14PM +0200, David Hildenbrand wrote:
>>> Let's rename to "sbs_per_mb" and "sb_size" and move accordingly.
>>>
>>> Cc: "Michael S. Tsirkin" 
>>> Cc: Jason Wang 
>>> Cc: Pankaj Gupta 
>>> Signed-off-by: David Hildenbrand 
>> 
>> One trivial suggestion: could we move this patch closer to the data
>> structure movement patch?
>> 
>> I know this would be some work, since you have changed some of the code
>> logic, and rebasing would take you some time.
>
>You mean after patch #17 ?

Yes

>
>I guess I can move patch #18 (prereq) a little further up (e.g., after
>patch #15). Guess moving it in front of #19 shouldn't be too hard.
>
>Will give it a try - if it takes too much effort, I'll leave it like this.
>

Not a big deal, but it would make the change more self-contained to me.

This is a big patch set to me. If it could be split into two parts, e.g., bug
fixes/logic improvements and the BBM implementation, it would be friendlier to
review.

>Thanks!
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 13/29] virtio-mem: factor out handling of fake-offline pages in memory notifier

2020-10-18 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:07PM +0200, David Hildenbrand wrote:
>Let's factor out the core pieces and place the implementation next to
>virtio_mem_fake_offline(). We'll reuse this functionality soon.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 73 +
> 1 file changed, 50 insertions(+), 23 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index d132bc54ef57..a2124892e510 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -168,6 +168,10 @@ static LIST_HEAD(virtio_mem_devices);
> 
> static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
> static void virtio_mem_retry(struct virtio_mem *vm);
>+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
>+unsigned long nr_pages);
>+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
>+ unsigned long nr_pages);
> 
> /*
>  * Register a virtio-mem device so it will be considered for the online_page
>@@ -604,27 +608,15 @@ static void virtio_mem_notify_going_offline(struct 
>virtio_mem *vm,
>   unsigned long mb_id)
> {
>   const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
>-  struct page *page;
>   unsigned long pfn;
>-  int sb_id, i;
>+  int sb_id;
> 
>   for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
>   if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
>   continue;
>-  /*
>-   * Drop our reference to the pages so the memory can get
>-   * offlined and add the unplugged pages to the managed
>-   * page counters (so offlining code can correctly subtract
>-   * them again).
>-   */
>   pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
>  sb_id * vm->subblock_size);
>-  adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
>-  for (i = 0; i < nr_pages; i++) {
>-  page = pfn_to_page(pfn + i);
>-  if (WARN_ON(!page_ref_dec_and_test(page)))
>-  dump_page(page, "unplugged page referenced");
>-  }
>+  virtio_mem_fake_offline_going_offline(pfn, nr_pages);
>   }
> }
> 
>@@ -633,21 +625,14 @@ static void virtio_mem_notify_cancel_offline(struct 
>virtio_mem *vm,
> {
>   const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
>   unsigned long pfn;
>-  int sb_id, i;
>+  int sb_id;
> 
>   for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
>   if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
>   continue;
>-  /*
>-   * Get the reference we dropped when going offline and
>-   * subtract the unplugged pages from the managed page
>-   * counters.
>-   */
>   pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
>  sb_id * vm->subblock_size);
>-  adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
>-  for (i = 0; i < nr_pages; i++)
>-  page_ref_inc(pfn_to_page(pfn + i));
>+  virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
>   }
> }
> 
>@@ -853,6 +838,48 @@ static int virtio_mem_fake_offline(unsigned long pfn, 
>unsigned long nr_pages)
>   return 0;
> }
> 
>+/*
>+ * Handle fake-offline pages when memory is going offline - such that the
>+ * pages can be skipped by mm-core when offlining.
>+ */
>+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
>+unsigned long nr_pages)
>+{
>+  struct page *page;
>+  unsigned long i;
>+
>+  /*
>+   * Drop our reference to the pages so the memory can get offlined
>+   * and add the unplugged pages to the managed page counters (so
>+   * offlining code can correctly subtract them again).
>+   */
>+  adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
>+  /* Drop our reference to the pages so the memory can get offlined. */
>+  for (i = 0; i < nr_pages; i++) {
>+  page = pfn_to_page(pfn + i);
>+  if (WARN_ON(!page_ref_dec_and_test(page)))
>+  dump_page(page, "fake-offline page 

Re: [PATCH v1 13/29] virtio-mem: factor out handling of fake-offline pages in memory notifier

2020-10-18 Thread Wei Yang
On Fri, Oct 16, 2020 at 10:57:35AM +0200, David Hildenbrand wrote:
>>> Do we adjust the count twice?
>>>
>> 
>> Ah, I got the reason why we need to adjust count for *unplugged* sub-blocks.
>
>Exactly.
>
>> 
>>>> -  for (i = 0; i < nr_pages; i++) {
>>>> -  page = pfn_to_page(pfn + i);
>>>> -  if (WARN_ON(!page_ref_dec_and_test(page)))
>> 
>> Another question is: when do we grab a refcount for the unplugged pages? The
>> one you mentioned in virtio_mem_set_fake_offline().
>
>Yeah, that was confusing on my side. I actually meant
>virtio_mem_fake_offline() - patch #12.
>
>We have a reference on unplugged (fake offline) blocks via
>
>1. memmap initialization, if never online via generic_online_page()
>
>So if we keep pages fake offline when onlining memory, they
>
>a) Have a refcount of 1
>b) Have *not* increased the managed page count
>
>2. alloc_contig_range(), if fake offlined. After we fake-offlined pages
>(e.g., patch #12), such pages
>
>a) Have a refcount of 1
>b) Have *not* increased the managed page count (because we manually
>decreased it)
>

Yep, I got the reason now.

>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 05/29] virtio-mem: generalize check for added memory

2020-10-18 Thread Wei Yang
On Sat, Oct 17, 2020 at 09:39:38AM +0200, David Hildenbrand wrote:
>
>> On 17.10.2020 at 00:38, Wei Yang wrote:
>> 
>> On Fri, Oct 16, 2020 at 12:32:50PM +0200, David Hildenbrand wrote:
>>>>>> OK, I seem to understand the logic now.
>>>>>> 
>>>>>> But how do we prevent an ONLINE_PARTIAL memory block from getting offlined?
>>>>>> There are three calls in virtio_mem_set_fake_offline(), and all of them only
>>>>>> adjust the page's flags. How do they hold a reference to the struct page?
>>>>> 
>>>>> Sorry, I should have given you the right pointer. (similar to my other
>>>>> reply)
>>>>> 
>>>>> We hold a reference either via
>>>>> 
>>>>> 1. alloc_contig_range()
>>>> 
>>>> I am not familiar with this one, need to spend some time to look into it.
>>> 
>>> Each individual page will have a pagecount of 1.
>>> 
>>>> 
>>>>> 2. memmap init code, when not calling generic_online_page().
>>>> 
>>>> I may be missing some code here. Before onlining pages, memmaps are allocated
>>>> in section_activate(). They are supposed to be zeroed. (I didn't find the
>>>> exact code line.) I am not sure when we grab a refcount here.
>>> 
>>> Best to refer to __init_single_page() -> init_page_count().
>>> 
>>> Each page that wasn't onlined via generic_online_page() has a refcount
>>> of 1 and looks like allocated.
>>> 
>> 
>> Thanks, I see the logic.
>> 
>>    online_pages()
>>        move_pfn_range_to_zone()  --- 1)
>>        online_pages_range()  --- 2)
>> 
>> At 1), __init_single_page() would set the page count to 1. At 2),
>> generic_online_page() would clear the page count, while the callback would not.
>> 
>> Then I tried to find the place where a non-zero page count prevents
>> offlining: scan_movable_pages() would fail, since such a page is
>> PageOffline() and has a page count of 1.
>> 
>> So the guard that prevents offlining partially-onlined pages is
>> 
>>    (PageOffline && page_count)
>> 
>> And your commit aa218795cb5fd583c94f
>> 
>> mm: Allow to offline unmovable PageOffline() pages via MEM_GOING_OFFLINE
>> 
>> was introduced to handle this case.
>> 
>> That's pretty clear now.
>> 
>
>I'm happy to see that I am no longer the only person that understands all this 
>magic :)

Thanks for sharing the magic :-)

>
>Thanks for having a look / reviewing!
>
>>> -- 
>>> Thanks,
>>> 
>>> David / dhildenb
>> 
>> -- 
>> Wei Yang
>> Help you, Help me
>> 

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 09/29] virtio-mem: don't always trigger the workqueue when offlining memory

2020-10-17 Thread Wei Yang
On Fri, Oct 16, 2020 at 11:18:39AM +0200, David Hildenbrand wrote:
>On 16.10.20 06:03, Wei Yang wrote:
>> On Mon, Oct 12, 2020 at 02:53:03PM +0200, David Hildenbrand wrote:
>>> Let's trigger from offlining code when we're not allowed to touch online

Here "touch" means "unplug"? If so, maybe s/touch/unplug/ would be more easy
to understand.

>>> memory.
>> 
>> This describes the change in virtio_mem_memory_notifier_cb()?
>
>Ah, yes, can try to make that clearer.
>
>> 
>>>
>>> Handle the other case (memmap possibly freeing up another memory block)
>>> when actually removing memory. When removing via virtio_mem_remove(),
>>> virtio_mem_retry() is a NOP and safe to use.
>>>
>>> While at it, move retry handling when offlining out of
>>> virtio_mem_notify_offline(), to share it with Device Block Mode (DBM)
>>> soon.
>> 
>> I may not understand the logic fully. Here is my understanding of current
>> logic:
>> 
>> 
>>    virtio_mem_run_wq()
>>        virtio_mem_unplug_request()
>>            virtio_mem_mb_unplug_any_sb_offline()
>>                virtio_mem_mb_remove() --- 1
>>            virtio_mem_mb_unplug_any_sb_online()
>>                virtio_mem_mb_offline_and_remove() --- 2
>> 

I am trying to get a better understanding of the logic of virtio_mem_retry().

The current logic seems clear to me. There are four places that trigger it:

* notify_offline
* notify_online
* timer_expired
* config_changed

In this patch, we try to optimize the first case, notify_offline.

Now, we always trigger a retry when one of our memory blocks gets offlined.
Per my understanding, this logic is correct but misses one case (or, more
precisely, does not handle one case in a timely manner). The case this patch
wants to improve is virtio_mem_mb_remove(), if my understanding is correct.

virtio_mem_run_wq()
    virtio_mem_unplug_request()
        virtio_mem_mb_unplug_any_sb_offline()
            virtio_mem_mb_remove() --- 1
        virtio_mem_mb_unplug_any_sb_online()
            virtio_mem_mb_offline_and_remove() --- 2

The above are the two functions this patch adjusts. For 2), it offlines the
memory block and thus triggers virtio_mem_retry() as before. But for 1), the
memory block is already offline, so virtio_mem_retry() would not be triggered
originally. This is the case we want to improve in this patch: instead of
waiting for the timer to expire, we trigger a retry immediately after
unplugging/removing an offlined memory block.

And with this change, the patch still adjusts the original
virtio_mem_notify_offline() path to only trigger virtio_mem_retry() when
unplug_online is false. (This means the offline event was initiated from user
space instead of by an unplug request.)

If my above analysis is correct, I have one small suggestion for this patch.
Instead of adjusting the current notify_offline handling, how about just
triggering the retry in virtio_mem_mb_remove()? Per my understanding, we only
want an immediate retry when unplugging an already-offline memory block.
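
Roughly, something like this is what I have in mind (an illustrative sketch
only, not the actual driver code; the virtio_mem_retry() call at the end is
the hypothetical addition):

static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	int rc;

	rc = remove_memory(vm->nid, addr, memory_block_size_bytes());
	if (!rc)
		/* an offline block was removed: kick the workqueue right away */
		virtio_mem_retry(vm);
	return rc;
}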

>> This patch tries to trigger the wq at 1 and 2. And these two functions are
>> only valid during this code flow.
>
>Exactly.
>
>> 
>> These two functions actually remove some memory from the system. So I am not
>> sure where the extra unpluggable memory comes from. I guess that memory is for
>> the memory block device and mem_section/memmap? And that memory is still
>> marked as online, right?
>
>Imagine you end up (only after some repeating plugging and unplugging of
>memory, otherwise it's obviously impossible):
>
>Memory block X: Contains only movable data
>
>Memory block X + 1: Contains memmap of Memory block X:
>
>
>We start to unplug from high, to low.
>
>1. Try to unplug/offline/remove block X + 1: fails, because of the
>   memmap
>2. Try to unplug/offline/remove block X: succeeds.
>3. Not all requested memory got unplugged. Sleep for 30 seconds.
>4. Retry to unplug/offline/remove block X + 1: succeeds
>
>What we do in 2, is that we trigger a retry of ourselves. That means,
>that in 3. we don't actually sleep, but retry immediately.
>
>This has been proven helpful in some of my tests, where you want to
>unplug *a lot* of memory again, not just some parts.
>
>
>Triggering a retry is fairly cheap. Assume you don't actually have to
>perform any more unplugging. The workqueue wakes up, detects that
>nothing is to do, and goes back to sleep.
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 05/29] virtio-mem: generalize check for added memory

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:59PM +0200, David Hildenbrand wrote:
>Let's check by traversing busy system RAM resources instead, to avoid
>relying on memory block states.
>
>Don't use walk_system_ram_range(), as that works on pages and we want to
>use the bare addresses we have easily at hand.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 19 +++
> 1 file changed, 15 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index b3eebac7191f..6bbd1cfd10d3 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -1749,6 +1749,20 @@ static void virtio_mem_delete_resource(struct 
>virtio_mem *vm)
>   vm->parent_resource = NULL;
> }
> 
>+static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
>+{
>+  return 1;
>+}
>+
>+static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
>+{
>+  const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>+
>+  return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
>+ vm->addr + vm->region_size, NULL,
>+ virtio_mem_range_has_system_ram) == 1;
>+}
>+
> static int virtio_mem_probe(struct virtio_device *vdev)
> {
>   struct virtio_mem *vm;
>@@ -1870,10 +1884,7 @@ static void virtio_mem_remove(struct virtio_device 
>*vdev)
>* the system. And there is no way to stop the driver/device from going
>* away. Warn at least.
>*/
>-  if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
>-  vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
>-  vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
>-  vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) {
>+  if (virtio_mem_has_memory_added(vm)) {
>   dev_warn(&vdev->dev, "device still has system memory added\n");
>   } else {
>   virtio_mem_delete_resource(vm);
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 05/29] virtio-mem: generalize check for added memory

2020-10-16 Thread Wei Yang
On Fri, Oct 16, 2020 at 12:32:50PM +0200, David Hildenbrand wrote:
>>>> OK, I seem to understand the logic now.
>>>>
>>>> But how do we prevent an ONLINE_PARTIAL memory block from getting offlined?
>>>> There are three calls in virtio_mem_set_fake_offline(), and all of them only
>>>> adjust the page's flags. How do they hold a reference to the struct page?
>>>
>>> Sorry, I should have given you the right pointer. (similar to my other
>>> reply)
>>>
>>> We hold a reference either via
>>>
>>> 1. alloc_contig_range()
>> 
>> I am not familiar with this one, need to spend some time to look into it.
>
>Each individual page will have a pagecount of 1.
>
>> 
>>> 2. memmap init code, when not calling generic_online_page().
>> 
>> I may be missing some code here. Before onlining pages, memmaps are allocated
>> in section_activate(). They are supposed to be zeroed. (I didn't find the
>> exact code line.) I am not sure when we grab a refcount here.
>
>Best to refer to __init_single_page() -> init_page_count().
>
>Each page that wasn't onlined via generic_online_page() has a refcount
>of 1 and looks like allocated.
>

Thanks, I see the logic.

online_pages()
    move_pfn_range_to_zone()  --- 1)
    online_pages_range()  --- 2)

At 1), __init_single_page() would set the page count to 1. At 2),
generic_online_page() would clear the page count, while the callback would not.

Then I tried to find the place where a non-zero page count prevents offlining:
scan_movable_pages() would fail, since such a page is PageOffline() and has a
page count of 1.

So the guard that prevents offlining partially-onlined pages is

    (PageOffline && page_count)

And your commit aa218795cb5fd583c94f

mm: Allow to offline unmovable PageOffline() pages via MEM_GOING_OFFLINE

was introduced to handle this case.

That's pretty clear now.
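
For reference, a condensed version of that guard (the real check lives in
scan_movable_pages() in mm/memory_hotplug.c; this is a simplified sketch, not
a verbatim copy):

/*
 * A PageOffline() page that still holds a reference (e.g., a fake-offline
 * page owned by virtio-mem) cannot be migrated, so offlining has to fail -
 * unless the owner drops that reference in its MEM_GOING_OFFLINE handler.
 */
static bool page_blocks_offlining(struct page *page)
{
	return PageOffline(page) && page_count(page);
}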

>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 05/29] virtio-mem: generalize check for added memory

2020-10-16 Thread Wei Yang
On Fri, Oct 16, 2020 at 11:11:24AM +0200, David Hildenbrand wrote:
>>> That's an interesting corner case. Assume you have a 128MB memory block
>>> but only 64MB are plugged.
>> 
>> Since we only plug part of the memory block, its state is OFFLINE_PARTIAL
>> first. But then we add this memory and online it, so the state of this
>> memory block becomes ONLINE_PARTIAL.
>> 
>> When is this state changed to OFFLINE_PARTIAL again?
>
>Please note that memory onlining is *completely* controllable by user
>space. User space can offline/online memory blocks as it wants. Not
>saying this might actually be the right thing to do - but we cannot
>trust that user space does the right thing.
>
>So at any point in time, you have to assume that
>
>a) added memory might not get onlined
>b) previously onlined memory might get offlined
>c) previously offline memory might get onlined
>
>> 
>>>
>>> As long as we have our online_pages callback in place, we can hinder the
>>> unplugged 64MB from getting exposed to the buddy
>>> (virtio_mem_online_page_cb()). However, once we unloaded the driver,
>> 
>> Yes,
>> 
>> virtio_mem_set_fake_offline() would __SetPageOffline() to those pages.
>> 
>>> this is no longer the case. If someone would online that memory block,
>>> we would expose unplugged memory to the buddy - very bad.
>>>
>> 
>> Per my understanding, at this point in time the memory block is in the online
>> state, even though part of it is set to *fake* offline.
>> 
>> So how could the user trigger another online from the sysfs interface?
>
>Assume we added a partially plugged memory block, which is now offline.
>Further assume user space did not online the memory block (e.g., no udev
>rules).
>
>User space could happily online the block after unloading the driver.
>Again, we have to assume user space could do crazy things.
>

You are right, onlining memory is not a forced behavior.

>> 
>>> So we have to remove these partially plugged, offline memory blocks when
>>> losing control over them.
>>>
>>> I tried to document that via:
>>>
>>> "After we unregistered our callbacks, user space can online partially
>>> plugged offline blocks. Make sure to remove them."
>>>
>>>>
>>>> Also, during virtio_mem_remove(), we just handle OFFLINE_PARTIAL memory 
>>>> block.
>>>> How about memory block in other states? It is not necessary to remove
>>>> ONLINE[_PARTIAL] memroy blocks?
>>>
>>> Blocks that are fully plugged (ONLINE or OFFLINE) can get
>>> onlined/offlined without us having to care. Works fine - we only have to
>>> care about partially plugged blocks.
>>>
>>> While we *could* unplug OFFLINE blocks, there is no way we can
>>> deterministically offline+remove ONLINE blocks. So that memory has to
>>> stay, even after we unloaded the driver (similar to the dax/kmem driver).
>> 
>> For OFFLINE memory blocks, wouldn't that leave us in the situation where the
>> guest doesn't need those pages, while the host still maps them?
>
>Yes, but the guest could online the memory and make use of it.
>
>(again, whoever decides to unload the driver better be knowing what he does)
>
>To do it even more cleanly, we would
>
>a) Have to remove completely plugged offline blocks (not done)
>b) Have to remove partially plugged offline blocks (done)
>c) Actually send unplug requests to the hypervisor
>
>Right now, only b) is done, because it might actually cause harm (as
>discussed). However, the problem is, that c) might actually fail.
>
>Long short: we could add a) if it turns out to be a real issue. But
>than, unloading the driver isn't really suggested, the current
>implementation just "keeps it working without crashes" - and I guess
>that's good enough for now.
>
>> 
>>>
>>> ONLINE_PARTIAL is already taken care of: it cannot get offlined anymore,
>>> as we still hold references to these struct pages
>>> (virtio_mem_set_fake_offline()), and as we no longer have the memory
>>> notifier in place, we can no longer agree to offline this memory (when
>>> going_offline).
>>>
>> 
>> OK, I seem to understand the logic now.
>> 
>> But how do we prevent an ONLINE_PARTIAL memory block from getting offlined?
>> There are three calls in virtio_mem_set_fake_offline(), and all of them only
>> adjust the page's flags. How do they hold a reference to the struct page?
>
>Sorry, I should have given you the right pointer. (similar to my other
>reply)
>
>We hold a reference either via
>
>1. alloc_contig_range()

I am not familiar with this one, need to spend some time to look into it.

>2. memmap init code, when not calling generic_online_page().

I may be missing some code here. Before onlining pages, memmaps are allocated
in section_activate(). They are supposed to be zeroed. (I didn't find the exact
code line.) I am not sure when we grab a refcount here.

>
>So these fake-offline pages can never be actually offlined, because we
>no longer have the memory notifier registered to fix that up.
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 25/29] virtio-mem: Big Block Mode (BBM) memory hotplug

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:19PM +0200, David Hildenbrand wrote:
>Currently, we do not support device block sizes that exceed the Linux
>memory block size. For example, having a device block size of 1 GiB (e.g.,
>gigantic pages in the hypervisor) won't work with 128 MiB Linux memory
>blocks.
>
>Let's implement Big Block Mode (BBM), whereby we add/remove at least
>one Linux memory block at a time. With a 1 GiB device block size, a Big
>Block (BB) will cover 8 Linux memory blocks.
>
>We'll keep registering the online_page_callback machinery, it will be used
>for safe memory hotunplug in BBM next.
>
>Note: BBM is properly prepared for variable-sized Linux memory
>blocks that we might see in the future. So we won't care how many Linux
>memory blocks a big block actually spans, and how the memory notifier is
>called.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Cc: Michal Hocko 
>Cc: Oscar Salvador 
>Cc: Wei Yang 
>Cc: Andrew Morton 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 484 ++--
> 1 file changed, 402 insertions(+), 82 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index e68d0d99590c..4d396ef98a92 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -30,12 +30,18 @@ MODULE_PARM_DESC(unplug_online, "Try to unplug online 
>memory");
> /*
>  * virtio-mem currently supports the following modes of operation:
>  *
>- * * Sub Block Mode (SBM): A Linux memory block spans 1..X subblocks (SB). The
>+ * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
>  *   size of a Sub Block (SB) is determined based on the device block size, 
> the
>  *   pageblock size, and the maximum allocation granularity of the buddy.
>  *   Subblocks within a Linux memory block might either be plugged or 
> unplugged.
>  *   Memory is added/removed to Linux MM in Linux memory block granularity.
>  *
>+ * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
>+ *   Memory is added/removed to Linux MM in Big Block granularity.
>+ *
>+ * The mode is determined automatically based on the Linux memory block size
>+ * and the device block size.
>+ *
>  * User space / core MM (auto onlining) is responsible for onlining added
>  * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
>  * always onlined separately, and all memory within a Linux memory block is
>@@ -61,6 +67,19 @@ enum virtio_mem_sbm_mb_state {
>   VIRTIO_MEM_SBM_MB_COUNT
> };
> 
>+/*
>+ * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
>+ */
>+enum virtio_mem_bbm_bb_state {
>+  /* Unplugged, not added to Linux. Can be reused later. */
>+  VIRTIO_MEM_BBM_BB_UNUSED = 0,
>+  /* Plugged, not added to Linux. Error on add_memory(). */
>+  VIRTIO_MEM_BBM_BB_PLUGGED,
>+  /* Plugged and added to Linux. */
>+  VIRTIO_MEM_BBM_BB_ADDED,
>+  VIRTIO_MEM_BBM_BB_COUNT
>+};
>+
> struct virtio_mem {
>   struct virtio_device *vdev;
> 
>@@ -113,6 +132,9 @@ struct virtio_mem {
>   atomic64_t offline_size;
>   uint64_t offline_threshold;
> 
>+  /* If set, the driver is in SBM, otherwise in BBM. */
>+  bool in_sbm;
>+
>   struct {
>   /* Id of the first memory block of this device. */
>   unsigned long first_mb_id;
>@@ -151,9 +173,27 @@ struct virtio_mem {
>   unsigned long *sb_states;
>   } sbm;
> 
>+  struct {
>+  /* Id of the first big block of this device. */
>+  unsigned long first_bb_id;
>+  /* Id of the last usable big block of this device. */
>+  unsigned long last_usable_bb_id;
>+  /* Id of the next device bock to prepare when needed. */
>+  unsigned long next_bb_id;
>+
>+  /* Summary of all big block states. */
>+  unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
>+
>+  /* One byte state per big block. See sbm.mb_states. */
>+  uint8_t *bb_states;
>+
>+  /* The block size used for (un)plugged, adding/removing. */
>+  uint64_t bb_size;
>+  } bbm;

Can we use a union here?
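
I.e., something along these lines (just a sketch of the idea, with the field
lists trimmed), since a device is either in SBM or in BBM (vm->in_sbm) and the
per-mode bookkeeping could share storage:

	/* Sketch only: anonymous union inside struct virtio_mem. */
	union {
		struct {
			unsigned long first_mb_id;
			/* ... remaining SBM fields ... */
		} sbm;
		struct {
			unsigned long first_bb_id;
			uint64_t bb_size;
			/* ... remaining BBM fields ... */
		} bbm;
	};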

>+
>   /*
>-   * Mutex that protects the sbm.mb_count, sbm.mb_states, and
>-   * sbm.sb_states.
>+   * Mutex that protects the sbm.mb_count, sbm.mb_states,
>+   * sbm.sb_states, bbm.bb_count, and bbm.bb_states
>*
>* When this lock is held the pointers can't change, ONLINE and
>* OFFLINE blocks can't change the state and no subblocks will get
>

Re: [PATCH v1 24/29] virtio-mem: print debug messages from virtio_mem_send_*_request()

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:18PM +0200, David Hildenbrand wrote:
>Let's move the existing dev_dbg() into the functions, print if something
>went wrong, and also print for virtio_mem_send_unplug_all_request().
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 50 ++---
> 1 file changed, 35 insertions(+), 15 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index eb2ad31a8d8a..e68d0d99590c 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -1053,23 +1053,33 @@ static int virtio_mem_send_plug_request(struct 
>virtio_mem *vm, uint64_t addr,
>   .u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
>   .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
>   };
>+  int rc = -ENOMEM;
> 
>   if (atomic_read(&vm->config_changed))
>   return -EAGAIN;
> 
>+  dev_dbg(>vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
>+  addr + size - 1);
>+
>   switch (virtio_mem_send_request(vm, &req)) {
>   case VIRTIO_MEM_RESP_ACK:
>   vm->plugged_size += size;
>   return 0;
>   case VIRTIO_MEM_RESP_NACK:
>-  return -EAGAIN;
>+  rc = -EAGAIN;
>+  break;
>   case VIRTIO_MEM_RESP_BUSY:
>-  return -ETXTBSY;
>+  rc = -ETXTBSY;
>+  break;
>   case VIRTIO_MEM_RESP_ERROR:
>-  return -EINVAL;
>+  rc = -EINVAL;
>+  break;
>   default:
>-  return -ENOMEM;
>+  break;
>   }
>+
>+  dev_dbg(>vdev->dev, "plugging memory failed: %d\n", rc);
>+  return rc;
> }
> 
> static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t 
> addr,
>@@ -1081,21 +1091,30 @@ static int virtio_mem_send_unplug_request(struct 
>virtio_mem *vm, uint64_t addr,
>   .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
>   .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
>   };
>+  int rc = -ENOMEM;
> 
>   if (atomic_read(&vm->config_changed))
>   return -EAGAIN;
> 
>+  dev_dbg(>vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
>+  addr + size - 1);
>+
>   switch (virtio_mem_send_request(vm, &req)) {
>   case VIRTIO_MEM_RESP_ACK:
>   vm->plugged_size -= size;
>   return 0;
>   case VIRTIO_MEM_RESP_BUSY:
>-  return -ETXTBSY;
>+  rc = -ETXTBSY;
>+  break;
>   case VIRTIO_MEM_RESP_ERROR:
>-  return -EINVAL;
>+  rc = -EINVAL;
>+  break;
>   default:
>-  return -ENOMEM;
>+  break;
>   }
>+
>+  dev_dbg(>vdev->dev, "unplugging memory failed: %d\n", rc);
>+  return rc;
> }
> 
> static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
>@@ -1103,6 +1122,9 @@ static int virtio_mem_send_unplug_all_request(struct 
>virtio_mem *vm)
>   const struct virtio_mem_req req = {
>   .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
>   };
>+  int rc = -ENOMEM;
>+
>+  dev_dbg(>vdev->dev, "unplugging all memory");
> 
>   switch (virtio_mem_send_request(vm, &req)) {
>   case VIRTIO_MEM_RESP_ACK:
>@@ -1112,10 +1134,14 @@ static int virtio_mem_send_unplug_all_request(struct 
>virtio_mem *vm)
>   atomic_set(&vm->config_changed, 1);
>   return 0;
>   case VIRTIO_MEM_RESP_BUSY:
>-  return -ETXTBSY;
>+  rc = -ETXTBSY;
>+  break;
>   default:
>-  return -ENOMEM;
>+  break;
>   }
>+
>+  dev_dbg(>vdev->dev, "unplugging all memory failed: %d\n", rc);
>+  return rc;
> }
> 
> /*
>@@ -1130,9 +1156,6 @@ static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, 
>unsigned long mb_id,
>   const uint64_t size = count * vm->sbm.sb_size;
>   int rc;
> 
>-  dev_dbg(>vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id,
>-  sb_id, sb_id + count - 1);
>-
>   rc = virtio_mem_send_plug_request(vm, addr, size);
>   if (!rc)
>       virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
>@@ -1151,9 +1174,6 @@ static int virtio_mem_sbm_unplug_sb(struct virtio_mem 
>*vm, unsigned long mb_id,
>   const uint64_t size = count * vm->sbm.sb_size;
>   int rc;
> 
>-  dev_dbg(>vdev->dev, "unplugging memory block: %lu : %i - %i\n",
>-  mb_id, sb_id, sb_id + count - 1);
>-
>   rc = virtio_mem_send_unplug_request(vm, addr, size);
>   if (!rc)
>   virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 23/29] virtio-mem: factor out adding/removing memory from Linux

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:17PM +0200, David Hildenbrand wrote:
>Let's use wrappers for the low-level functions that dev_dbg/dev_warn
>and work on addr + size, such that we can reuse them for adding/removing
>in other granularity.
>
>We only warn when adding memory failed, because that's something to pay
>attention to. We won't warn when removing failed, we'll reuse that in
>racy context soon (and we do have proper BUG_ON() statements in the
>current cases where it must never happen).
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 107 
> 1 file changed, 73 insertions(+), 34 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index d3ab04f655ee..eb2ad31a8d8a 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -453,18 +453,16 @@ static bool virtio_mem_could_add_memory(struct 
>virtio_mem *vm, uint64_t size)
> }
> 
> /*
>- * Try to add a memory block to Linux. This will usually only fail
>- * if out of memory.
>+ * Try adding memory to Linux. Will usually only fail if out of memory.
>  *
>  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
>  * onlining code).
>  *
>- * Will not modify the state of the memory block.
>+ * Will not modify the state of memory blocks in virtio-mem.
>  */
>-static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
>+static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
>+   uint64_t size)
> {
>-  const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
>-  const uint64_t size = memory_block_size_bytes();
>   int rc;
> 
>   /*
>@@ -478,32 +476,50 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, 
>unsigned long mb_id)
>   return -ENOMEM;
>   }
> 
>-  dev_dbg(>vdev->dev, "adding memory block: %lu\n", mb_id);
>+  dev_dbg(>vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
>+  addr + size - 1);
>   /* Memory might get onlined immediately. */
>   atomic64_add(size, &vm->offline_size);
>   rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
>  MEMHP_MERGE_RESOURCE);
>-  if (rc)
>+  if (rc) {
>+  atomic64_sub(size, &vm->offline_size);
>+  dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
>+  /*
>+   * TODO: Linux MM does not properly clean up yet in all cases
>+   * where adding of memory failed - especially on -ENOMEM.
>+   */
>+  }
>   return rc;
> }
> 
> /*
>- * Try to remove a memory block from Linux. Will only fail if the memory block
>- * is not offline.
>+ * See virtio_mem_add_memory(): Try adding a single Linux memory block.
>+ */
>+static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
>+{
>+  const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
>+  const uint64_t size = memory_block_size_bytes();
>+
>+  return virtio_mem_add_memory(vm, addr, size);
>+}
>+
>+/*
>+ * Try removing memory from Linux. Will only fail if memory blocks aren't
>+ * offline.
>  *
>  * Must not be called with the vm->hotplug_mutex held (possible deadlock with
>  * onlining code).
>  *
>- * Will not modify the state of the memory block.
>+ * Will not modify the state of memory blocks in virtio-mem.
>  */
>-static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
>+static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
>+  uint64_t size)
> {
>-  const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
>-  const uint64_t size = memory_block_size_bytes();
>   int rc;
> 
>-  dev_dbg(>vdev->dev, "removing memory block: %lu\n", mb_id);
>+  dev_dbg(>vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
>+  addr + size - 1);
>   rc = remove_memory(vm->nid, addr, size);
>   if (!rc) {
>   atomic64_sub(size, &vm->offline_size);
>@@ -512,27 +528,41 @@ static int virtio_mem_mb_remove(struct virtio_mem *vm, 
>unsigned long mb_id)
>* immediately instead of waiting.
>*/
>   virtio_mem_retry(vm);
>+  } else {
>+  dev_dbg(>vdev->dev, "removing memory failed: %d\n", rc);
>   }
>   return rc;
> }
> 
> /*
>

Re: [PATCH v1 22/29] virtio-mem: memory block ids are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:16PM +0200, David Hildenbrand wrote:
>Let's move first_mb_id/next_mb_id/last_usable_mb_id accordingly.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 44 ++---
> 1 file changed, 22 insertions(+), 22 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index d06c8760b337..d3ab04f655ee 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -96,13 +96,6 @@ struct virtio_mem {
>   /* Maximum region size in bytes. */
>   uint64_t region_size;
> 
>-  /* Id of the first memory block of this device. */
>-  unsigned long first_mb_id;
>-  /* Id of the last usable memory block of this device. */
>-  unsigned long last_usable_mb_id;
>-  /* Id of the next memory bock to prepare when needed. */
>-  unsigned long next_mb_id;
>-
>   /* The parent resource for all memory added via this device. */
>   struct resource *parent_resource;
>   /*
>@@ -121,6 +114,13 @@ struct virtio_mem {
>   uint64_t offline_threshold;
> 
>   struct {
>+  /* Id of the first memory block of this device. */
>+  unsigned long first_mb_id;
>+  /* Id of the last usable memory block of this device. */
>+  unsigned long last_usable_mb_id;
>+  /* Id of the next memory bock to prepare when needed. */
>+  unsigned long next_mb_id;
>+
>   /* The subblock size. */
>   uint64_t sb_size;
>   /* The number of subblocks per Linux memory block. */
>@@ -265,7 +265,7 @@ static unsigned long virtio_mem_phys_to_sb_id(struct 
>virtio_mem *vm,
> static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
>   unsigned long mb_id, uint8_t state)
> {
>-  const unsigned long idx = mb_id - vm->first_mb_id;
>+  const unsigned long idx = mb_id - vm->sbm.first_mb_id;
>   uint8_t old_state;
> 
>   old_state = vm->sbm.mb_states[idx];
>@@ -282,7 +282,7 @@ static void virtio_mem_sbm_set_mb_state(struct virtio_mem 
>*vm,
> static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
>  unsigned long mb_id)
> {
>-  const unsigned long idx = mb_id - vm->first_mb_id;
>+  const unsigned long idx = mb_id - vm->sbm.first_mb_id;
> 
>   return vm->sbm.mb_states[idx];
> }
>@@ -292,7 +292,7 @@ static uint8_t virtio_mem_sbm_get_mb_state(struct 
>virtio_mem *vm,
>  */
> static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
> {
>-  unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id;
>+  unsigned long old_bytes = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
>   unsigned long new_bytes = old_bytes + 1;
>   int old_pages = PFN_UP(old_bytes);
>   int new_pages = PFN_UP(new_bytes);
>@@ -316,14 +316,14 @@ static int 
>virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
> }
> 
> #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
>-  for (_mb_id = _vm->first_mb_id; \
>-   _mb_id < _vm->next_mb_id && _vm->sbm.mb_count[_state]; \
>+  for (_mb_id = _vm->sbm.first_mb_id; \
>+   _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
>_mb_id++) \
>   if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
> 
> #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
>-  for (_mb_id = _vm->next_mb_id - 1; \
>-   _mb_id >= _vm->first_mb_id && _vm->sbm.mb_count[_state]; \
>+  for (_mb_id = _vm->sbm.next_mb_id - 1; \
>+   _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
>_mb_id--) \
>   if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
> 
>@@ -334,7 +334,7 @@ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct 
>virtio_mem *vm)
> static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
> unsigned long mb_id, int sb_id)
> {
>-  return (mb_id - vm->first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
>+  return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
> }
> 
> /*
>@@ -414,7 +414,7 @@ static int virtio_mem_sbm_first_unplugged_sb(struct 
>virtio_mem *vm,
>  */
> static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
> {
>-  const unsigned long old_nb_mb = vm->next_mb_id - vm->fir

Re: [PATCH v1 20/29] virtio-mem: nb_sb_per_mb and subblock_size are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
virtio_mem *vm,
>  */
> static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
> {
>-  uint64_t nb_sb = diff / vm->subblock_size;
>+  uint64_t nb_sb = diff / vm->sbm.sb_size;
>   unsigned long mb_id;
>   int rc;
> 
>@@ -1805,11 +1805,11 @@ static int virtio_mem_init(struct virtio_mem *vm)
>* - Is required for now for alloc_contig_range() to work reliably -
>*   it doesn't properly handle smaller granularity on ZONE_NORMAL.
>*/
>-  vm->subblock_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
>-pageblock_nr_pages) * PAGE_SIZE;
>-  vm->subblock_size = max_t(uint64_t, vm->device_block_size,
>-vm->subblock_size);
>-  vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size;
>+  vm->sbm.sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
>+  pageblock_nr_pages) * PAGE_SIZE;
>+  vm->sbm.sb_size = max_t(uint64_t, vm->device_block_size,
>+  vm->sbm.sb_size);
>+  vm->sbm.sbs_per_mb = memory_block_size_bytes() / vm->sbm.sb_size;
> 
>   /* Round up to the next full memory block */
>   vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
>@@ -1827,7 +1827,7 @@ static int virtio_mem_init(struct virtio_mem *vm)
>   dev_info(>vdev->dev, "memory block size: 0x%lx",
>memory_block_size_bytes());
>   dev_info(>vdev->dev, "subblock size: 0x%llx",
>-   (unsigned long long)vm->subblock_size);
>+   (unsigned long long)vm->sbm.sb_size);
>   if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
>   dev_info(>vdev->dev, "nid: %d", vm->nid);
> 
>-- 
>2.26.2
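
Just to double-check the resulting granularity with concrete numbers: assuming
an x86-64 guest with 4 KiB pages (MAX_ORDER_NR_PAGES = 1024 and
pageblock_nr_pages = 512), a 2 MiB device block size and 128 MiB Linux memory
blocks, this gives

	sb_size    = max(1024, 512) * 4 KiB = 4 MiB
	sb_size    = max(2 MiB, 4 MiB)      = 4 MiB
	sbs_per_mb = 128 MiB / 4 MiB        = 32

which matches the 4 MiB subblock size mentioned elsewhere in the driver
comments. (The numbers are just an example configuration, of course.)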

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 20/29] virtio-mem: nb_sb_per_mb and subblock_size are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:14PM +0200, David Hildenbrand wrote:
>Let's rename to "sbs_per_mb" and "sb_size" and move accordingly.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 96 ++---
> 1 file changed, 48 insertions(+), 48 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index fc2b1ff3beed..3a772714fec9 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -96,11 +96,6 @@ struct virtio_mem {
>   /* Maximum region size in bytes. */
>   uint64_t region_size;
> 
>-  /* The subblock size. */
>-  uint64_t subblock_size;
>-  /* The number of subblocks per memory block. */
>-  uint32_t nb_sb_per_mb;
>-
>   /* Id of the first memory block of this device. */
>   unsigned long first_mb_id;
>   /* Id of the last usable memory block of this device. */
>@@ -126,6 +121,11 @@ struct virtio_mem {
>   uint64_t offline_threshold;
> 
>   struct {
>+  /* The subblock size. */
>+  uint64_t sb_size;
>+  /* The number of subblocks per Linux memory block. */
>+  uint32_t sbs_per_mb;
>+
>   /* Summary of all memory block states. */
>   unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
> 
>@@ -256,7 +256,7 @@ static unsigned long virtio_mem_phys_to_sb_id(struct 
>virtio_mem *vm,
>   const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
>   const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
> 
>-  return (addr - mb_addr) / vm->subblock_size;
>+  return (addr - mb_addr) / vm->sbm.sb_size;
> }
> 
> /*
>@@ -334,7 +334,7 @@ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct 
>virtio_mem *vm)
> static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
> unsigned long mb_id, int sb_id)
> {
>-  return (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
>+  return (mb_id - vm->first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
> }
> 
> /*
>@@ -397,7 +397,7 @@ static bool virtio_mem_sbm_test_sb_unplugged(struct 
>virtio_mem *vm,
> }
> 
> /*
>- * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there 
>is
>+ * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case 
>there is
>  * none.
>  */
> static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
>@@ -406,7 +406,7 @@ static int virtio_mem_sbm_first_unplugged_sb(struct 
>virtio_mem *vm,
>   const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
> 
>   return find_next_zero_bit(vm->sbm.sb_states,
>-bit + vm->nb_sb_per_mb, bit) - bit;
>+bit + vm->sbm.sbs_per_mb, bit) - bit;
> }
> 
> /*
>@@ -415,8 +415,8 @@ static int virtio_mem_sbm_first_unplugged_sb(struct 
>virtio_mem *vm,
> static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
> {
>   const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id;
>-  const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb;
>-  const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb;
>+  const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
>+  const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
>   int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
>   int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
>   unsigned long *new_bitmap, *old_bitmap;
>@@ -642,15 +642,15 @@ static void virtio_mem_notify_online(struct virtio_mem 
>*vm, unsigned long mb_id)
> static void virtio_mem_notify_going_offline(struct virtio_mem *vm,
>   unsigned long mb_id)
> {
>-  const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
>+  const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
>   unsigned long pfn;
>   int sb_id;
> 
>-  for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
>+  for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
>   if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
>   continue;
>   pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
>- sb_id * vm->subblock_size);
>+ sb_id * vm->sbm.sb_size);
>   virtio_mem_fake_offline_going_offline(pfn, nr_pages);
>   }
>

Re: [PATCH v1 19/29] virito-mem: existing (un)plug functions are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:13PM +0200, David Hildenbrand wrote:
>Let's rename them accordingly. virtio_mem_plug_request() and
>virtio_mem_unplug_request() will be handled separately.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

The code is correct, though the new names are a bit long and hard to follow...

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 90 ++---
> 1 file changed, 43 insertions(+), 47 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 73ff6e9ba839..fc2b1ff3beed 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -1075,8 +1075,8 @@ static int virtio_mem_send_unplug_all_request(struct 
>virtio_mem *vm)
>  * Plug selected subblocks. Updates the plugged state, but not the state
>  * of the memory block.
>  */
>-static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
>-   int sb_id, int count)
>+static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
>+int sb_id, int count)
> {
>   const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
> sb_id * vm->subblock_size;
>@@ -1096,8 +1096,8 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, 
>unsigned long mb_id,
>  * Unplug selected subblocks. Updates the plugged state, but not the state
>  * of the memory block.
>  */
>-static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
>- int sb_id, int count)
>+static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long 
>mb_id,
>+  int sb_id, int count)
> {
>   const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
> sb_id * vm->subblock_size;
>@@ -1122,8 +1122,8 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem 
>*vm, unsigned long mb_id,
>  *
>  * Note: can fail after some subblocks were unplugged.
>  */
>-static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm,
>- unsigned long mb_id, uint64_t *nb_sb)
>+static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
>+  unsigned long mb_id, uint64_t *nb_sb)
> {
>   int sb_id, count;
>   int rc;
>@@ -1144,7 +1144,7 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem 
>*vm,
>   sb_id--;
>   }
> 
>-  rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
>+  rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
>   if (rc)
>   return rc;
>   *nb_sb -= count;
>@@ -1161,18 +1161,18 @@ static int virtio_mem_mb_unplug_any_sb(struct 
>virtio_mem *vm,
>  *
>  * Note: can fail after some subblocks were unplugged.
>  */
>-static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id)
>+static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long 
>mb_id)
> {
>   uint64_t nb_sb = vm->nb_sb_per_mb;
> 
>-  return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb);
>+  return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
> }
> 
> /*
>  * Prepare tracking data for the next memory block.
>  */
>-static int virtio_mem_prepare_next_mb(struct virtio_mem *vm,
>-unsigned long *mb_id)
>+static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
>+unsigned long *mb_id)
> {
>   int rc;
> 
>@@ -1200,9 +1200,8 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem 
>*vm,
>  *
>  * Will modify the state of the memory block.
>  */
>-static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm,
>-unsigned long mb_id,
>-uint64_t *nb_sb)
>+static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
>+unsigned long mb_id, uint64_t *nb_sb)
> {
>   const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb);
>   int rc;
>@@ -1214,7 +1213,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem 
>*vm,
>* Plug the requested number of subblocks before adding it to linux,
>* so that onlining will directly online all plugged subblocks.
>*/
>-  rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count);
>+  rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
>   if (rc)
>   return rc;
> 
>@@ -1241,7 +1240,7 @@ static int

Re: [PATCH v1 18/29] virtio-mem: factor out calculation of the bit number within the sb_states bitmap

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:12PM +0200, David Hildenbrand wrote:
>The calculation is already complicated enough, let's limit it to one
>location.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 20 +++-
> 1 file changed, 15 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 2cc497ad8298..73ff6e9ba839 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -327,6 +327,16 @@ static int 
>virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
>_mb_id--) \
>   if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
> 
>+/*
>+ * Calculate the bit number in the sb_states bitmap for the given subblock
>+ * inside the given memory block.
>+ */
>+static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
>+unsigned long mb_id, int sb_id)
>+{
>+  return (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
>+}
>+
> /*
>  * Mark all selected subblocks plugged.
>  *
>@@ -336,7 +346,7 @@ static void virtio_mem_sbm_set_sb_plugged(struct 
>virtio_mem *vm,
> unsigned long mb_id, int sb_id,
> int count)
> {
>-  const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
>+  const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
> 
>   __bitmap_set(vm->sbm.sb_states, bit, count);
> }
>@@ -350,7 +360,7 @@ static void virtio_mem_sbm_set_sb_unplugged(struct 
>virtio_mem *vm,
>   unsigned long mb_id, int sb_id,
>   int count)
> {
>-  const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
>+  const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
> 
>   __bitmap_clear(vm->sbm.sb_states, bit, count);
> }
>@@ -362,7 +372,7 @@ static bool virtio_mem_sbm_test_sb_plugged(struct 
>virtio_mem *vm,
>  unsigned long mb_id, int sb_id,
>  int count)
> {
>-  const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
>+  const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
> 
>   if (count == 1)
>   return test_bit(bit, vm->sbm.sb_states);
>@@ -379,7 +389,7 @@ static bool virtio_mem_sbm_test_sb_unplugged(struct 
>virtio_mem *vm,
>unsigned long mb_id, int sb_id,
>int count)
> {
>-  const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
>+  const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
> 
>   /* TODO: Helper similar to bitmap_set() */
>   return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
>@@ -393,7 +403,7 @@ static bool virtio_mem_sbm_test_sb_unplugged(struct 
>virtio_mem *vm,
> static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
>   unsigned long mb_id)
> {
>-  const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb;
>+  const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
> 
>   return find_next_zero_bit(vm->sbm.sb_states,
> bit + vm->nb_sb_per_mb, bit) - bit;
>-- 
>2.26.2
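
The helper reads nicely. A quick worked example for my own notes (hypothetical
numbers): with first_mb_id = 8 and sbs_per_mb = 32, subblock sb_id = 5 of
memory block mb_id = 10 maps to bit

	(10 - 8) * 32 + 5 = 69

in the sb_states bitmap, i.e. each memory block simply owns a consecutive
sbs_per_mb-wide chunk of the bitmap.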

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 16/29] virtio-mem: memory block states are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:10PM +0200, David Hildenbrand wrote:
>let's use a new "sbm" sub-struct to hold SBM-specific state and rename +
>move applicable definitions, frunctions, and variables (related to
>memory block states).
>
>While at it:
>- Drop the "_STATE" part from memory block states
>- Rename "nb_mb_state" to "mb_count"
>- "set_mb_state" / "get_mb_state" vs. "mb_set_state" / "mb_get_state"
>- Don't use lengthy "enum virtio_mem_smb_mb_state", simply use "uint8_t"
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 215 ++--
> 1 file changed, 109 insertions(+), 106 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index fd8685673fe4..e76d6f769aa5 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -42,20 +42,23 @@ MODULE_PARM_DESC(unplug_online, "Try to unplug online 
>memory");
>  * onlined to the same zone - virtio-mem relies on this behavior.
>  */
> 
>-enum virtio_mem_mb_state {
>+/*
>+ * State of a Linux memory block in SBM.
>+ */
>+enum virtio_mem_sbm_mb_state {
>   /* Unplugged, not added to Linux. Can be reused later. */
>-  VIRTIO_MEM_MB_STATE_UNUSED = 0,
>+  VIRTIO_MEM_SBM_MB_UNUSED = 0,
>   /* (Partially) plugged, not added to Linux. Error on add_memory(). */
>-  VIRTIO_MEM_MB_STATE_PLUGGED,
>+  VIRTIO_MEM_SBM_MB_PLUGGED,
>   /* Fully plugged, fully added to Linux, offline. */
>-  VIRTIO_MEM_MB_STATE_OFFLINE,
>+  VIRTIO_MEM_SBM_MB_OFFLINE,
>   /* Partially plugged, fully added to Linux, offline. */
>-  VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,
>+  VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
>   /* Fully plugged, fully added to Linux, online. */
>-  VIRTIO_MEM_MB_STATE_ONLINE,
>+  VIRTIO_MEM_SBM_MB_ONLINE,
>   /* Partially plugged, fully added to Linux, online. */
>-  VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,
>-  VIRTIO_MEM_MB_STATE_COUNT
>+  VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL,
>+  VIRTIO_MEM_SBM_MB_COUNT
> };
> 
> struct virtio_mem {
>@@ -113,9 +116,6 @@ struct virtio_mem {
>*/
>   const char *resource_name;
> 
>-  /* Summary of all memory block states. */
>-  unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
>-
>   /*
>* We don't want to add too much memory if it's not getting onlined,
>* to avoid running OOM. Besides this threshold, we allow to have at
>@@ -125,27 +125,29 @@ struct virtio_mem {
>   atomic64_t offline_size;
>   uint64_t offline_threshold;
> 
>-  /*
>-   * One byte state per memory block.
>-   *
>-   * Allocated via vmalloc(). When preparing new blocks, resized
>-   * (alloc+copy+free) when needed (crossing pages with the next mb).
>-   * (when crossing pages).
>-   *
>-   * With 128MB memory blocks, we have states for 512GB of memory in one
>-   * page.
>-   */
>-  uint8_t *mb_state;
>+  struct {
>+  /* Summary of all memory block states. */
>+  unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
>+
>+  /*
>+   * One byte state per memory block. Allocated via vmalloc().
>+   * Resized (alloc+copy+free) on demand.
>+   *
>+   * With 128 MiB memory blocks, we have states for 512 GiB of
>+   * memory in one 4 KiB page.
>+   */
>+  uint8_t *mb_states;
>+  } sbm;
> 
>   /*
>-   * $nb_sb_per_mb bit per memory block. Handled similar to mb_state.
>+   * $nb_sb_per_mb bit per memory block. Handled similar to sbm.mb_states.
>*
>* With 4MB subblocks, we manage 128GB of memory in one page.
>*/
>   unsigned long *sb_bitmap;
> 
>   /*
>-   * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
>+   * Mutex that protects the sbm.mb_count, sbm.mb_states, and sb_bitmap.
>*
>* When this lock is held the pointers can't change, ONLINE and
>* OFFLINE blocks can't change the state and no subblocks will get
>@@ -254,70 +256,70 @@ static unsigned long virtio_mem_phys_to_sb_id(struct 
>virtio_mem *vm,
> /*
>  * Set the state of a memory block, taking care of the state counter.
>  */
>-static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long 
>mb_id,
>-  enum virtio_mem_mb_state state)
>+static void virtio_mem_sb

Re: [PATCH v1 17/29] virito-mem: subblock states are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:11PM +0200, David Hildenbrand wrote:
>Let's rename and move accordingly. While at it, rename sb_bitmap to
>"sb_states".
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

OK, so you separated the change into two parts.

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 118 +++-
> 1 file changed, 62 insertions(+), 56 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index e76d6f769aa5..2cc497ad8298 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -137,17 +137,23 @@ struct virtio_mem {
>* memory in one 4 KiB page.
>*/
>   uint8_t *mb_states;
>-  } sbm;
> 
>-  /*
>-   * $nb_sb_per_mb bit per memory block. Handled similar to sbm.mb_states.
>-   *
>-   * With 4MB subblocks, we manage 128GB of memory in one page.
>-   */
>-  unsigned long *sb_bitmap;
>+  /*
>+   * Bitmap: one bit per subblock. Allocated similar to
>+   * sbm.mb_states.
>+   *
>+   * A set bit means the corresponding subblock is plugged,
>+   * otherwise it's unblocked.
>+   *
>+   * With 4 MiB subblocks, we manage 128 GiB of memory in one
>+   * 4 KiB page.
>+   */
>+  unsigned long *sb_states;
>+  } sbm;
> 
>   /*
>-   * Mutex that protects the sbm.mb_count, sbm.mb_states, and sb_bitmap.
>+   * Mutex that protects the sbm.mb_count, sbm.mb_states, and
>+   * sbm.sb_states.
>*
>* When this lock is held the pointers can't change, ONLINE and
>* OFFLINE blocks can't change the state and no subblocks will get
>@@ -326,13 +332,13 @@ static int 
>virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
>  *
>  * Will not modify the state of the memory block.
>  */
>-static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
>-   unsigned long mb_id, int sb_id,
>-   int count)
>+static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
>+unsigned long mb_id, int sb_id,
>+int count)
> {
>   const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
> 
>-  __bitmap_set(vm->sb_bitmap, bit, count);
>+  __bitmap_set(vm->sbm.sb_states, bit, count);
> }
> 
> /*
>@@ -340,86 +346,87 @@ static void virtio_mem_mb_set_sb_plugged(struct 
>virtio_mem *vm,
>  *
>  * Will not modify the state of the memory block.
>  */
>-static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm,
>- unsigned long mb_id, int sb_id,
>- int count)
>+static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
>+  unsigned long mb_id, int sb_id,
>+  int count)
> {
>   const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
> 
>-  __bitmap_clear(vm->sb_bitmap, bit, count);
>+  __bitmap_clear(vm->sbm.sb_states, bit, count);
> }
> 
> /*
>  * Test if all selected subblocks are plugged.
>  */
>-static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm,
>-unsigned long mb_id, int sb_id,
>-int count)
>+static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
>+ unsigned long mb_id, int sb_id,
>+ int count)
> {
>   const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id;
> 
>   if (count == 1)
>-  return test_bit(bit, vm->sb_bitmap);
>+  return test_bit(bit, vm->sbm.sb_states);
> 
>   /* TODO: Helper similar to bitmap_set() */
>-  return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >=
>+  return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
>  bit + count;
> }
> 
> /*
>  * Test if all selected subblocks are unplugged.
>  */
>-static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm,
>-  unsigned long mb_id, int sb_id,
>-  int count)
>+static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
>+  

Re: [PATCH v1 16/29] virtio-mem: memory block states are specific to Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
locks of plugged offline blocks. */
>-  virtio_mem_for_each_mb_state_rev(vm, mb_id,
>-   VIRTIO_MEM_MB_STATE_OFFLINE) {
>+  virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) {
>   rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id,
>   &nb_sb);
>   if (rc || !nb_sb)
>@@ -1539,8 +1541,8 @@ static int virtio_mem_unplug_request(struct virtio_mem 
>*vm, uint64_t diff)
>   }
> 
>   /* Try to unplug subblocks of partially plugged online blocks. */
>-  virtio_mem_for_each_mb_state_rev(vm, mb_id,
>-   VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) {
>+  virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
>+ VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
>   rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
>   &nb_sb);
>   if (rc || !nb_sb)
>@@ -1551,8 +1553,7 @@ static int virtio_mem_unplug_request(struct virtio_mem 
>*vm, uint64_t diff)
>   }
> 
>   /* Try to unplug subblocks of plugged online blocks. */
>-  virtio_mem_for_each_mb_state_rev(vm, mb_id,
>-   VIRTIO_MEM_MB_STATE_ONLINE) {
>+  virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) {
>   rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id,
>   &nb_sb);
>   if (rc || !nb_sb)
>@@ -1578,11 +1579,12 @@ static int virtio_mem_unplug_pending_mb(struct 
>virtio_mem *vm)
>   unsigned long mb_id;
>   int rc;
> 
>-  virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) {
>+  virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_PLUGGED) {
>   rc = virtio_mem_mb_unplug(vm, mb_id);
>   if (rc)
>   return rc;
>-  virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
>+  virtio_mem_sbm_set_mb_state(vm, mb_id,
>+  VIRTIO_MEM_SBM_MB_UNUSED);
>   }
> 
>   return 0;
>@@ -1974,11 +1976,12 @@ static void virtio_mem_remove(struct virtio_device 
>*vdev)
>* After we unregistered our callbacks, user space can online partially
>* plugged offline blocks. Make sure to remove them.
>*/
>-  virtio_mem_for_each_mb_state(vm, mb_id,
>-   VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) {
>+  virtio_mem_sbm_for_each_mb(vm, mb_id,
>+ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
>   rc = virtio_mem_mb_remove(vm, mb_id);
>   BUG_ON(rc);
>-  virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED);
>+  virtio_mem_sbm_set_mb_state(vm, mb_id,
>+  VIRTIO_MEM_SBM_MB_UNUSED);
>   }
>   /*
>* After we unregistered our callbacks, user space can no longer
>@@ -2003,7 +2006,7 @@ static void virtio_mem_remove(struct virtio_device *vdev)
>   }
> 
>   /* remove all tracking data - no locking needed */
>-  vfree(vm->mb_state);
>+  vfree(vm->sbm.mb_states);
>   vfree(vm->sb_bitmap);
> 
>   /* reset the device and cleanup the queues */
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 15/29] virito-mem: document Sub Block Mode (SBM)

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:09PM +0200, David Hildenbrand wrote:
>Let's add some documentation for the current mode - Sub Block Mode (SBM) -
>to prepare for a new mode - Big Block Mode (BBM).
>
>Follow-up patches will properly factor out the existing Sub Block Mode
>(SBM) and implement Device Block Mode (DBM).
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 15 +++
> 1 file changed, 15 insertions(+)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index faeb759687fe..fd8685673fe4 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -27,6 +27,21 @@ static bool unplug_online = true;
> module_param(unplug_online, bool, 0644);
> MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
> 
>+/*
>+ * virtio-mem currently supports the following modes of operation:
>+ *
>+ * * Sub Block Mode (SBM): A Linux memory block spans 1..X subblocks (SB). The
>+ *   size of a Sub Block (SB) is determined based on the device block size, 
>the
>+ *   pageblock size, and the maximum allocation granularity of the buddy.
>+ *   Subblocks within a Linux memory block might either be plugged or 
>unplugged.
>+ *   Memory is added/removed to Linux MM in Linux memory block granularity.
>+ *
>+ * User space / core MM (auto onlining) is responsible for onlining added
>+ * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
>+ * always onlined separately, and all memory within a Linux memory block is
>+ * onlined to the same zone - virtio-mem relies on this behavior.
>+ */
>+
> enum virtio_mem_mb_state {
>   /* Unplugged, not added to Linux. Can be reused later. */
>   VIRTIO_MEM_MB_STATE_UNUSED = 0,
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 13/29] virtio-mem: factor out handling of fake-offline pages in memory notifier

2020-10-16 Thread Wei Yang
On Fri, Oct 16, 2020 at 03:15:03PM +0800, Wei Yang wrote:
>On Mon, Oct 12, 2020 at 02:53:07PM +0200, David Hildenbrand wrote:
>>Let's factor out the core pieces and place the implementation next to
>>virtio_mem_fake_offline(). We'll reuse this functionality soon.
>>
>>Cc: "Michael S. Tsirkin" 
>>Cc: Jason Wang 
>>Cc: Pankaj Gupta 
>>Signed-off-by: David Hildenbrand 
>>---
>> drivers/virtio/virtio_mem.c | 73 +
>> 1 file changed, 50 insertions(+), 23 deletions(-)
>>
>>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>>index d132bc54ef57..a2124892e510 100644
>>--- a/drivers/virtio/virtio_mem.c
>>+++ b/drivers/virtio/virtio_mem.c
>>@@ -168,6 +168,10 @@ static LIST_HEAD(virtio_mem_devices);
>> 
>> static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
>> static void virtio_mem_retry(struct virtio_mem *vm);
>>+static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
>>+   unsigned long nr_pages);
>>+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
>>+unsigned long nr_pages);
>> 
>> /*
>>  * Register a virtio-mem device so it will be considered for the online_page
>>@@ -604,27 +608,15 @@ static void virtio_mem_notify_going_offline(struct 
>>virtio_mem *vm,
>>  unsigned long mb_id)
>> {
>>  const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
>>- struct page *page;
>>  unsigned long pfn;
>>- int sb_id, i;
>>+ int sb_id;
>> 
>>  for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
>>  if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
>>  continue;
>>- /*
>>-  * Drop our reference to the pages so the memory can get
>>-  * offlined and add the unplugged pages to the managed
>>-  * page counters (so offlining code can correctly subtract
>>-  * them again).
>>-  */
>>  pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
>> sb_id * vm->subblock_size);
>>- adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
>
>One question about the original code, why we want to adjust count here?
>
>The code flow is
>
>__offline_pages()
>memory_notify(MEM_GOING_OFFLINE, &arg)
>   virtio_mem_notify_going_offline(vm, mb_id)
>   adjust_managed_page_count(pfn_to_page(pfn), nr_pages)
>   adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages)
>
>Do we adjust the count twice?
>

Ah, I see the reason now: we need to adjust the count for the *unplugged* sub-blocks.
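
To write the accounting down for my own understanding (using one fake-offline
4 MiB subblock, i.e. nr_pages = 1024, as an example -- please correct me if
this is wrong):

	virtio_mem_fake_offline()                       /* at unplug time */
	    adjust_managed_page_count(page, -1024);     /* managed -= 1024 */
	virtio_mem_notify_going_offline()
	    adjust_managed_page_count(page, +1024);     /* managed += 1024 */
	__offline_pages()
	    adjust_managed_page_count(start, -offlined_pages); /* whole range */

So the temporary "+1024" only exists so that the generic offlining code can
subtract those pages again without them being counted twice.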

>>- for (i = 0; i < nr_pages; i++) {
>>- page = pfn_to_page(pfn + i);
>>- if (WARN_ON(!page_ref_dec_and_test(page)))

Another question: when do we grab a refcount on the unplugged pages? The one
you mentioned in virtio_mem_set_fake_offline().

>>- dump_page(page, "unplugged page referenced");
>>- }
>>+ virtio_mem_fake_offline_going_offline(pfn, nr_pages);
>>  }
>> }
>> 
>>@@ -633,21 +625,14 @@ static void virtio_mem_notify_cancel_offline(struct 
>>virtio_mem *vm,
>> {
>>  const unsigned long nr_pages = PFN_DOWN(vm->subblock_size);
>>  unsigned long pfn;
>>- int sb_id, i;
>>+ int sb_id;
>> 
>>  for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) {
>>  if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1))
>>  continue;
>>- /*
>>-  * Get the reference we dropped when going offline and
>>-  * subtract the unplugged pages from the managed page
>>-  * counters.
>>-  */
>>  pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
>> sb_id * vm->subblock_size);
>>- adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
>>- for (i = 0; i < nr_pages; i++)
>>- page_ref_inc(pfn_to_page(pfn + i));
>>+ virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
>>  }
>> }
>> 
>>@@ -853,6 +838,48 @@ static int virtio_mem_fake_offline(unsigned long pfn, 
>>unsigned long nr_pages)
>>  return 0;
>> }
>> 
>>+/*
>&

Re: [PATCH v1 13/29] virtio-mem: factor out handling of fake-offline pages in memory notifier

2020-10-16 Thread Wei Yang
>+  adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
>+  /* Drop our reference to the pages so the memory can get offlined. */
>+  for (i = 0; i < nr_pages; i++) {
>+  page = pfn_to_page(pfn + i);
>+  if (WARN_ON(!page_ref_dec_and_test(page)))
>+  dump_page(page, "fake-offline page referenced");
>+  }
>+}
>+
>+/*
>+ * Handle fake-offline pages when memory offlining is canceled - to undo
>+ * what we did in virtio_mem_fake_offline_going_offline().
>+ */
>+static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
>+ unsigned long nr_pages)
>+{
>+  unsigned long i;
>+
>+  /*
>+   * Get the reference we dropped when going offline and subtract the
>+   * unplugged pages from the managed page counters.
>+   */
>+  adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
>+  for (i = 0; i < nr_pages; i++)
>+  page_ref_inc(pfn_to_page(pfn + i));
>+}
>+
> static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
> {
>   const unsigned long addr = page_to_phys(page);
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 12/29] virtio-mem: factor out fake-offlining into virtio_mem_fake_offline()

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:06PM +0200, David Hildenbrand wrote:
>... which now matches virtio_mem_fake_online(). We'll reuse this
>functionality soon.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 34 --
> 1 file changed, 24 insertions(+), 10 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 00d1cfca4713..d132bc54ef57 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -832,6 +832,27 @@ static void virtio_mem_fake_online(unsigned long pfn, 
>unsigned long nr_pages)
>   }
> }
> 
>+/*
>+ * Try to allocate a range, marking pages fake-offline, effectively
>+ * fake-offlining them.
>+ */
>+static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
>+{
>+  int rc;
>+
>+  rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
>+  GFP_KERNEL);
>+  if (rc == -ENOMEM)
>+  /* whoops, out of memory */
>+  return rc;
>+  if (rc)
>+  return -EBUSY;
>+
>+  virtio_mem_set_fake_offline(pfn, nr_pages, true);
>+  adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
>+  return 0;
>+}
>+
> static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
> {
>   const unsigned long addr = page_to_phys(page);
>@@ -1335,17 +1356,10 @@ static int virtio_mem_mb_unplug_sb_online(struct 
>virtio_mem *vm,
> 
>   start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
>sb_id * vm->subblock_size);
>-  rc = alloc_contig_range(start_pfn, start_pfn + nr_pages,
>-  MIGRATE_MOVABLE, GFP_KERNEL);
>-  if (rc == -ENOMEM)
>-  /* whoops, out of memory */
>-  return rc;
>-  if (rc)
>-  return -EBUSY;
> 
>-  /* Mark it as fake-offline before unplugging it */
>-  virtio_mem_set_fake_offline(start_pfn, nr_pages, true);
>-  adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
>+  rc = virtio_mem_fake_offline(start_pfn, nr_pages);
>+  if (rc)
>+  return rc;
> 
>   /* Try to unplug the allocated memory */
>   rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count);
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 11/29] virtio-mem: use "unsigned long" for nr_pages when fake onlining/offlining

2020-10-16 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:05PM +0200, David Hildenbrand wrote:
>No harm done, but let's be consistent.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 8 
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index cb2e8f254650..00d1cfca4713 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -766,7 +766,7 @@ static int virtio_mem_memory_notifier_cb(struct 
>notifier_block *nb,
>  * (via generic_online_page()) using PageDirty().
>  */
> static void virtio_mem_set_fake_offline(unsigned long pfn,
>-  unsigned int nr_pages, bool onlined)
>+  unsigned long nr_pages, bool onlined)
> {
>   for (; nr_pages--; pfn++) {
>   struct page *page = pfn_to_page(pfn);
>@@ -785,7 +785,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
>  * (via generic_online_page()), clear PageDirty().
>  */
> static void virtio_mem_clear_fake_offline(unsigned long pfn,
>-unsigned int nr_pages, bool onlined)
>+unsigned long nr_pages, bool onlined)
> {
>   for (; nr_pages--; pfn++) {
>   struct page *page = pfn_to_page(pfn);
>@@ -800,10 +800,10 @@ static void virtio_mem_clear_fake_offline(unsigned long 
>pfn,
>  * Release a range of fake-offline pages to the buddy, effectively
>  * fake-onlining them.
>  */
>-static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages)
>+static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
> {
>   const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
>-  int i;
>+  unsigned long i;
> 
>   /*
>* We are always called at least with MAX_ORDER_NR_PAGES
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 09/29] virtio-mem: don't always trigger the workqueue when offlining memory

2020-10-15 Thread Wei Yang
/*
>+   * Trigger the workqueue. Now that we have some offline memory,
>+   * maybe we can handle pending unplug requests.
>+   */
>+  if (!unplug_online)
>+  virtio_mem_retry(vm);
>+
>   vm->hotplug_active = false;
>   mutex_unlock(&vm->hotplug_mutex);
>   break;
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 05/29] virtio-mem: generalize check for added memory

2020-10-15 Thread Wei Yang
On Thu, Oct 15, 2020 at 10:50:27AM +0200, David Hildenbrand wrote:
[...]
>> 
>>> dev_warn(&vdev->dev, "device still has system memory added\n");
>>> } else {
>>> virtio_mem_delete_resource(vm);
>> 
>> BTW, I got one question during review.
>> 
>> Per my understanding, there are 4 states of a virtio memory block
>> 
>>   * OFFLINE[_PARTIAL]
>>   * ONLINE[_PARTIAL]
>> 
>> If my understanding is correct, those two offline states are transient.
>> If the required range is onlined, the state is changed to ONLINE[_PARTIAL]
>> respectively. If it is not, the state is reverted to UNUSED or PLUGGED.
>
>Very right.
>
>> 
>> What I am lost on is why you do virtio_mem_mb_remove() on OFFLINE_PARTIAL
>> memory blocks, since we wait for the workqueue to finish its job.

I have tried to understand the logic, but I still have some confusion.

>
>That's an interesting corner case. Assume you have a 128MB memory block
>but only 64MB are plugged.

Since we only plug part of the memory block, its state is OFFLINE_PARTIAL
first. But then we add this memory and online it, so the state of this memory
block becomes ONLINE_PARTIAL.

When is the state changed back to OFFLINE_PARTIAL?

>
>As long as we have our online_pages callback in place, we can hinder the
>unplugged 64MB from getting exposed to the buddy
>(virtio_mem_online_page_cb()). However, once we unloaded the driver,

Yes,

virtio_mem_set_fake_offline() would __SetPageOffline() to those pages.

>this is no longer the case. If someone would online that memory block,
>we would expose unplugged memory to the buddy - very bad.
>

Per my understanding, at this point in time the memory block is in the online
state, even though part of it is set to *fake* offline.

So how could user space trigger another online via the sysfs interface?

>So we have to remove these partially plugged, offline memory blocks when
>losing control over them.
>
>I tried to document that via:
>
>"After we unregistered our callbacks, user space can online partially
>plugged offline blocks. Make sure to remove them."
>
>> 
>> Also, during virtio_mem_remove(), we only handle OFFLINE_PARTIAL memory
>> blocks. What about memory blocks in other states? Is it not necessary to
>> remove ONLINE[_PARTIAL] memory blocks?
>
>Blocks that are fully plugged (ONLINE or OFFLINE) can get
>onlined/offlined without us having to care. Works fine - we only have to
>care about partially plugged blocks.
>
>While we *could* unplug OFFLINE blocks, there is no way we can
>deterministically offline+remove ONLINE blocks. So that memory has to
>stay, even after we unloaded the driver (similar to the dax/kmem driver).

For OFFLINE memory blocks, would that leave us in this situation:

The guest doesn't need those pages, while the host still maps them?

>
>ONLINE_PARTIAL is already taken care of: it cannot get offlined anymore,
>as we still hold references to these struct pages
>(virtio_mem_set_fake_offline()), and as we no longer have the memory
>notifier in place, we can no longer agree to offline this memory (when
>going_offline).
>

OK, I think I understand the logic now.

But how do we prevent an ONLINE_PARTIAL memory block from getting offlined?
There are three calls in virtio_mem_set_fake_offline(), but all of them only
adjust page flags. How do they hold a reference to the struct pages?

>I tried to document that via
>
>"After we unregistered our callbacks, user space can no longer offline
>partially plugged online memory blocks. No need to worry about them."
>
>
>> 
>> Thanks in advance, since I may have missed some concepts.
>
>(force) driver unloading is a complicated corner case.
>
>Thanks!
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 02/29] virtio-mem: simplify calculation in virtio_mem_mb_state_prepare_next_mb()

2020-10-15 Thread Wei Yang
On Thu, Oct 15, 2020 at 10:00:15AM +0200, David Hildenbrand wrote:
>On 15.10.20 06:02, Wei Yang wrote:
>> On Mon, Oct 12, 2020 at 02:52:56PM +0200, David Hildenbrand wrote:
>>> We actually need one byte less (next_mb_id is exclusive, first_mb_id is
>>> inclusive). Simplify.
>>>
>>> Cc: "Michael S. Tsirkin" 
>>> Cc: Jason Wang 
>>> Cc: Pankaj Gupta 
>>> Signed-off-by: David Hildenbrand 
>>> ---
>>> drivers/virtio/virtio_mem.c | 4 ++--
>>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>>> index a1f5bf7a571a..670b3faf412d 100644
>>> --- a/drivers/virtio/virtio_mem.c
>>> +++ b/drivers/virtio/virtio_mem.c
>>> @@ -257,8 +257,8 @@ static enum virtio_mem_mb_state 
>>> virtio_mem_mb_get_state(struct virtio_mem *vm,
>>>  */
>>> static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
>>> {
>>> -   unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
>>> -   unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
>>> +   unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id;
>>> +   unsigned long new_bytes = old_bytes + 1;
>> 
>> This is correct.
>> 
>> So this looks more like a fix?
>
>We allocate an additional new page "one memory block too early".
>
>So we would allocate the first page for blocks 0..510, and already
>allocate the second page with block 511, although we could have fit it
>into the first page. Block 512 will then find that the second page is
>already there and simply use the second page.
>
>So as we do it consistently, nothing will go wrong - that's why I
>avoided using the "fix" terminology.
>

Yes, my feeling is that this is not a simplification; rather, it is a more
precise calculation.
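
To spell out the difference with concrete numbers (assuming a 4 KiB PAGE_SIZE):
when next_mb_id - first_mb_id == 4095 and we prepare one more block,

	old: new_bytes = 4095 + 2 = 4097  ->  PFN_UP(new_bytes) = 2 pages
	new: new_bytes = 4095 + 1 = 4096  ->  PFN_UP(new_bytes) = 1 page

so the old formula asked for the second page one memory block earlier than
necessary, while the new one allocates it exactly when it is needed.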

How about using this subject?

virtio-mem: more precise calculation in virtio_mem_mb_state_prepare_next_mb()

>Thanks!
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 08/29] virtio-mem: drop last_mb_id

2020-10-15 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:02PM +0200, David Hildenbrand wrote:
>No longer used, let's drop it.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Assuming the above two patches are merged:

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 4 
> 1 file changed, 4 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 37a0e338ae4a..5c93f8a65eba 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -84,8 +84,6 @@ struct virtio_mem {
> 
>   /* Id of the first memory block of this device. */
>   unsigned long first_mb_id;
>-  /* Id of the last memory block of this device. */
>-  unsigned long last_mb_id;
>   /* Id of the last usable memory block of this device. */
>   unsigned long last_usable_mb_id;
>   /* Id of the next memory bock to prepare when needed. */
>@@ -1689,8 +1687,6 @@ static int virtio_mem_init(struct virtio_mem *vm)
>   vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 +
>  memory_block_size_bytes());
>   vm->next_mb_id = vm->first_mb_id;
>-  vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr +
>-   vm->region_size) - 1;
> 
>   dev_info(>vdev->dev, "start address: 0x%llx", vm->addr);
>   dev_info(>vdev->dev, "region size: 0x%llx", vm->region_size);
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 06/29] virtio-mem: generalize virtio_mem_owned_mb()

2020-10-15 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:53:00PM +0200, David Hildenbrand wrote:
>Avoid using memory block ids. Rename it to virtio_mem_contains_range().
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 9 +
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index 6bbd1cfd10d3..821143db14fe 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -500,12 +500,13 @@ static bool virtio_mem_overlaps_range(struct virtio_mem 
>*vm,
> }
> 
> /*
>- * Test if a virtio-mem device owns a memory block. Can be called from
>+ * Test if a virtio-mem device contains a given range. Can be called from
>  * (notifier) callbacks lockless.
>  */
>-static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id)
>+static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
>+uint64_t size)
> {
>-  return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id;
>+  return start >= vm->addr && start + size <= vm->addr + vm->region_size;

Do we have a particular reason for this change?

> }
> 
> static int virtio_mem_notify_going_online(struct virtio_mem *vm,
>@@ -800,7 +801,7 @@ static void virtio_mem_online_page_cb(struct page *page, 
>unsigned int order)
>*/
>   rcu_read_lock();
>   list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
>-  if (!virtio_mem_owned_mb(vm, mb_id))
>+  if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
>   continue;
> 
>   sb_id = virtio_mem_phys_to_sb_id(vm, addr);
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 05/29] virtio-mem: generalize check for added memory

2020-10-15 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:59PM +0200, David Hildenbrand wrote:
>Let's check by traversing busy system RAM resources instead, to avoid
>relying on memory block states.
>
>Don't use walk_system_ram_range(), as that works on pages and we want to
>use the bare addresses we have easily at hand.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 19 +++
> 1 file changed, 15 insertions(+), 4 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index b3eebac7191f..6bbd1cfd10d3 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -1749,6 +1749,20 @@ static void virtio_mem_delete_resource(struct 
>virtio_mem *vm)
>   vm->parent_resource = NULL;
> }
> 
>+static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
>+{
>+  return 1;
>+}
>+
>+static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
>+{
>+  const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>+
>+  return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
>+ vm->addr + vm->region_size, NULL,
>+ virtio_mem_range_has_system_ram) == 1;
>+}
>+
> static int virtio_mem_probe(struct virtio_device *vdev)
> {
>   struct virtio_mem *vm;
>@@ -1870,10 +1884,7 @@ static void virtio_mem_remove(struct virtio_device 
>*vdev)
>* the system. And there is no way to stop the driver/device from going
>* away. Warn at least.
>*/
>-  if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] ||
>-  vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] ||
>-  vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] ||
>-  vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) {
>+  if (virtio_mem_has_memory_added(vm)) {

I am not sure this would be more efficient.

>   dev_warn(&vdev->dev, "device still has system memory added\n");
>   } else {
>   virtio_mem_delete_resource(vm);

BTW, I got one question during review.

Per my understanding, there are 4 states of a virtio memory block:

  * OFFLINE[_PARTIAL]
  * ONLINE[_PARTIAL]

If my understanding is correct, the two offline states are transient.
If the required range gets onlined, the state is changed to the respective
ONLINE[_PARTIAL] state. If it does not, the state is reverted to UNUSED
or PLUGGED.

What I am lost on is why virtio_mem_mb_remove() is called on an OFFLINE_PARTIAL
memory block, since we wait for the workqueue to finish its job.

Also, during virtio_mem_remove(), we only handle OFFLINE_PARTIAL memory blocks.
What about memory blocks in the other states? Is it not necessary to remove
ONLINE[_PARTIAL] memory blocks?

Thanks in advance, since I may have missed some concepts.
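For reference, below is the block state model as I currently picture it. This
is only my reading of the states referenced in this thread (the comments are
mine, and the actual driver may define more states than I list here):

enum virtio_mem_mb_state {
	VIRTIO_MEM_MB_STATE_UNUSED,		/* block not used by the device */
	VIRTIO_MEM_MB_STATE_PLUGGED,		/* plugged, not added to Linux */
	VIRTIO_MEM_MB_STATE_OFFLINE,		/* fully plugged, added, offline */
	VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL,	/* partially plugged, added, offline */
	VIRTIO_MEM_MB_STATE_ONLINE,		/* fully plugged and online */
	VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL,	/* partially plugged and online */
};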

>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 04/29] virtio-mem: drop rc2 in virtio_mem_mb_plug_and_add()

2020-10-15 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:58PM +0200, David Hildenbrand wrote:
>We can drop rc2, we don't actually need the value.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 


-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 03/29] virtio-mem: simplify MAX_ORDER - 1 / pageblock_order handling

2020-10-15 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:57PM +0200, David Hildenbrand wrote:
>Let's use pageblock_nr_pages and MAX_ORDER_NR_PAGES instead where
>possible, so we don't have do deal with allocation orders.
>
>Add a comment why we have that restriction for now.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 
-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 02/29] virtio-mem: simplify calculation in virtio_mem_mb_state_prepare_next_mb()

2020-10-14 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:56PM +0200, David Hildenbrand wrote:
>We actually need one byte less (next_mb_id is exclusive, first_mb_id is
>inclusive). Simplify.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 
>---
> drivers/virtio/virtio_mem.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index a1f5bf7a571a..670b3faf412d 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -257,8 +257,8 @@ static enum virtio_mem_mb_state 
>virtio_mem_mb_get_state(struct virtio_mem *vm,
>  */
> static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm)
> {
>-  unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1;
>-  unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2;
>+  unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id;
>+  unsigned long new_bytes = old_bytes + 1;

This is correct. For example, with first_mb_id == 10 and next_mb_id == 12,
exactly two blocks (10 and 11) are tracked, so 2 bytes are needed - the old
code would have computed 3.

So this looks more like a fix than a simplification?

>   int old_pages = PFN_UP(old_bytes);
>   int new_pages = PFN_UP(new_bytes);
>   uint8_t *new_mb_state;
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 01/29] virtio-mem: determine nid only once using memory_add_physaddr_to_nid()

2020-10-14 Thread Wei Yang
On Mon, Oct 12, 2020 at 02:52:55PM +0200, David Hildenbrand wrote:
>Let's determine the target nid only once in case we have none specified -
>usually, we'll end up with node 0 either way.
>
>Cc: "Michael S. Tsirkin" 
>Cc: Jason Wang 
>Cc: Pankaj Gupta 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> drivers/virtio/virtio_mem.c | 28 +++-
> 1 file changed, 11 insertions(+), 17 deletions(-)
>
>diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
>index ba4de598f663..a1f5bf7a571a 100644
>--- a/drivers/virtio/virtio_mem.c
>+++ b/drivers/virtio/virtio_mem.c
>@@ -70,7 +70,7 @@ struct virtio_mem {
> 
>   /* The device block size (for communicating with the device). */
>   uint64_t device_block_size;
>-  /* The translated node id. NUMA_NO_NODE in case not specified. */
>+  /* The determined node id for all memory of the device. */
>   int nid;
>   /* Physical start address of the memory region. */
>   uint64_t addr;
>@@ -406,10 +406,6 @@ static int virtio_mem_sb_bitmap_prepare_next_mb(struct 
>virtio_mem *vm)
> static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
> {
>   const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
>-  int nid = vm->nid;
>-
>-  if (nid == NUMA_NO_NODE)
>-  nid = memory_add_physaddr_to_nid(addr);
> 
>   /*
>* When force-unloading the driver and we still have memory added to
>@@ -423,7 +419,8 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, 
>unsigned long mb_id)
>   }
> 
>   dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
>-  return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
>+  return add_memory_driver_managed(vm->nid, addr,
>+   memory_block_size_bytes(),
>vm->resource_name,
>MEMHP_MERGE_RESOURCE);
> }
>@@ -440,13 +437,9 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, 
>unsigned long mb_id)
> static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id)
> {
>   const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
>-  int nid = vm->nid;
>-
>-  if (nid == NUMA_NO_NODE)
>-  nid = memory_add_physaddr_to_nid(addr);
> 
>   dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id);
>-  return remove_memory(nid, addr, memory_block_size_bytes());
>+  return remove_memory(vm->nid, addr, memory_block_size_bytes());
> }
> 
> /*
>@@ -461,14 +454,11 @@ static int virtio_mem_mb_offline_and_remove(struct 
>virtio_mem *vm,
>   unsigned long mb_id)
> {
>   const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
>-  int nid = vm->nid;
>-
>-  if (nid == NUMA_NO_NODE)
>-  nid = memory_add_physaddr_to_nid(addr);
> 
>   dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n",
>   mb_id);
>-  return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
>+  return offline_and_remove_memory(vm->nid, addr,
>+   memory_block_size_bytes());
> }
> 
> /*
>@@ -1659,6 +1649,10 @@ static int virtio_mem_init(struct virtio_mem *vm)
>   virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
>   &vm->region_size);
> 
>+  /* Determine the nid for the device based on the lowest address. */
>+  if (vm->nid == NUMA_NO_NODE)
>+  vm->nid = memory_add_physaddr_to_nid(vm->addr);
>+
>   /*
>* We always hotplug memory in memory block granularity. This way,
>* we have to wait for exactly one memory block to online.
>@@ -1707,7 +1701,7 @@ static int virtio_mem_init(struct virtio_mem *vm)
>memory_block_size_bytes());
>   dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
>(unsigned long long)vm->subblock_size);
>-  if (vm->nid != NUMA_NO_NODE)
>+  if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
>   dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
> 
>   return 0;
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH] kernel/resource: Fix use of ternary condition in release_mem_region_adjustable

2020-10-09 Thread Wei Yang
On Mon, Sep 21, 2020 at 11:07:48PM -0700, Nathan Chancellor wrote:
>Clang warns:
>
>kernel/resource.c:1281:53: warning: operator '?:' has lower precedence
>than '|'; '|' will be evaluated first
>[-Wbitwise-conditional-parentheses]
>new_res = alloc_resource(GFP_KERNEL | alloc_nofail ? __GFP_NOFAIL : 0);
> ~ ^
>kernel/resource.c:1281:53: note: place parentheses around the '|'
>expression to silence this warning
>new_res = alloc_resource(GFP_KERNEL | alloc_nofail ? __GFP_NOFAIL : 0);
> ~ ^
>kernel/resource.c:1281:53: note: place parentheses around the '?:'
>expression to evaluate it first
>new_res = alloc_resource(GFP_KERNEL | alloc_nofail ? __GFP_NOFAIL : 0);
>   ^
>  (  )
>1 warning generated.
>
>Add the parentheses as it was clearly intended for the ternary condition
>to be evaluated first.
>
>Fixes: 5fd23bd0d739 ("kernel/resource: make release_mem_region_adjustable() 
>never fail")
>Link: https://github.com/ClangBuiltLinux/linux/issues/1159
>Signed-off-by: Nathan Chancellor 

Reviewed-by: Wei Yang 

>---
>
>Presumably, this will be squashed but I included a fixes tag
>nonetheless. Apologies if this has already been noticed and fixed
>already, I did not find anything on LKML.
>
> kernel/resource.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
>diff --git a/kernel/resource.c b/kernel/resource.c
>index ca2a666e4317..3ae2f56cc79d 100644
>--- a/kernel/resource.c
>+++ b/kernel/resource.c
>@@ -1278,7 +1278,7 @@ void release_mem_region_adjustable(resource_size_t 
>start, resource_size_t size)
>* similarly).
>*/
> retry:
>-  new_res = alloc_resource(GFP_KERNEL | alloc_nofail ? __GFP_NOFAIL : 0);
>+  new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 
>0));
> 
>   p = &parent->child;
>   write_lock(&resource_lock);
>
>base-commit: 40ee82f47bf297e31d0c47547cd8f24ede52415a
>-- 
>2.28.0

-- 
Wei Yang
Help you, Help me


Re: [PATCH 5/6] ftrace: replace do_for_each_ftrace_rec() with for_ftrace_rec_iter()

2020-10-07 Thread Wei Yang
On Tue, Oct 06, 2020 at 10:42:17AM -0400, Steven Rostedt wrote:
>On Mon, 31 Aug 2020 11:11:03 +0800
>Wei Yang  wrote:
>
>> Now we have two similar infrastructure to iterate ftrace_page and
>> dyn_ftrace:
>> 
>>   * do_for_each_ftrace_rec()
>>   * for_ftrace_rec_iter()
>> 
>> The 2nd one, for_ftrace_rec_iter(), looks more generic, so preserve it
>> and replace do_for_each_ftrace_rec() with it.
>> 
>
>I also didn't pull in this patch. The reason is that the
>for_ftrace_rec_iter() is specific for external usage (for archs to use) and
>requires two function calls to do the iterations.
>
>The do_for_each_ftrace_rec() is a faster, light weight version, but for
>internal use only.
>
>I rather keep both, as each has their own, but slightly different, use
>cases.
>

Got it, thanks
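
For my own notes, the two iteration styles look roughly like this (a sketch
from memory, untested - see kernel/trace/ftrace.c and include/linux/ftrace.h
for the real definitions):

	struct ftrace_page *pg;
	struct dyn_ftrace *rec;
	struct ftrace_rec_iter *iter;

	/* internal, light weight version: open-codes the page/record walk */
	do_for_each_ftrace_rec(pg, rec) {
		/* use rec */
	} while_for_each_ftrace_rec();

	/* external iterator for archs: one function call per iteration step */
	for_ftrace_rec_iter(iter) {
		rec = ftrace_rec_iter_record(iter);
		/* use rec */
	}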

>-- Steve

-- 
Wei Yang
Help you, Help me


Re: [PATCH 1/6] ftrace: define seq_file only for FMODE_READ

2020-10-07 Thread Wei Yang
On Tue, Oct 06, 2020 at 10:36:38AM -0400, Steven Rostedt wrote:
>On Mon, 31 Aug 2020 11:10:59 +0800
>Wei Yang  wrote:
>
>> The purpose of the operation is to get ftrace_iterator, which is embedded
>> in file or seq_file for FMODE_WRITE/FMODE_READ respectively. Since we
>> don't have a seq_file for FMODE_WRITE case, it is meaningless to cast
>> file->private_data to seq_file.
>> 
>> Let's move the definition when there is a valid seq_file.
>
>I didn't pull in this patch because I find the original more expressive,
>and there's really no benefit in changing it.
>

Got it, thanks

>-- Steve
>
>
>> 
>> Signed-off-by: Wei Yang 
>> ---
>>  kernel/trace/ftrace.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>> 
>> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
>> index edc233122598..12cb535769bc 100644
>> --- a/kernel/trace/ftrace.c
>> +++ b/kernel/trace/ftrace.c
>> @@ -5558,7 +5558,6 @@ static void __init set_ftrace_early_filters(void)
>>  
>>  int ftrace_regex_release(struct inode *inode, struct file *file)
>>  {
>> -struct seq_file *m = (struct seq_file *)file->private_data;
>>  struct ftrace_iterator *iter;
>>  struct ftrace_hash **orig_hash;
>>  struct trace_parser *parser;
>> @@ -5566,6 +5565,7 @@ int ftrace_regex_release(struct inode *inode, struct 
>> file *file)
>>  int ret;
>>  
>>  if (file->f_mode & FMODE_READ) {
>> +struct seq_file *m = file->private_data;
>>  iter = m->private;
>>  seq_release(inode, file);
>>  } else

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 3/5] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-30 Thread Wei Yang
On Tue, Sep 29, 2020 at 12:12:14PM +0200, David Hildenbrand wrote:
>On 29.09.20 11:18, Wei Yang wrote:
>> On Mon, Sep 28, 2020 at 08:21:08PM +0200, David Hildenbrand wrote:
>>> Page isolation doesn't actually touch the pages, it simply isolates
>>> pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>>>
>>> We already place pages to the tail of the freelists when undoing
>>> isolation via __putback_isolated_page(), let's do it in any case
>>> (e.g., if order <= pageblock_order) and document the behavior.
>>>
>>> Add a "to_tail" parameter to move_freepages_block() but introduce a
>>> a new move_to_free_list_tail() - similar to add_to_free_list_tail().
>
>s/a a/a/
>
>>>
>>> This change results in all pages getting onlined via online_pages() to
>>> be placed to the tail of the freelist.
>>>
>>> Reviewed-by: Oscar Salvador 
>>> Cc: Andrew Morton 
>>> Cc: Alexander Duyck 
>>> Cc: Mel Gorman 
>>> Cc: Michal Hocko 
>>> Cc: Dave Hansen 
>>> Cc: Vlastimil Babka 
>>> Cc: Wei Yang 
>>> Cc: Oscar Salvador 
>>> Cc: Mike Rapoport 
>>> Cc: Scott Cheloha 
>>> Cc: Michael Ellerman 
>>> Signed-off-by: David Hildenbrand 
>>> ---
>>> include/linux/page-isolation.h |  4 ++--
>>> mm/page_alloc.c| 35 +++---
>>> mm/page_isolation.c| 12 +---
>>> 3 files changed, 35 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>>> index 572458016331..3eca9b3c5305 100644
>>> --- a/include/linux/page-isolation.h
>>> +++ b/include/linux/page-isolation.h
>>> @@ -36,8 +36,8 @@ static inline bool is_migrate_isolate(int migratetype)
>>> struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>>>  int migratetype, int flags);
>>> void set_pageblock_migratetype(struct page *page, int migratetype);
>>> -int move_freepages_block(struct zone *zone, struct page *page,
>>> -   int migratetype, int *num_movable);
>>> +int move_freepages_block(struct zone *zone, struct page *page, int 
>>> migratetype,
>>> +bool to_tail, int *num_movable);
>>>
>>> /*
>>>  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 9e3ed4a6f69a..d5a5f528b8ca 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -905,6 +905,15 @@ static inline void move_to_free_list(struct page 
>>> *page, struct zone *zone,
>>> list_move(&page->lru, &area->free_list[migratetype]);
>>> }
>>>
>>> +/* Used for pages which are on another list */
>>> +static inline void move_to_free_list_tail(struct page *page, struct zone 
>>> *zone,
>>> +     unsigned int order, int migratetype)
>>> +{
>>> +   struct free_area *area = &zone->free_area[order];
>>> +
>>> +   list_move_tail(&page->lru, &area->free_list[migratetype]);
>>> +}
>>> +
>> 
>> Would it be better to pass the *to_tail* to move_to_free_list(), so we won't
>> have a new function?
>
>Hi,
>
>thanks for the review!
>
>See discussion in RFC + cover letter:
>
>"Add a "to_tail" parameter to move_freepages_block() but introduce a new
>move_to_free_list_tail() - similar to add_to_free_list_tail()."

Hmm, sounds reasonable.

Reviewed-by: Wei Yang 

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 5/5] mm/memory_hotplug: update comment regarding zone shuffling

2020-09-29 Thread Wei Yang
On Mon, Sep 28, 2020 at 08:21:10PM +0200, David Hildenbrand wrote:
>As we no longer shuffle via generic_online_page() and when undoing
>isolation, we can simplify the comment.
>
>We now effectively shuffle only once (properly) when onlining new
>memory.
>
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> mm/memory_hotplug.c | 11 ---
> 1 file changed, 4 insertions(+), 7 deletions(-)
>
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index 9db80ee29caa..c589bd8801bb 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -859,13 +859,10 @@ int __ref online_pages(unsigned long pfn, unsigned long 
>nr_pages,
>   undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
> 
>   /*
>-   * When exposing larger, physically contiguous memory areas to the
>-   * buddy, shuffling in the buddy (when freeing onlined pages, putting
>-   * them either to the head or the tail of the freelist) is only helpful
>-   * for maintaining the shuffle, but not for creating the initial
>-   * shuffle. Shuffle the whole zone to make sure the just onlined pages
>-   * are properly distributed across the whole freelist. Make sure to
>-   * shuffle once pageblocks are no longer isolated.
>+   * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
>+   * the tail of the freelist when undoing isolation). Shuffle the whole
>+   * zone to make sure the just onlined pages are properly distributed
>+   * across the whole freelist - to create an initial shuffle.
>*/
>   shuffle_zone(zone);
> 
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 4/5] mm/page_alloc: place pages to tail in __free_pages_core()

2020-09-29 Thread Wei Yang
On Mon, Sep 28, 2020 at 08:21:09PM +0200, David Hildenbrand wrote:
>__free_pages_core() is used when exposing fresh memory to the buddy
>during system boot and when onlining memory in generic_online_page().
>
>generic_online_page() is used in two cases:
>
>1. Direct memory onlining in online_pages().
>2. Deferred memory onlining in memory-ballooning-like mechanisms (HyperV
>   balloon and virtio-mem), when parts of a section are kept
>   fake-offline to be fake-onlined later on.
>
>In 1, we already place pages to the tail of the freelist. Pages will be
>freed to MIGRATE_ISOLATE lists first and moved to the tail of the freelists
>via undo_isolate_page_range().
>
>In 2, we currently don't implement a proper rule. In case of virtio-mem,
>where we currently always online MAX_ORDER - 1 pages, the pages will be
>placed to the HEAD of the freelist - undesireable. While the hyper-v
>balloon calls generic_online_page() with single pages, usually it will
>call it on successive single pages in a larger block.
>
>The pages are fresh, so place them to the tail of the freelists and avoid
>the PCP. In __free_pages_core(), remove the now superflouos call to
>set_page_refcounted() and add a comment regarding page initialization and
>the refcount.
>
>Note: In 2. we currently don't shuffle. If ever relevant (page shuffling
>is usually of limited use in virtualized environments), we might want to
>shuffle after a sequence of generic_online_page() calls in the
>relevant callers.
>
>Reviewed-by: Vlastimil Babka 
>Reviewed-by: Oscar Salvador 
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Cc: "K. Y. Srinivasan" 
>Cc: Haiyang Zhang 
>Cc: Stephen Hemminger 
>Cc: Wei Liu 
>Signed-off-by: David Hildenbrand 
>---
> mm/page_alloc.c | 37 -
> 1 file changed, 24 insertions(+), 13 deletions(-)
>
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index d5a5f528b8ca..8a2134fe9947 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -270,7 +270,8 @@ bool pm_suspended_storage(void)
> unsigned int pageblock_order __read_mostly;
> #endif
> 
>-static void __free_pages_ok(struct page *page, unsigned int order);
>+static void __free_pages_ok(struct page *page, unsigned int order,
>+  fop_t fop_flags);
> 
> /*
>  * results with 256, 32 in the lowmem_reserve sysctl:
>@@ -682,7 +683,7 @@ static void bad_page(struct page *page, const char *reason)
> void free_compound_page(struct page *page)
> {
>   mem_cgroup_uncharge(page);
>-  __free_pages_ok(page, compound_order(page));
>+  __free_pages_ok(page, compound_order(page), FOP_NONE);
> }
> 
> void prep_compound_page(struct page *page, unsigned int order)
>@@ -1419,17 +1420,15 @@ static void free_pcppages_bulk(struct zone *zone, int 
>count,
>   spin_unlock(&zone->lock);
> }
> 
>-static void free_one_page(struct zone *zone,
>-  struct page *page, unsigned long pfn,
>-  unsigned int order,
>-  int migratetype)
>+static void free_one_page(struct zone *zone, struct page *page, unsigned long 
>pfn,
>+unsigned int order, int migratetype, fop_t fop_flags)
> {
>   spin_lock(&zone->lock);
>   if (unlikely(has_isolate_pageblock(zone) ||
>   is_migrate_isolate(migratetype))) {
>   migratetype = get_pfnblock_migratetype(page, pfn);
>   }
>-  __free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
>+  __free_one_page(page, pfn, zone, order, migratetype, fop_flags);
>   spin_unlock(&zone->lock);
> }
> 
>@@ -1507,7 +1506,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, 
>phys_addr_t end)
>   }
> }
> 
>-static void __free_pages_ok(struct page *page, unsigned int order)
>+static void __free_pages_ok(struct page *page, unsigned int order,
>+  fop_t fop_flags)
> {
>   unsigned long flags;
>   int migratetype;
>@@ -1519,7 +1519,8 @@ static void __free_pages_ok(struct page *page, unsigned 
>int order)
>   migratetype = get_pfnblock_migratetype(page, pfn);
>   local_irq_save(flags);
>   __count_vm_events(PGFREE, 1 << order);
>-  free_one_page(page_zone(page), page, pfn, order, migratetype);
>+  free_one_page(page_zone(page), page, pfn, order, migratetype,
>+fop_flags);
>   local_irq_restore(flags);
> }
> 
>@@ -1529,6 +1530,11 @@ void __free_pages_core(struct page *page, unsigned int 
>order)

Re: [PATCH v1 3/5] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-29 Thread Wei Yang
On Mon, Sep 28, 2020 at 08:21:08PM +0200, David Hildenbrand wrote:
>Page isolation doesn't actually touch the pages, it simply isolates
>pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>
>We already place pages to the tail of the freelists when undoing
>isolation via __putback_isolated_page(), let's do it in any case
>(e.g., if order <= pageblock_order) and document the behavior.
>
>Add a "to_tail" parameter to move_freepages_block() but introduce a
>a new move_to_free_list_tail() - similar to add_to_free_list_tail().
>
>This change results in all pages getting onlined via online_pages() to
>be placed to the tail of the freelist.
>
>Reviewed-by: Oscar Salvador 
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Cc: Scott Cheloha 
>Cc: Michael Ellerman 
>Signed-off-by: David Hildenbrand 
>---
> include/linux/page-isolation.h |  4 ++--
> mm/page_alloc.c| 35 +++---
> mm/page_isolation.c| 12 +---
> 3 files changed, 35 insertions(+), 16 deletions(-)
>
>diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>index 572458016331..3eca9b3c5305 100644
>--- a/include/linux/page-isolation.h
>+++ b/include/linux/page-isolation.h
>@@ -36,8 +36,8 @@ static inline bool is_migrate_isolate(int migratetype)
> struct page *has_unmovable_pages(struct zone *zone, struct page *page,
>int migratetype, int flags);
> void set_pageblock_migratetype(struct page *page, int migratetype);
>-int move_freepages_block(struct zone *zone, struct page *page,
>-  int migratetype, int *num_movable);
>+int move_freepages_block(struct zone *zone, struct page *page, int 
>migratetype,
>+   bool to_tail, int *num_movable);
> 
> /*
>  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index 9e3ed4a6f69a..d5a5f528b8ca 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -905,6 +905,15 @@ static inline void move_to_free_list(struct page *page, 
>struct zone *zone,
>   list_move(&page->lru, &area->free_list[migratetype]);
> }
> 
>+/* Used for pages which are on another list */
>+static inline void move_to_free_list_tail(struct page *page, struct zone 
>*zone,
>+unsigned int order, int migratetype)
>+{
>+  struct free_area *area = &zone->free_area[order];
>+
>+  list_move_tail(&page->lru, &area->free_list[migratetype]);
>+}
>+

Would it be better to pass the *to_tail* to move_to_free_list(), so we won't
have a new function?
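Something like below is what I have in mind - just a quick sketch of the
single-function variant (based on the two helpers in this patch, untested):

static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype,
				     bool to_tail)
{
	struct free_area *area = &zone->free_area[order];

	if (to_tail)
		list_move_tail(&page->lru, &area->free_list[migratetype]);
	else
		list_move(&page->lru, &area->free_list[migratetype]);
}

Then move_freepages() could simply forward its to_tail argument instead of
branching between two helpers.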

> static inline void del_page_from_free_list(struct page *page, struct zone 
> *zone,
>  unsigned int order)
> {
>@@ -2338,9 +2347,9 @@ static inline struct page *__rmqueue_cma_fallback(struct 
>zone *zone,
>  * Note that start_page and end_pages are not aligned on a pageblock
>  * boundary. If alignment is required, use move_freepages_block()
>  */
>-static int move_freepages(struct zone *zone,
>-struct page *start_page, struct page *end_page,
>-int migratetype, int *num_movable)
>+static int move_freepages(struct zone *zone, struct page *start_page,
>+struct page *end_page, int migratetype,
>+bool to_tail, int *num_movable)
> {
>   struct page *page;
>   unsigned int order;
>@@ -2371,7 +2380,10 @@ static int move_freepages(struct zone *zone,
>   VM_BUG_ON_PAGE(page_zone(page) != zone, page);
> 
>   order = page_order(page);
>-  move_to_free_list(page, zone, order, migratetype);
>+  if (to_tail)
>+  move_to_free_list_tail(page, zone, order, migratetype);
>+  else
>+  move_to_free_list(page, zone, order, migratetype);

And here, we just need to pass the *to_tail* to move_to_free_list().

>   page += 1 << order;
>   pages_moved += 1 << order;
>   }
>@@ -2379,8 +2391,8 @@ static int move_freepages(struct zone *zone,
>   return pages_moved;
> }
> 
>-int move_freepages_block(struct zone *zone, struct page *page,
>-  int migratetype, int *num_movable)
>+int move_freepages_block(struct zone *zone, struct page *page, int 
>migratetype,
>+   bool to_tail, int *num_movable)
> {
>   unsigned long start_pfn, end_pfn;
>   struct page *start_page, *end_page;

Re: [PATCH v1 2/5] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-29 Thread Wei Yang
On Mon, Sep 28, 2020 at 08:21:07PM +0200, David Hildenbrand wrote:
>__putback_isolated_page() already documents that pages will be placed to
>the tail of the freelist - this is, however, not the case for
>"order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
>the case for all existing users.
>
>This change affects two users:
>- free page reporting
>- page isolation, when undoing the isolation (including memory onlining).
>
>This behavior is desireable for pages that haven't really been touched
>lately, so exactly the two users that don't actually read/write page
>content, but rather move untouched pages.
>
>The new behavior is especially desirable for memory onlining, where we
>allow allocation of newly onlined pages via undo_isolate_page_range()
>in online_pages(). Right now, we always place them to the head of the
>free list, resulting in undesireable behavior: Assume we add
>individual memory chunks via add_memory() and online them right away to
>the NORMAL zone. We create a dependency chain of unmovable allocations
>e.g., via the memmap. The memmap of the next chunk will be placed onto
>previous chunks - if the last block cannot get offlined+removed, all
>dependent ones cannot get offlined+removed. While this can already be
>observed with individual DIMMs, it's more of an issue for virtio-mem
>(and I suspect also ppc DLPAR).
>
>Document that this should only be used for optimizations, and no code
>should realy on this for correction (if the order of freepage lists
>ever changes).
>
>We won't care about page shuffling: memory onlining already properly
>shuffles after onlining. free page reporting doesn't care about
>physically contiguous ranges, and there are already cases where page
>isolation will simply move (physically close) free pages to (currently)
>the head of the freelists via move_freepages_block() instead of
>shuffling. If this becomes ever relevant, we should shuffle the whole
>zone when undoing isolation of larger ranges, and after
>free_contig_range().
>
>Reviewed-by: Alexander Duyck 
>Reviewed-by: Oscar Salvador 
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Cc: Scott Cheloha 
>Cc: Michael Ellerman 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> mm/page_alloc.c | 18 --
> 1 file changed, 16 insertions(+), 2 deletions(-)
>
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index daab90e960fe..9e3ed4a6f69a 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -89,6 +89,18 @@ typedef int __bitwise fop_t;
>  */
> #define FOP_SKIP_REPORT_NOTIFY((__force fop_t)BIT(0))
> 
>+/*
>+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
>+ * page shuffling (relevant code - e.g., memory onlining - is expected to
>+ * shuffle the whole zone).
>+ *
>+ * Note: No code should rely onto this flag for correctness - it's purely
>+ *   to allow for optimizations when handing back either fresh pages
>+ *   (memory onlining) or untouched pages (page isolation, free page
>+ *   reporting).
>+ */
>+#define FOP_TO_TAIL   ((__force fop_t)BIT(1))
>+
> /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
> static DEFINE_MUTEX(pcp_batch_high_lock);
> #define MIN_PERCPU_PAGELIST_FRACTION  (8)
>@@ -1038,7 +1050,9 @@ static inline void __free_one_page(struct page *page, 
>unsigned long pfn,
> done_merging:
>   set_page_order(page, order);
> 
>-  if (is_shuffle_order(order))
>+  if (fop_flags & FOP_TO_TAIL)
>+  to_tail = true;
>+  else if (is_shuffle_order(order))
>   to_tail = shuffle_pick_tail();
>   else
>   to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
>@@ -3300,7 +3314,7 @@ void __putback_isolated_page(struct page *page, unsigned 
>int order, int mt)
> 
>   /* Return isolated page to tail of freelist. */
>   __free_one_page(page, page_to_pfn(page), zone, order, mt,
>-  FOP_SKIP_REPORT_NOTIFY);
>+  FOP_SKIP_REPORT_NOTIFY | FOP_TO_TAIL);
> }
> 
> /*
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v1 1/5] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-09-29 Thread Wei Yang
On Mon, Sep 28, 2020 at 08:21:06PM +0200, David Hildenbrand wrote:
>Let's prepare for additional flags and avoid long parameter lists of bools.
>Follow-up patches will also make use of the flags in __free_pages_ok(),
>however, I wasn't able to come up with a better name for the type - should
>be good enough for internal purposes.
>
>Reviewed-by: Alexander Duyck 
>Reviewed-by: Vlastimil Babka 
>Reviewed-by: Oscar Salvador 
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Signed-off-by: David Hildenbrand 

Reviewed-by: Wei Yang 

>---
> mm/page_alloc.c | 28 
> 1 file changed, 20 insertions(+), 8 deletions(-)
>
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index df90e3654f97..daab90e960fe 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -77,6 +77,18 @@
> #include "shuffle.h"
> #include "page_reporting.h"
> 
>+/* Free One Page flags: for internal, non-pcp variants of free_pages(). */
>+typedef int __bitwise fop_t;
>+
>+/* No special request */
>+#define FOP_NONE  ((__force fop_t)0)
>+
>+/*
>+ * Skip free page reporting notification for the (possibly merged) page. (will
>+ * *not* mark the page reported, only skip the notification).
>+ */
>+#define FOP_SKIP_REPORT_NOTIFY((__force fop_t)BIT(0))
>+
> /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
> static DEFINE_MUTEX(pcp_batch_high_lock);
> #define MIN_PERCPU_PAGELIST_FRACTION  (8)
>@@ -948,10 +960,9 @@ buddy_merge_likely(unsigned long pfn, unsigned long 
>buddy_pfn,
>  * -- nyc
>  */
> 
>-static inline void __free_one_page(struct page *page,
>-  unsigned long pfn,
>-  struct zone *zone, unsigned int order,
>-  int migratetype, bool report)
>+static inline void __free_one_page(struct page *page, unsigned long pfn,
>+ struct zone *zone, unsigned int order,
>+ int migratetype, fop_t fop_flags)
> {
>   struct capture_control *capc = task_capc(zone);
>   unsigned long buddy_pfn;
>@@ -1038,7 +1049,7 @@ static inline void __free_one_page(struct page *page,
>   add_to_free_list(page, zone, order, migratetype);
> 
>   /* Notify page reporting subsystem of freed page */
>-  if (report)
>+  if (!(fop_flags & FOP_SKIP_REPORT_NOTIFY))
>   page_reporting_notify_free(order);
> }
> 
>@@ -1379,7 +1390,7 @@ static void free_pcppages_bulk(struct zone *zone, int 
>count,
>   if (unlikely(isolated_pageblocks))
>   mt = get_pageblock_migratetype(page);
> 
>-  __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
>+  __free_one_page(page, page_to_pfn(page), zone, 0, mt, FOP_NONE);
>   trace_mm_page_pcpu_drain(page, 0, mt);
>   }
>   spin_unlock(>lock);
>   spin_unlock(&zone->lock);
>   is_migrate_isolate(migratetype))) {
>   migratetype = get_pfnblock_migratetype(page, pfn);
>   }
>-  __free_one_page(page, pfn, zone, order, migratetype, true);
>+  __free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
>   spin_unlock(>lock);
> }
> 
>@@ -3288,7 +3299,8 @@ void __putback_isolated_page(struct page *page, unsigned 
>int order, int mt)
>       lockdep_assert_held(&zone->lock);
> 
>   /* Return isolated page to tail of freelist. */
>-  __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
>+  __free_one_page(page, page_to_pfn(page), zone, order, mt,
>+  FOP_SKIP_REPORT_NOTIFY);
> }
> 
> /*
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


[PATCH] cgroup: remove redundant kernfs_activate in cgroup_setup_root()

2020-09-25 Thread Wei Yang
This step is already done in rebind_subsystems(), so it is not necessary to do
it again.

Signed-off-by: Wei Yang 
---
 kernel/cgroup/cgroup.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dd247747ec14..809b13588124 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2006,7 +2006,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 
ss_mask)
	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
 
-   kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
 
-- 
2.20.1 (Apple Git-117)



Re: [PATCH RFC 3/4] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-24 Thread Wei Yang
On Thu, Sep 24, 2020 at 01:13:29PM +0200, Vlastimil Babka wrote:
>On 9/16/20 8:34 PM, David Hildenbrand wrote:
>> Page isolation doesn't actually touch the pages, it simply isolates
>> pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>> 
>> We already place pages to the tail of the freelists when undoing
>> isolation via __putback_isolated_page(), let's do it in any case
>> (e.g., if order == pageblock_order) and document the behavior.
>> 
>> This change results in all pages getting onlined via online_pages() to
>> be placed to the tail of the freelist.
>> 
>> Cc: Andrew Morton 
>> Cc: Alexander Duyck 
>> Cc: Mel Gorman 
>> Cc: Michal Hocko 
>> Cc: Dave Hansen 
>> Cc: Vlastimil Babka 
>> Cc: Wei Yang 
>> Cc: Oscar Salvador 
>> Cc: Mike Rapoport 
>> Cc: Scott Cheloha 
>> Cc: Michael Ellerman 
>> Signed-off-by: David Hildenbrand 
>> ---
>>  include/linux/page-isolation.h |  2 ++
>>  mm/page_alloc.c| 36 +-
>>  mm/page_isolation.c|  8 ++--
>>  3 files changed, 39 insertions(+), 7 deletions(-)
>> 
>> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>> index 572458016331..a36be2cf4dbb 100644
>> --- a/include/linux/page-isolation.h
>> +++ b/include/linux/page-isolation.h
>> @@ -38,6 +38,8 @@ struct page *has_unmovable_pages(struct zone *zone, struct 
>> page *page,
>>  void set_pageblock_migratetype(struct page *page, int migratetype);
>>  int move_freepages_block(struct zone *zone, struct page *page,
>>  int migratetype, int *num_movable);
>> +int move_freepages_block_tail(struct zone *zone, struct page *page,
>> +  int migratetype);
>>  
>>  /*
>>   * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index bba9a0f60c70..75b0f49b4022 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -899,6 +899,15 @@ static inline void move_to_free_list(struct page *page, 
>> struct zone *zone,
>>  list_move(>lru, >free_list[migratetype]);
>>  }
>>  
>> +/* Used for pages which are on another list */
>> +static inline void move_to_free_list_tail(struct page *page, struct zone 
>> *zone,
>> +  unsigned int order, int migratetype)
>> +{
>> +struct free_area *area = &zone->free_area[order];
>> +
>> +list_move_tail(&page->lru, &area->free_list[migratetype]);
>> +}
>
>There are just 3 callers of move_to_free_list() before this patch, I would just
>add the to_tail parameter there instead of new wrapper. For callers with
>constant parameter, the inline will eliminate it anyway.

Got the same feeling :-)

>
>>  static inline void del_page_from_free_list(struct page *page, struct zone 
>> *zone,
>> unsigned int order)
>>  {
>> @@ -2323,7 +2332,7 @@ static inline struct page 
>> *__rmqueue_cma_fallback(struct zone *zone,
>>   */
>>  static int move_freepages(struct zone *zone,
>>struct page *start_page, struct page *end_page,
>> -  int migratetype, int *num_movable)
>> +  int migratetype, int *num_movable, bool to_tail)
>>  {
>>  struct page *page;
>>  unsigned int order;
>> @@ -2354,7 +2363,10 @@ static int move_freepages(struct zone *zone,
>>  VM_BUG_ON_PAGE(page_zone(page) != zone, page);
>>  
>>  order = page_order(page);
>> -move_to_free_list(page, zone, order, migratetype);
>> +if (to_tail)
>> +move_to_free_list_tail(page, zone, order, migratetype);
>> +else
>> +move_to_free_list(page, zone, order, migratetype);
>>  page += 1 << order;
>>  pages_moved += 1 << order;
>>  }
>> @@ -2362,8 +2374,9 @@ static int move_freepages(struct zone *zone,
>>  return pages_moved;
>>  }
>>  
>> -int move_freepages_block(struct zone *zone, struct page *page,
>> -int migratetype, int *num_movable)
>> +static int __move_freepages_block(struct zone *zone, struct page *page,
>> +  int migratetype, int *num_movable,
>> +  bool to_tail)
>>  {
>>  unsigned long start_pfn, end_pfn;
>>  struct page *start_page, *end_page;
>> @

Re: [PATCH RFC 0/4] mm: place pages to the freelist tail when onling and undoing isolation

2020-09-23 Thread Wei Yang
On Wed, Sep 23, 2020 at 04:31:25PM +0200, Vlastimil Babka wrote:
>On 9/16/20 9:31 PM, David Hildenbrand wrote:
>> 
>> 
>>> Am 16.09.2020 um 20:50 schrieb osalva...@suse.de:
>>> 
>>> On 2020-09-16 20:34, David Hildenbrand wrote:
>>>> When adding separate memory blocks via add_memory*() and onlining them
>>>> immediately, the metadata (especially the memmap) of the next block will be
>>>> placed onto one of the just added+onlined block. This creates a chain
>>>> of unmovable allocations: If the last memory block cannot get
>>>> offlined+removed() so will all dependant ones. We directly have unmovable
>>>> allocations all over the place.
>>>> This can be observed quite easily using virtio-mem, however, it can also
>>>> be observed when using DIMMs. The freshly onlined pages will usually be
>>>> placed to the head of the freelists, meaning they will be allocated next,
>>>> turning the just-added memory usually immediately un-removable. The
>>>> fresh pages are cold, prefering to allocate others (that might be hot)
>>>> also feels to be the natural thing to do.
>>>> It also applies to the hyper-v balloon xen-balloon, and ppc64 dlpar: when
>>>> adding separate, successive memory blocks, each memory block will have
>>>> unmovable allocations on them - for example gigantic pages will fail to
>>>> allocate.
>>>> While the ZONE_NORMAL doesn't provide any guarantees that memory can get
>>>> offlined+removed again (any kind of fragmentation with unmovable
>>>> allocations is possible), there are many scenarios (hotplugging a lot of
>>>> memory, running workload, hotunplug some memory/as much as possible) where
>>>> we can offline+remove quite a lot with this patchset.
>>> 
>>> Hi David,
>>> 
>> 
>> Hi Oscar.
>> 
>>> I did not read through the patchset yet, so sorry if the question is 
>>> nonsense, but is this not trying to fix the same issue the vmemmap patches 
>>> did? [1]
>> 
>> Not nonesense at all. It only helps to some degree, though. It solves the 
>> dependencies due to the memmap. However, it‘s not completely ideal, 
>> especially for single memory blocks.
>> 
>> With single memory blocks (virtio-mem, xen-balloon, hv balloon, ppc dlpar) 
>> you still have unmovable (vmemmap chunks) all over the physical address 
>> space. Consider the gigantic page example after hotplug. You directly 
>> fragmented all hotplugged memory.
>> 
>> Of course, there might be (less extreme) dependencies due page tables for 
>> the identity mapping, extended struct pages and similar.
>> 
>> Having that said, there are other benefits when preferring other memory over 
>> just hotplugged memory. Think about adding+onlining memory during boot 
>> (dimms under QEMU, virtio-mem), once the system is up you will have most 
>> (all) of that memory completely untouched.
>> 
>> So while vmemmap on hotplugged memory would tackle some part of the issue, 
>> there are cases where this approach is better, and there are even benefits 
>> when combining both.
>
>I see the point, but I don't think the head/tail mechanism is great for this. 
>It
>might sort of work, but with other interfering activity there are no guarantees
>and it relies on a subtle implementation detail. There are better mechanisms
>possible I think, such as preparing a larger MIGRATE_UNMOVABLE area in the
>existing memory before we allocate those long-term management structures. Or
>onlining a bunch of blocks as zone_movable first and only later convert to
>zone_normal in a controlled way when existing normal zone becomes depeted?
>

To be honest, David's approach is easy for me to understand.

And I don't see any negative effect.

>I guess it's an issue that the e.g. 128M block onlines are so disconnected from
>each other it's hard to employ a strategy that works best for e.g. a whole 
>bunch
>of GB onlined at once. But I noticed some effort towards new API, so maybe that
>will be solved there too?
>
>> Thanks!
>> 
>> David
>> 
>>> 
>>> I was about to give it a new respin now that thw hwpoison stuff has been 
>>> settled.
>>> 
>>> [1] https://patchwork.kernel.org/cover/11059175/
>>> 
>> 

-- 
Wei Yang
Help you, Help me


[PATCH] mm/mempolicy: remove or narrow the lock on current

2020-09-20 Thread Wei Yang
It is not necessary to hold current's task lock while setting the nodemask of
a new policy.

Signed-off-by: Wei Yang 
---
 mm/mempolicy.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 906adc58d86f..3fde772ef5ef 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -875,13 +875,12 @@ static long do_set_mempolicy(unsigned short mode, 
unsigned short flags,
goto out;
}
 
-   task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
-   task_unlock(current);
mpol_put(new);
goto out;
}
+   task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1324,9 +1323,7 @@ static long do_mbind(unsigned long start, unsigned long 
len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap_write_lock(mm);
-   task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
-   task_unlock(current);
if (err)
mmap_write_unlock(mm);
} else
-- 
2.20.1 (Apple Git-117)



[PATCH] mm: remove unused alloc_page_vma_node()

2020-09-20 Thread Wei Yang
No one uses this macro anymore.

Also fix the code style of policy_node().

Signed-off-by: Wei Yang 
---
 include/linux/gfp.h | 2 --
 mm/mempolicy.c  | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 67a0774e080b..e83cc7f5a2fc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -560,8 +560,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int 
order,
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)\
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
-#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
-   alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eddbe4e56c73..906adc58d86f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1885,8 +1885,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy 
*policy)
 }
 
 /* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
-   int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
-- 
2.20.1 (Apple Git-117)



Re: [PATCH RFC 2/4] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-20 Thread Wei Yang
On Fri, Sep 18, 2020 at 09:27:23AM +0200, David Hildenbrand wrote:
>On 18.09.20 04:07, Wei Yang wrote:
>> On Wed, Sep 16, 2020 at 08:34:09PM +0200, David Hildenbrand wrote:
>>> __putback_isolated_page() already documents that pages will be placed to
>>> the tail of the freelist - this is, however, not the case for
>>> "order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
>>> the case for all existing users.
>>>
>>> This change affects two users:
>>> - free page reporting
>>> - page isolation, when undoing the isolation.
>>>
>>> This behavior is desireable for pages that haven't really been touched
>>> lately, so exactly the two users that don't actually read/write page
>>> content, but rather move untouched pages.
>>>
>>> The new behavior is especially desirable for memory onlining, where we
>>> allow allocation of newly onlined pages via undo_isolate_page_range()
>>> in online_pages(). Right now, we always place them to the head of the
>> 
>> The code looks good, while I don't fully understand the log here.
>> 
>> undo_isolate_page_range() is used in __offline_pages and alloc_contig_range. 
>> I
>> don't connect them with online_pages(). Do I miss something?
>
>Yeah, please look at -mm / -next instead. See
>
>https://lkml.kernel.org/r/20200819175957.28465-11-da...@redhat.com
>

Thanks, I get the point.

>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH RFC 2/4] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-17 Thread Wei Yang
On Wed, Sep 16, 2020 at 08:34:09PM +0200, David Hildenbrand wrote:
>__putback_isolated_page() already documents that pages will be placed to
>the tail of the freelist - this is, however, not the case for
>"order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
>the case for all existing users.
>
>This change affects two users:
>- free page reporting
>- page isolation, when undoing the isolation.
>
>This behavior is desireable for pages that haven't really been touched
>lately, so exactly the two users that don't actually read/write page
>content, but rather move untouched pages.
>
>The new behavior is especially desirable for memory onlining, where we
>allow allocation of newly onlined pages via undo_isolate_page_range()
>in online_pages(). Right now, we always place them to the head of the

The code looks good, but I don't fully understand the changelog here.

undo_isolate_page_range() is used in __offline_pages() and alloc_contig_range();
I don't see the connection to online_pages(). Am I missing something?

>free list, resulting in undesireable behavior: Assume we add
>individual memory chunks via add_memory() and online them right away to
>the NORMAL zone. We create a dependency chain of unmovable allocations
>e.g., via the memmap. The memmap of the next chunk will be placed onto
>previous chunks - if the last block cannot get offlined+removed, all
>dependent ones cannot get offlined+removed. While this can already be
>observed with individual DIMMs, it's more of an issue for virtio-mem
>(and I suspect also ppc DLPAR).
>
>Note: If we observe a degradation due to the changed page isolation
>behavior (which I doubt), we can always make this configurable by the
>instance triggering undo of isolation (e.g., alloc_contig_range(),
>memory onlining, memory offlining).
>
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Cc: Scott Cheloha 
>Cc: Michael Ellerman 
>Signed-off-by: David Hildenbrand 
>---
> mm/page_alloc.c | 10 +-
> 1 file changed, 9 insertions(+), 1 deletion(-)
>
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index 91cefb8157dd..bba9a0f60c70 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -89,6 +89,12 @@ typedef int __bitwise fop_t;
>  */
> #define FOP_SKIP_REPORT_NOTIFY((__force fop_t)BIT(0))
> 
>+/*
>+ * Place the freed page to the tail of the freelist after buddy merging. Will
>+ * get ignored with page shuffling enabled.
>+ */
>+#define FOP_TO_TAIL   ((__force fop_t)BIT(1))
>+
> /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
> static DEFINE_MUTEX(pcp_batch_high_lock);
> #define MIN_PERCPU_PAGELIST_FRACTION  (8)
>@@ -1040,6 +1046,8 @@ static inline void __free_one_page(struct page *page, 
>unsigned long pfn,
> 
>   if (is_shuffle_order(order))
>   to_tail = shuffle_pick_tail();
>+  else if (fop_flags & FOP_TO_TAIL)
>+  to_tail = true;
>   else
>   to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
> 
>@@ -3289,7 +3297,7 @@ void __putback_isolated_page(struct page *page, unsigned 
>int order, int mt)
> 
>   /* Return isolated page to tail of freelist. */
>   __free_one_page(page, page_to_pfn(page), zone, order, mt,
>-  FOP_SKIP_REPORT_NOTIFY);
>+  FOP_SKIP_REPORT_NOTIFY | FOP_TO_TAIL);
> }
> 
> /*
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH RFC 0/4] mm: place pages to the freelist tail when onling and undoing isolation

2020-09-17 Thread Wei Yang
On Wed, Sep 16, 2020 at 09:31:21PM +0200, David Hildenbrand wrote:
>
>
>> Am 16.09.2020 um 20:50 schrieb osalva...@suse.de:
>> 
>> On 2020-09-16 20:34, David Hildenbrand wrote:
>>> When adding separate memory blocks via add_memory*() and onlining them
>>> immediately, the metadata (especially the memmap) of the next block will be
>>> placed onto one of the just added+onlined block. This creates a chain
>>> of unmovable allocations: If the last memory block cannot get
>>> offlined+removed() so will all dependant ones. We directly have unmovable
>>> allocations all over the place.
>>> This can be observed quite easily using virtio-mem, however, it can also
>>> be observed when using DIMMs. The freshly onlined pages will usually be
>>> placed to the head of the freelists, meaning they will be allocated next,
>>> turning the just-added memory usually immediately un-removable. The
>>> fresh pages are cold, prefering to allocate others (that might be hot)
>>> also feels to be the natural thing to do.
>>> It also applies to the hyper-v balloon xen-balloon, and ppc64 dlpar: when
>>> adding separate, successive memory blocks, each memory block will have
>>> unmovable allocations on them - for example gigantic pages will fail to
>>> allocate.
>>> While the ZONE_NORMAL doesn't provide any guarantees that memory can get
>>> offlined+removed again (any kind of fragmentation with unmovable
>>> allocations is possible), there are many scenarios (hotplugging a lot of
>>> memory, running workload, hotunplug some memory/as much as possible) where
>>> we can offline+remove quite a lot with this patchset.
>> 
>> Hi David,
>> 
>
>Hi Oscar.
>
>> I did not read through the patchset yet, so sorry if the question is 
>> nonsense, but is this not trying to fix the same issue the vmemmap patches 
>> did? [1]
>
>Not nonesense at all. It only helps to some degree, though. It solves the 
>dependencies due to the memmap. However, it‘s not completely ideal, especially 
>for single memory blocks.
>
>With single memory blocks (virtio-mem, xen-balloon, hv balloon, ppc dlpar) you 
>still have unmovable (vmemmap chunks) all over the physical address space. 
>Consider the gigantic page example after hotplug. You directly fragmented all 
>hotplugged memory.
>
>Of course, there might be (less extreme) dependencies due page tables for the 
>identity mapping, extended struct pages and similar.
>
>Having that said, there are other benefits when preferring other memory over 
>just hotplugged memory. Think about adding+onlining memory during boot (dimms 
>under QEMU, virtio-mem), once the system is up you will have most (all) of 
>that memory completely untouched.
>
>So while vmemmap on hotplugged memory would tackle some part of the issue, 
>there are cases where this approach is better, and there are even benefits 
>when combining both.

Although all of this changes once page shuffling is enabled.

>
>Thanks!
>
>David
>
>> 
>> I was about to give it a new respin now that thw hwpoison stuff has been 
>> settled.
>> 
>> [1] https://patchwork.kernel.org/cover/11059175/
>> 

-- 
Wei Yang
Help you, Help me


Re: [PATCH RFC 3/4] mm/page_alloc: always move pages to the tail of the freelist in unset_migratetype_isolate()

2020-09-17 Thread Wei Yang
On Wed, Sep 16, 2020 at 08:34:10PM +0200, David Hildenbrand wrote:
>Page isolation doesn't actually touch the pages, it simply isolates
>pageblocks and moves all free pages to the MIGRATE_ISOLATE freelist.
>
>We already place pages to the tail of the freelists when undoing
>isolation via __putback_isolated_page(), let's do it in any case
>(e.g., if order == pageblock_order) and document the behavior.
>
>This change results in all pages getting onlined via online_pages() to
>be placed to the tail of the freelist.

Sorry, I am not able to follow again. unset_migratetype_isolate() is used in
__offline_pages(), if my understanding is correct. How does it contribute to
online_pages()?

>
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Cc: Scott Cheloha 
>Cc: Michael Ellerman 
>Signed-off-by: David Hildenbrand 
>---
> include/linux/page-isolation.h |  2 ++
> mm/page_alloc.c| 36 +-
> mm/page_isolation.c|  8 ++--
> 3 files changed, 39 insertions(+), 7 deletions(-)
>
>diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
>index 572458016331..a36be2cf4dbb 100644
>--- a/include/linux/page-isolation.h
>+++ b/include/linux/page-isolation.h
>@@ -38,6 +38,8 @@ struct page *has_unmovable_pages(struct zone *zone, struct 
>page *page,
> void set_pageblock_migratetype(struct page *page, int migratetype);
> int move_freepages_block(struct zone *zone, struct page *page,
>   int migratetype, int *num_movable);
>+int move_freepages_block_tail(struct zone *zone, struct page *page,
>+int migratetype);
> 
> /*
>  * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE.
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index bba9a0f60c70..75b0f49b4022 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -899,6 +899,15 @@ static inline void move_to_free_list(struct page *page, 
>struct zone *zone,
>   list_move(&page->lru, &area->free_list[migratetype]);
> }
> 
>+/* Used for pages which are on another list */
>+static inline void move_to_free_list_tail(struct page *page, struct zone 
>*zone,
>+unsigned int order, int migratetype)
>+{
>+  struct free_area *area = &zone->free_area[order];
>+
>+  list_move_tail(&page->lru, &area->free_list[migratetype]);
>+}
>+
> static inline void del_page_from_free_list(struct page *page, struct zone 
> *zone,
>  unsigned int order)
> {
>@@ -2323,7 +2332,7 @@ static inline struct page *__rmqueue_cma_fallback(struct 
>zone *zone,
>  */
> static int move_freepages(struct zone *zone,
> struct page *start_page, struct page *end_page,
>-int migratetype, int *num_movable)
>+int migratetype, int *num_movable, bool to_tail)
> {
>   struct page *page;
>   unsigned int order;
>@@ -2354,7 +2363,10 @@ static int move_freepages(struct zone *zone,
>   VM_BUG_ON_PAGE(page_zone(page) != zone, page);
> 
>   order = page_order(page);
>-  move_to_free_list(page, zone, order, migratetype);
>+  if (to_tail)
>+  move_to_free_list_tail(page, zone, order, migratetype);
>+  else
>+  move_to_free_list(page, zone, order, migratetype);
>   page += 1 << order;
>   pages_moved += 1 << order;
>   }
>@@ -2362,8 +2374,9 @@ static int move_freepages(struct zone *zone,
>   return pages_moved;
> }
> 
>-int move_freepages_block(struct zone *zone, struct page *page,
>-  int migratetype, int *num_movable)
>+static int __move_freepages_block(struct zone *zone, struct page *page,
>+int migratetype, int *num_movable,
>+bool to_tail)
> {
>   unsigned long start_pfn, end_pfn;
>   struct page *start_page, *end_page;
>@@ -2384,7 +2397,20 @@ int move_freepages_block(struct zone *zone, struct page 
>*page,
>   return 0;
> 
>   return move_freepages(zone, start_page, end_page, migratetype,
>-  num_movable);
>+num_movable, to_tail);
>+}
>+
>+int move_freepages_block(struct zone *zone, struct page *page,
>+   int migratetype, int *num_movable)
>+{
>+  return __move_freepages_block(zone, page, migratetype, num_movable

Re: [PATCH RFC 2/4] mm/page_alloc: place pages to tail in __putback_isolated_page()

2020-09-17 Thread Wei Yang
On Wed, Sep 16, 2020 at 08:34:09PM +0200, David Hildenbrand wrote:
>__putback_isolated_page() already documents that pages will be placed to
>the tail of the freelist - this is, however, not the case for
>"order >= MAX_ORDER - 2" (see buddy_merge_likely()) - which should be
>the case for all existing users.
>
>This change affects two users:
>- free page reporting
>- page isolation, when undoing the isolation.
>
>This behavior is desireable for pages that haven't really been touched
>lately, so exactly the two users that don't actually read/write page
>content, but rather move untouched pages.
>
>The new behavior is especially desirable for memory onlining, where we
>allow allocation of newly onlined pages via undo_isolate_page_range()
>in online_pages(). Right now, we always place them to the head of the
>free list, resulting in undesireable behavior: Assume we add
>individual memory chunks via add_memory() and online them right away to
>the NORMAL zone. We create a dependency chain of unmovable allocations
>e.g., via the memmap. The memmap of the next chunk will be placed onto
>previous chunks - if the last block cannot get offlined+removed, all
>dependent ones cannot get offlined+removed. While this can already be
>observed with individual DIMMs, it's more of an issue for virtio-mem
>(and I suspect also ppc DLPAR).
>
>Note: If we observe a degradation due to the changed page isolation
>behavior (which I doubt), we can always make this configurable by the
>instance triggering undo of isolation (e.g., alloc_contig_range(),
>memory onlining, memory offlining).
>
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Cc: Scott Cheloha 
>Cc: Michael Ellerman 
>Signed-off-by: David Hildenbrand 
>---
> mm/page_alloc.c | 10 +-
> 1 file changed, 9 insertions(+), 1 deletion(-)
>
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index 91cefb8157dd..bba9a0f60c70 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -89,6 +89,12 @@ typedef int __bitwise fop_t;
>  */
> #define FOP_SKIP_REPORT_NOTIFY((__force fop_t)BIT(0))
> 
>+/*
>+ * Place the freed page to the tail of the freelist after buddy merging. Will
>+ * get ignored with page shuffling enabled.
>+ */
>+#define FOP_TO_TAIL   ((__force fop_t)BIT(1))
>+
> /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
> static DEFINE_MUTEX(pcp_batch_high_lock);
> #define MIN_PERCPU_PAGELIST_FRACTION  (8)
>@@ -1040,6 +1046,8 @@ static inline void __free_one_page(struct page *page, 
>unsigned long pfn,
> 
>   if (is_shuffle_order(order))
>   to_tail = shuffle_pick_tail();
>+  else if (fop_flags & FOP_TO_TAIL)
>+  to_tail = true;

Taking another look at this part - maybe we can move this check to the top?

For the online_pages case, we currently have the following call flow:

    online_pages
        online_pages_range
        shuffle_zone

This means we always shuffle the newly added pages. Maybe we don't need
to do the shuffle when adding them to the free_list?
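
If it helps, here is a rough sketch of what I mean by checking FOP_TO_TAIL
first (untested, just to illustrate the idea):

	if (fop_flags & FOP_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);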

>   else
>   to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
> 
>@@ -3289,7 +3297,7 @@ void __putback_isolated_page(struct page *page, unsigned 
>int order, int mt)
> 
>   /* Return isolated page to tail of freelist. */
>   __free_one_page(page, page_to_pfn(page), zone, order, mt,
>-  FOP_SKIP_REPORT_NOTIFY);
>+  FOP_SKIP_REPORT_NOTIFY | FOP_TO_TAIL);
> }
> 
> /*
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH RFC 1/4] mm/page_alloc: convert "report" flag of __free_one_page() to a proper flag

2020-09-17 Thread Wei Yang
On Wed, Sep 16, 2020 at 08:34:08PM +0200, David Hildenbrand wrote:
>Let's prepare for additional flags and avoid long parameter lists of bools.
>Follow-up patches will also make use of the flags in __free_pages_ok(),
>however, I wasn't able to come up with a better name for the type - should
>be good enough for internal purposes.
>
>Cc: Andrew Morton 
>Cc: Alexander Duyck 
>Cc: Mel Gorman 
>Cc: Michal Hocko 
>Cc: Dave Hansen 
>Cc: Vlastimil Babka 
>Cc: Wei Yang 
>Cc: Oscar Salvador 
>Cc: Mike Rapoport 
>Signed-off-by: David Hildenbrand 
>---
> mm/page_alloc.c | 28 
> 1 file changed, 20 insertions(+), 8 deletions(-)
>
>diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>index 6b699d273d6e..91cefb8157dd 100644
>--- a/mm/page_alloc.c
>+++ b/mm/page_alloc.c
>@@ -77,6 +77,18 @@
> #include "shuffle.h"
> #include "page_reporting.h"
> 
>+/* Free One Page flags: for internal, non-pcp variants of free_pages(). */
>+typedef int __bitwise fop_t;
>+
>+/* No special request */
>+#define FOP_NONE  ((__force fop_t)0)
>+
>+/*
>+ * Skip free page reporting notification after buddy merging (will *not* mark

__free_one_page() may not do any buddy merging when the buddy is not available.

Would this comment be a little confusing?

>+ * the page reported, only skip the notification).
>+ */
>+#define FOP_SKIP_REPORT_NOTIFY((__force fop_t)BIT(0))
>+
> /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
> static DEFINE_MUTEX(pcp_batch_high_lock);
> #define MIN_PERCPU_PAGELIST_FRACTION  (8)
>@@ -948,10 +960,9 @@ buddy_merge_likely(unsigned long pfn, unsigned long 
>buddy_pfn,
>  * -- nyc
>  */
> 
>-static inline void __free_one_page(struct page *page,
>-  unsigned long pfn,
>-  struct zone *zone, unsigned int order,
>-  int migratetype, bool report)
>+static inline void __free_one_page(struct page *page, unsigned long pfn,
>+ struct zone *zone, unsigned int order,
>+ int migratetype, fop_t fop_flags)
> {
>   struct capture_control *capc = task_capc(zone);
>   unsigned long buddy_pfn;
>@@ -1038,7 +1049,7 @@ static inline void __free_one_page(struct page *page,
>   add_to_free_list(page, zone, order, migratetype);
> 
>   /* Notify page reporting subsystem of freed page */
>-  if (report)
>+  if (!(fop_flags & FOP_SKIP_REPORT_NOTIFY))
>   page_reporting_notify_free(order);
> }
> 
>@@ -1368,7 +1379,7 @@ static void free_pcppages_bulk(struct zone *zone, int 
>count,
>   if (unlikely(isolated_pageblocks))
>   mt = get_pageblock_migratetype(page);
> 
>-  __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
>+  __free_one_page(page, page_to_pfn(page), zone, 0, mt, FOP_NONE);
>   trace_mm_page_pcpu_drain(page, 0, mt);
>   }
>   spin_unlock(&zone->lock);
>@@ -1384,7 +1395,7 @@ static void free_one_page(struct zone *zone,
>   is_migrate_isolate(migratetype))) {
>   migratetype = get_pfnblock_migratetype(page, pfn);
>   }
>-  __free_one_page(page, pfn, zone, order, migratetype, true);
>+  __free_one_page(page, pfn, zone, order, migratetype, FOP_NONE);
>   spin_unlock(&zone->lock);
> }
> 
>@@ -3277,7 +3288,8 @@ void __putback_isolated_page(struct page *page, unsigned 
>int order, int mt)
>   lockdep_assert_held(&zone->lock);
> 
>   /* Return isolated page to tail of freelist. */
>-  __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
>+  __free_one_page(page, page_to_pfn(page), zone, order, mt,
>+  FOP_SKIP_REPORT_NOTIFY);
> }
> 
> /*
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


[PATCH] mm/mmap: do the check on expand instead of on insert

2020-09-17 Thread Wei Yang
Function __vma_adjust() checks *insert* to decide whether it is necessary
to remove_next/adjust_next. It is more reasonable to do the check on
*expand* instead, since this work is only necessary when *expand* is
non-NULL.

There are several users for __vma_adjust, let's classify them based on
the value of *insert*/*expand*:

   caller  |insert/expand
   +-
   vma_merge   |NULL/non-NULL
   __split_vma |non-NULL/NULL
   shift_arg_pages/mremap  |NULL/NULL

To make this change, we need to make sure the callers that used to enter
this if branch but have a NULL *expand* do not actually rely on it. There
are two such cases to take care of: shift_arg_pages and mremap. Let's look
at them one by one.

For mremap, it is certain that nothing in this branch would take effect,
since vma_adjust() is called to expand the vma and the vma is known to be
expandable (the new end does not intersect with the next vma).

For shift_arg_pages, the case is a little tricky. Here vma->vm_next should
be NULL; otherwise we would go into the "end < vma->vm_end" branch since we
are shifting left, which means we would expand vma->vm_next by accident.
Luckily, in the current code we never fall into this situation, because
shift_arg_pages() only shifts the stack, which is the highest vma in the
virtual address space.

To make the code easier to understand (only vma_merge() has a non-NULL
*expand*), and to handle the corner case (shift_arg_pages) properly, let's
do the check on *expand* instead of on *insert*.

Signed-off-by: Wei Yang 
---
 mm/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 829897646a9c..ca31b405fbfa 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -706,7 +706,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
long adjust_next = 0;
int remove_next = 0;
 
-   if (next && !insert) {
+   if (next && expand) {
struct vm_area_struct *exporter = NULL, *importer = NULL;
 
if (end >= next->vm_end) {
-- 
2.20.1 (Apple Git-117)



Re: [PATCH] kernel/resource: make iomem_resource implicit in release_mem_region_adjustable()

2020-09-16 Thread Wei Yang
On Wed, Sep 16, 2020 at 09:30:41AM +0200, David Hildenbrand wrote:
>"mem" in the name already indicates the root, similar to
>release_mem_region() and devm_request_mem_region(). Make it implicit.
>The only single caller always passes iomem_resource, other parents are
>not applicable.
>

Looks good to me.

Reviewed-by: Wei Yang 

>Suggested-by: Wei Yang 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Dan Williams 
>Cc: Jason Gunthorpe 
>Cc: Kees Cook 
>Cc: Ard Biesheuvel 
>Cc: Pankaj Gupta 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 
>---
>
>Based on next-20200915. Follow up on
>   "[PATCH v4 0/8] selective merging of system ram resources" [1]
>That's in next-20200915. As noted during review of v2 by Wei [2].
>
>[1] https://lkml.kernel.org/r/20200911103459.10306-1-da...@redhat.com
>[2] https://lkml.kernel.org/r/20200915021012.GC2007@L-31X9LVDL-1304.local
>
>---
> include/linux/ioport.h | 3 +--
> kernel/resource.c  | 5 ++---
> mm/memory_hotplug.c| 2 +-
> 3 files changed, 4 insertions(+), 6 deletions(-)
>
>diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>index 7e61389dcb01..5135d4b86cd6 100644
>--- a/include/linux/ioport.h
>+++ b/include/linux/ioport.h
>@@ -251,8 +251,7 @@ extern struct resource * __request_region(struct resource 
>*,
> extern void __release_region(struct resource *, resource_size_t,
>   resource_size_t);
> #ifdef CONFIG_MEMORY_HOTREMOVE
>-extern void release_mem_region_adjustable(struct resource *, resource_size_t,
>-resource_size_t);
>+extern void release_mem_region_adjustable(resource_size_t, resource_size_t);
> #endif
> #ifdef CONFIG_MEMORY_HOTPLUG
> extern void merge_system_ram_resource(struct resource *res);
>diff --git a/kernel/resource.c b/kernel/resource.c
>index 7a91b935f4c2..ca2a666e4317 100644
>--- a/kernel/resource.c
>+++ b/kernel/resource.c
>@@ -1240,7 +1240,6 @@ EXPORT_SYMBOL(__release_region);
> #ifdef CONFIG_MEMORY_HOTREMOVE
> /**
>  * release_mem_region_adjustable - release a previously reserved memory region
>- * @parent: parent resource descriptor
>  * @start: resource start address
>  * @size: resource region size
>  *
>@@ -1258,9 +1257,9 @@ EXPORT_SYMBOL(__release_region);
>  *   assumes that all children remain in the lower address entry for
>  *   simplicity.  Enhance this logic when necessary.
>  */
>-void release_mem_region_adjustable(struct resource *parent,
>- resource_size_t start, resource_size_t size)
>+void release_mem_region_adjustable(resource_size_t start, resource_size_t 
>size)
> {
>+  struct resource *parent = &iomem_resource;
>   struct resource *new_res = NULL;
>   bool alloc_nofail = false;
>   struct resource **p;
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index 553c718226b3..7c5e4744ac51 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -1764,7 +1764,7 @@ static int __ref try_remove_memory(int nid, u64 start, 
>u64 size)
>   memblock_remove(start, size);
>   }
> 
>-  release_mem_region_adjustable(&iomem_resource, start, size);
>+  release_mem_region_adjustable(start, size);
> 
>   try_offline_node(nid);
> 
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v2 1/7] kernel/resource: make release_mem_region_adjustable() never fail

2020-09-15 Thread Wei Yang
On Tue, Sep 15, 2020 at 11:15:53AM +0200, David Hildenbrand wrote:
>On 15.09.20 11:06, Wei Yang wrote:
>> On Tue, Sep 15, 2020 at 09:35:30AM +0200, David Hildenbrand wrote:
>>>
>>>>> static int __ref try_remove_memory(int nid, u64 start, u64 size)
>>>>> {
>>>>>   int rc = 0;
>>>>> @@ -1777,7 +1757,7 @@ static int __ref try_remove_memory(int nid, u64 
>>>>> start, u64 size)
>>>>>   memblock_remove(start, size);
>>>>>   }
>>>>>
>>>>> - __release_memory_resource(start, size);
>>>>> + release_mem_region_adjustable(&iomem_resource, start, size);
>>>>>
>>>>
>>>> Seems the only user of release_mem_region_adjustable() is here, can we move
>>>> iomem_resource into the function body? Actually, we don't iterate the 
>>>> resource
>>>> tree from any level. We always start from the root.
>>>
>>> You mean, making iomem_resource implicit? I can spot that something
>>> similar was done for
>>>
>>> #define devm_release_mem_region(dev, start, n) \
>>> __devm_release_region(dev, &iomem_resource, (start), (n))
>>>
>> 
>> What I prefer is remove iomem_resource from the parameter list. Just use is 
>> in
>> the function body.
>> 
>> For the example you listed, __release_region() would have varies of *parent*,
>> which looks reasonable to keep it here.
>
>Yeah I got that ("making iomem_resource implicit"), as I said:
>

Thanks

>>> I'll send an addon patch for that, ok? - thanks.
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v2 1/7] kernel/resource: make release_mem_region_adjustable() never fail

2020-09-15 Thread Wei Yang
On Tue, Sep 15, 2020 at 09:35:30AM +0200, David Hildenbrand wrote:
>
>>> static int __ref try_remove_memory(int nid, u64 start, u64 size)
>>> {
>>> int rc = 0;
>>> @@ -1777,7 +1757,7 @@ static int __ref try_remove_memory(int nid, u64 
>>> start, u64 size)
>>> memblock_remove(start, size);
>>> }
>>>
>>> -   __release_memory_resource(start, size);
>>> +   release_mem_region_adjustable(&iomem_resource, start, size);
>>>
>> 
>> Seems the only user of release_mem_region_adjustable() is here, can we move
>> iomem_resource into the function body? Actually, we don't iterate the 
>> resource
>> tree from any level. We always start from the root.
>
>You mean, making iomem_resource implicit? I can spot that something
>similar was done for
>
>#define devm_release_mem_region(dev, start, n) \
>   __devm_release_region(dev, &iomem_resource, (start), (n))
>

What I prefer is remove iomem_resource from the parameter list. Just use is in
the function body.

For the example you listed, __release_region() would have varies of *parent*,
which looks reasonable to keep it here.

>I'll send an addon patch for that, ok? - thanks.
>
>-- 
>Thanks,
>
>David / dhildenb

-- 
Wei Yang
Help you, Help me


Re: [PATCH v4 5/8] mm/memory_hotplug: MEMHP_MERGE_RESOURCE to specify merging of System RAM resources

2020-09-14 Thread Wei Yang
On Fri, Sep 11, 2020 at 12:34:56PM +0200, David Hildenbrand wrote:
>Some add_memory*() users add memory in small, contiguous memory blocks.
>Examples include virtio-mem, hyper-v balloon, and the XEN balloon.
>
>This can quickly result in a lot of memory resources, whereby the actual
>resource boundaries are not of interest (e.g., it might be relevant for
>DIMMs, exposed via /proc/iomem to user space). We really want to merge
>added resources in this scenario where possible.
>
>Let's provide a flag (MEMHP_MERGE_RESOURCE) to specify that a resource
>either created within add_memory*() or passed via add_memory_resource()
>shall be marked mergeable and merged with applicable siblings.
>
>To implement that, we need a kernel/resource interface to mark selected
>System RAM resources mergeable (IORESOURCE_SYSRAM_MERGEABLE) and trigger
>merging.
>
>Note: We really want to merge after the whole operation succeeded, not
>directly when adding a resource to the resource tree (it would break
>add_memory_resource() and require splitting resources again when the
>operation failed - e.g., due to -ENOMEM).

Oops, the latest version is here.

BTW, I don't see patch 4. Not sure whether it was junked by my mail system?

>
>Reviewed-by: Pankaj Gupta 
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Dan Williams 
>Cc: Jason Gunthorpe 
>Cc: Kees Cook 
>Cc: Ard Biesheuvel 
>Cc: Thomas Gleixner 
>Cc: "K. Y. Srinivasan" 
>Cc: Haiyang Zhang 
>Cc: Stephen Hemminger 
>Cc: Wei Liu 
>Cc: Boris Ostrovsky 
>Cc: Juergen Gross 
>Cc: Stefano Stabellini 
>Cc: Roger Pau Monné 
>Cc: Julien Grall 
>Cc: Pankaj Gupta 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 
>---
> include/linux/ioport.h |  4 +++
> include/linux/memory_hotplug.h |  7 
> kernel/resource.c  | 60 ++
> mm/memory_hotplug.c|  7 
> 4 files changed, 78 insertions(+)
>
>diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>index d7620d7c941a0..7e61389dcb017 100644
>--- a/include/linux/ioport.h
>+++ b/include/linux/ioport.h
>@@ -60,6 +60,7 @@ struct resource {
> 
> /* IORESOURCE_SYSRAM specific bits. */
> #define IORESOURCE_SYSRAM_DRIVER_MANAGED  0x0200 /* Always detected 
> via a driver. */
>+#define IORESOURCE_SYSRAM_MERGEABLE   0x0400 /* Resource can be 
>merged. */
> 
> #define IORESOURCE_EXCLUSIVE  0x0800  /* Userland may not map this 
> resource */
> 
>@@ -253,6 +254,9 @@ extern void __release_region(struct resource *, 
>resource_size_t,
> extern void release_mem_region_adjustable(struct resource *, resource_size_t,
> resource_size_t);
> #endif
>+#ifdef CONFIG_MEMORY_HOTPLUG
>+extern void merge_system_ram_resource(struct resource *res);
>+#endif
> 
> /* Wrappers for managed devices */
> struct device;
>diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>index 33eb80fdba22f..d65c6fdc5cfc3 100644
>--- a/include/linux/memory_hotplug.h
>+++ b/include/linux/memory_hotplug.h
>@@ -62,6 +62,13 @@ typedef int __bitwise mhp_t;
> 
> /* No special request */
> #define MHP_NONE  ((__force mhp_t)0)
>+/*
>+ * Allow merging of the added System RAM resource with adjacent,
>+ * mergeable resources. After a successful call to add_memory_resource()
>+ * with this flag set, the resource pointer must no longer be used as it
>+ * might be stale, or the resource might have changed.
>+ */
>+#define MEMHP_MERGE_RESOURCE  ((__force mhp_t)BIT(0))
> 
> /*
>  * Extended parameters for memory hotplug:
>diff --git a/kernel/resource.c b/kernel/resource.c
>index 36b3552210120..7a91b935f4c20 100644
>--- a/kernel/resource.c
>+++ b/kernel/resource.c
>@@ -1363,6 +1363,66 @@ void release_mem_region_adjustable(struct resource 
>*parent,
> }
> #endif/* CONFIG_MEMORY_HOTREMOVE */
> 
>+#ifdef CONFIG_MEMORY_HOTPLUG
>+static bool system_ram_resources_mergeable(struct resource *r1,
>+ struct resource *r2)
>+{
>+  /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
>+  return r1->flags == r2->flags && r1->end + 1 == r2->start &&
>+ r1->name == r2->name && r1->desc == r2->desc &&
>+ !r1->child && !r2->child;
>+}
>+
>+/*
>+ * merge_system_ram_resource - mark the System RAM resource mergeable and try 
>to
>+ * merge it with adjacent, mergeable resources
>+ * @res: resource descriptor
>+ *
>+ * This interface is intended for memory hotplug, whereby lots of contiguous
>+ * system ram resources are added (e.g., vi

Re: [PATCH v2 2/7] kernel/resource: move and rename IORESOURCE_MEM_DRIVER_MANAGED

2020-09-14 Thread Wei Yang
On Tue, Sep 08, 2020 at 10:10:07PM +0200, David Hildenbrand wrote:
>IORESOURCE_MEM_DRIVER_MANAGED currently uses an unused PnP bit, which is
>always set to 0 by hardware. This is far from beautiful (and confusing),
>and the bit only applies to SYSRAM. So let's move it out of the
>bus-specific (PnP) defined bits.
>
>We'll add another SYSRAM specific bit soon. If we ever need more bits for
>other purposes, we can steal some from "desc", or reshuffle/regroup what we
>have.

I think you added this definition because we use IORESOURCE_SYSRAM_RAM for
hotplugged memory? So we put them all in the IORESOURCE_SYSRAM_XXX family?

>
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Dan Williams 
>Cc: Jason Gunthorpe 
>Cc: Kees Cook 
>Cc: Ard Biesheuvel 
>Cc: Pankaj Gupta 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Cc: Eric Biederman 
>Cc: Thomas Gleixner 
>Cc: Greg Kroah-Hartman 
>Cc: ke...@lists.infradead.org
>Signed-off-by: David Hildenbrand 
>---
> include/linux/ioport.h | 4 +++-
> kernel/kexec_file.c| 2 +-
> mm/memory_hotplug.c| 4 ++--
> 3 files changed, 6 insertions(+), 4 deletions(-)
>
>diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>index 52a91f5fa1a36..d7620d7c941a0 100644
>--- a/include/linux/ioport.h
>+++ b/include/linux/ioport.h
>@@ -58,6 +58,9 @@ struct resource {
> #define IORESOURCE_EXT_TYPE_BITS 0x0100   /* Resource extended types */
> #define IORESOURCE_SYSRAM 0x0100  /* System RAM (modifier) */
> 
>+/* IORESOURCE_SYSRAM specific bits. */
>+#define IORESOURCE_SYSRAM_DRIVER_MANAGED  0x0200 /* Always detected 
>via a driver. */
>+
> #define IORESOURCE_EXCLUSIVE  0x0800  /* Userland may not map this 
> resource */
> 
> #define IORESOURCE_DISABLED   0x1000
>@@ -103,7 +106,6 @@ struct resource {
> #define IORESOURCE_MEM_32BIT  (3<<3)
> #define IORESOURCE_MEM_SHADOWABLE (1<<5)  /* dup: IORESOURCE_SHADOWABLE */
> #define IORESOURCE_MEM_EXPANSIONROM   (1<<6)
>-#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7)
> 
> /* PnP I/O specific bits (IORESOURCE_BITS) */
> #define IORESOURCE_IO_16BIT_ADDR  (1<<0)
>diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
>index ca40bef75a616..dfeeed1aed084 100644
>--- a/kernel/kexec_file.c
>+++ b/kernel/kexec_file.c
>@@ -520,7 +520,7 @@ static int locate_mem_hole_callback(struct resource *res, 
>void *arg)
>   /* Returning 0 will take to next memory range */
> 
>   /* Don't use memory that will be detected and handled by a driver. */
>-  if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED)
>+  if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
>   return 0;
> 
>   if (sz < kbuf->memsz)
>diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>index 4c47b68a9f4b5..8e1cd18b5cf14 100644
>--- a/mm/memory_hotplug.c
>+++ b/mm/memory_hotplug.c
>@@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 
>start, u64 size,
>   unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> 
>   if (strcmp(resource_name, "System RAM"))
>-  flags |= IORESOURCE_MEM_DRIVER_MANAGED;
>+  flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
> 
>   /*
>* Make sure value parsed from 'mem=' only restricts memory adding
>@@ -1160,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_memory);
>  *
>  * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
>  * memory map") are created. Also, the created memory resource is flagged
>- * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
>+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
>  * this memory as well (esp., not place kexec images onto it).
>  *
>  * The resource_name (visible via /proc/iomem) has to have the format
>-- 
>2.26.2

-- 
Wei Yang
Help you, Help me


Re: [PATCH v2 1/7] kernel/resource: make release_mem_region_adjustable() never fail

2020-09-14 Thread Wei Yang
On Tue, Sep 08, 2020 at 10:10:06PM +0200, David Hildenbrand wrote:
>Let's make sure splitting a resource on memory hotunplug will never fail.
>This will become more relevant once we merge selected System RAM
>resources - then, we'll trigger that case more often on memory hotunplug.
>
>In general, this function is already unlikely to fail. When we remove
>memory, we free up quite a lot of metadata (memmap, page tables, memory
>block device, etc.). The only reason it could really fail would be when
>injecting allocation errors.
>
>All other error cases inside release_mem_region_adjustable() seem to be
>sanity checks if the function would be abused in different context -
>let's add WARN_ON_ONCE() in these cases so we can catch them.
>
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Dan Williams 
>Cc: Jason Gunthorpe 
>Cc: Kees Cook 
>Cc: Ard Biesheuvel 
>Cc: Pankaj Gupta 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 
>---
> include/linux/ioport.h |  4 ++--
> kernel/resource.c  | 49 --
> mm/memory_hotplug.c| 22 +--
> 3 files changed, 31 insertions(+), 44 deletions(-)
>
>diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>index 6c2b06fe8beb7..52a91f5fa1a36 100644
>--- a/include/linux/ioport.h
>+++ b/include/linux/ioport.h
>@@ -248,8 +248,8 @@ extern struct resource * __request_region(struct resource 
>*,
> extern void __release_region(struct resource *, resource_size_t,
>   resource_size_t);
> #ifdef CONFIG_MEMORY_HOTREMOVE
>-extern int release_mem_region_adjustable(struct resource *, resource_size_t,
>-  resource_size_t);
>+extern void release_mem_region_adjustable(struct resource *, resource_size_t,
>+resource_size_t);
> #endif
> 
> /* Wrappers for managed devices */
>diff --git a/kernel/resource.c b/kernel/resource.c
>index f1175ce93a1d5..36b3552210120 100644
>--- a/kernel/resource.c
>+++ b/kernel/resource.c
>@@ -1258,21 +1258,28 @@ EXPORT_SYMBOL(__release_region);
>  *   assumes that all children remain in the lower address entry for
>  *   simplicity.  Enhance this logic when necessary.
>  */
>-int release_mem_region_adjustable(struct resource *parent,
>-resource_size_t start, resource_size_t size)
>+void release_mem_region_adjustable(struct resource *parent,
>+ resource_size_t start, resource_size_t size)
> {
>+  struct resource *new_res = NULL;
>+  bool alloc_nofail = false;
>   struct resource **p;
>   struct resource *res;
>-  struct resource *new_res;
>   resource_size_t end;
>-  int ret = -EINVAL;
> 
>   end = start + size - 1;
>-  if ((start < parent->start) || (end > parent->end))
>-  return ret;
>+  if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
>+  return;
> 
>-  /* The alloc_resource() result gets checked later */
>-  new_res = alloc_resource(GFP_KERNEL);
>+  /*
>+   * We free up quite a lot of memory on memory hotunplug (esp., memap),
>+   * just before releasing the region. This is highly unlikely to
>+   * fail - let's play save and make it never fail as the caller cannot
>+   * perform any error handling (e.g., trying to re-add memory will fail
>+   * similarly).
>+   */
>+retry:
>+  new_res = alloc_resource(GFP_KERNEL | alloc_nofail ? __GFP_NOFAIL : 0);
> 
>   p = &parent->child;
>   write_lock(&resource_lock);
>@@ -1298,7 +1305,6 @@ int release_mem_region_adjustable(struct resource 
>*parent,
>* so if we are dealing with them, let us just back off here.
>*/
>   if (!(res->flags & IORESOURCE_SYSRAM)) {
>-  ret = 0;
>   break;
>   }
> 
>@@ -1315,20 +1321,23 @@ int release_mem_region_adjustable(struct resource 
>*parent,
>   /* free the whole entry */
>   *p = res->sibling;
>   free_resource(res);
>-  ret = 0;
>   } else if (res->start == start && res->end != end) {
>   /* adjust the start */
>-  ret = __adjust_resource(res, end + 1,
>-  res->end - end);
>+  WARN_ON_ONCE(__adjust_resource(res, end + 1,
>+ res->end - end));
>   } else if (res->start != start && res->end == end) {
>   

Re: [PATCH v2 1/7] kernel/resource: make release_mem_region_adjustable() never fail

2020-09-14 Thread Wei Yang
On Tue, Sep 08, 2020 at 10:10:06PM +0200, David Hildenbrand wrote:
>Let's make sure splitting a resource on memory hotunplug will never fail.
>This will become more relevant once we merge selected System RAM
>resources - then, we'll trigger that case more often on memory hotunplug.
>
>In general, this function is already unlikely to fail. When we remove
>memory, we free up quite a lot of metadata (memmap, page tables, memory
>block device, etc.). The only reason it could really fail would be when
>injecting allocation errors.
>
>All other error cases inside release_mem_region_adjustable() seem to be
>sanity checks if the function would be abused in different context -
>let's add WARN_ON_ONCE() in these cases so we can catch them.
>
>Cc: Andrew Morton 
>Cc: Michal Hocko 
>Cc: Dan Williams 
>Cc: Jason Gunthorpe 
>Cc: Kees Cook 
>Cc: Ard Biesheuvel 
>Cc: Pankaj Gupta 
>Cc: Baoquan He 
>Cc: Wei Yang 
>Signed-off-by: David Hildenbrand 
>---
> include/linux/ioport.h |  4 ++--
> kernel/resource.c  | 49 --
> mm/memory_hotplug.c| 22 +--
> 3 files changed, 31 insertions(+), 44 deletions(-)
>
>diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>index 6c2b06fe8beb7..52a91f5fa1a36 100644
>--- a/include/linux/ioport.h
>+++ b/include/linux/ioport.h
>@@ -248,8 +248,8 @@ extern struct resource * __request_region(struct resource 
>*,
> extern void __release_region(struct resource *, resource_size_t,
>   resource_size_t);
> #ifdef CONFIG_MEMORY_HOTREMOVE
>-extern int release_mem_region_adjustable(struct resource *, resource_size_t,
>-  resource_size_t);
>+extern void release_mem_region_adjustable(struct resource *, resource_size_t,
>+resource_size_t);
> #endif
> 
> /* Wrappers for managed devices */
>diff --git a/kernel/resource.c b/kernel/resource.c
>index f1175ce93a1d5..36b3552210120 100644
>--- a/kernel/resource.c
>+++ b/kernel/resource.c
>@@ -1258,21 +1258,28 @@ EXPORT_SYMBOL(__release_region);
>  *   assumes that all children remain in the lower address entry for
>  *   simplicity.  Enhance this logic when necessary.
>  */
>-int release_mem_region_adjustable(struct resource *parent,
>-resource_size_t start, resource_size_t size)
>+void release_mem_region_adjustable(struct resource *parent,
>+ resource_size_t start, resource_size_t size)
> {
>+  struct resource *new_res = NULL;
>+  bool alloc_nofail = false;
>   struct resource **p;
>   struct resource *res;
>-  struct resource *new_res;
>   resource_size_t end;
>-  int ret = -EINVAL;
> 
>   end = start + size - 1;
>-  if ((start < parent->start) || (end > parent->end))
>-  return ret;
>+  if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
>+  return;
> 
>-  /* The alloc_resource() result gets checked later */
>-  new_res = alloc_resource(GFP_KERNEL);
>+  /*
>+   * We free up quite a lot of memory on memory hotunplug (esp., memap),
>+   * just before releasing the region. This is highly unlikely to
>+   * fail - let's play save and make it never fail as the caller cannot
>+   * perform any error handling (e.g., trying to re-add memory will fail
>+   * similarly).
>+   */
>+retry:
>+  new_res = alloc_resource(GFP_KERNEL | alloc_nofail ? __GFP_NOFAIL : 0);
> 

It looks like a bold change, but I don't find a reason to object to it.

>   p = &parent->child;
>   write_lock(&resource_lock);
>@@ -1298,7 +1305,6 @@ int release_mem_region_adjustable(struct resource 
>*parent,
>* so if we are dealing with them, let us just back off here.
>*/
>   if (!(res->flags & IORESOURCE_SYSRAM)) {
>-  ret = 0;
>   break;
>   }
> 
>@@ -1315,20 +1321,23 @@ int release_mem_region_adjustable(struct resource 
>*parent,
>   /* free the whole entry */
>   *p = res->sibling;
>   free_resource(res);
>-  ret = 0;
>   } else if (res->start == start && res->end != end) {
>   /* adjust the start */
>-  ret = __adjust_resource(res, end + 1,
>-  res->end - end);
>+  WARN_ON_ONCE(__adjust_resource(res, end + 1,
>+ res->end - end));
>   

Re: [Patch v2 0/4] tracing: trivial cleanup

2020-09-14 Thread Wei Yang
On Mon, Sep 14, 2020 at 06:55:28PM -0400, Steven Rostedt wrote:
>On Fri, 28 Aug 2020 11:42:57 +0800
>Wei Yang  wrote:
>
>> Steven,
>> 
>> Would you like to pick this up?
>> 
>
>Hmm, patch 1 and 2 have been accepted (different subjects though):
>
>   746cf3459f11859 ("tracing: Simplify defining of the next event id")
>   36b8aacf2a483ba ("tracing: Save one trace_event->type by using 
> __TRACE_LAST_TYPE")
>
>I'm not sure why I didn't pick up patches 3 and 4. I'm looking into that
>now.
>

Yep, thanks a lot :-)

>-- Steve
>
>
>> On Sun, Jul 12, 2020 at 09:10:32AM +0800, Wei Yang wrote:
>> >Some trivial cleanup for tracing.
>> >
>> >v2:
>> >  * drop patch 1
>> >  * merge patch 4 & 5
>> >  * introduce a new patch change the return value of tracing_init_dentry()
>> >
>> >Wei Yang (4):
>> >  tracing: simplify the logic by defining next to be "lasst + 1"
>> >  tracing: save one trace_event->type by using __TRACE_LAST_TYPE
>> >  tracing: toplevel d_entry already initialized
>> >  tracing: make tracing_init_dentry() returns an integer instead of a
>> >d_entry pointer
>> >
>> > kernel/trace/trace.c | 36 ++--
>> > kernel/trace/trace.h |  2 +-
>> > kernel/trace/trace_dynevent.c|  8 +++
>> > kernel/trace/trace_events.c  |  9 ++-
>> > kernel/trace/trace_events_synth.c|  9 +++
>> > kernel/trace/trace_functions_graph.c |  8 +++
>> > kernel/trace/trace_hwlat.c   |  8 +++
>> > kernel/trace/trace_kprobe.c  | 10 
>> > kernel/trace/trace_output.c  | 14 +--
>> > kernel/trace/trace_printk.c  |  8 +++
>> > kernel/trace/trace_stack.c   | 12 +-
>> > kernel/trace/trace_stat.c|  8 +++
>> > kernel/trace/trace_uprobe.c  |  9 ---
>> > 13 files changed, 66 insertions(+), 75 deletions(-)
>> >
>> >-- 
>> >2.20.1 (Apple Git-117)  
>> 

-- 
Wei Yang
Help you, Help me


[PATCH 2/2] mm/mmap: check on file instead of the rb_root_cached of its address_space

2020-09-13 Thread Wei Yang
In __vma_adjust(), we check *root* to decide whether to adjust the
address_space, while it seems more meaningful to do the check on *file*
itself: we adjust that data because this is a file-backed vma.

Since we already assume the address_space is valid for a file-backed vma,
let's just replace *root* with *file* here.

Signed-off-by: Wei Yang 
---
 mm/mmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 30b155098606..829897646a9c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -823,7 +823,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
anon_vma_interval_tree_pre_update_vma(next);
}
 
-   if (root) {
+   if (file) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
@@ -844,7 +844,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
next->vm_pgoff += adjust_next >> PAGE_SHIFT;
}
 
-   if (root) {
+   if (file) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
@@ -896,7 +896,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
anon_vma_unlock_write(anon_vma);
}
 
-   if (root) {
+   if (file) {
i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
 
-- 
2.20.1 (Apple Git-117)



[PATCH 1/2] mm/mmap: not necessary to check mapping separately

2020-09-13 Thread Wei Yang
*root*, of type struct rb_root_cached, is a member of *mapping*, of type
struct address_space. This implies that when we have a valid *root* it must
be part of a valid *mapping*.

So we can merge these two checks together to make the code easier to read
and to save some cpu cycles.

Signed-off-by: Wei Yang 
---
 mm/mmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 1922e6fce9e7..30b155098606 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -895,10 +895,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long 
start,
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
-   if (mapping)
-   i_mmap_unlock_write(mapping);
 
if (root) {
+   i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
 
if (adjust_next)
-- 
2.20.1 (Apple Git-117)



Re: [PATCH] mm/mmap: leave adjust_next as virtual address instead of page frame number

2020-09-08 Thread Wei Yang
On Tue, Sep 08, 2020 at 05:31:22PM +0200, Vlastimil Babka wrote:
>On 8/28/20 10:10 AM, Wei Yang wrote:
>> Instead of convert adjust_next between virtual address and page frame
>> number, let's just store the virtual address into adjust_next.
>
>IMHO more precisely/less confusing it's "bytes" and "pages" instead of "virtual
>address" (which is absolute address, but this variable holds a difference) and
>"page frame number" (which is related to absolute physical address, but what we
>have is difference in pages in virtual address space).
>

Thanks for your comment.

To be honest, I am not sure which one is more precise. English is not my
mother tongue. If others think this wording is better, I am fine with adjusting it.

>> Also, this patch fixes one typo in the comment of
>> vma_adjust_trans_huge().
>> 
>> Signed-off-by: Wei Yang 
>
>Other than that, seems like it leads to less shifting, so
>Acked-by: Vlastimil Babka 
>
>> ---
>>  mm/huge_memory.c | 4 ++--
>>  mm/mmap.c| 8 
>>  2 files changed, 6 insertions(+), 6 deletions(-)
>> 
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 78c84bee7e29..2c633ba14440 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2300,13 +2300,13 @@ void vma_adjust_trans_huge(struct vm_area_struct 
>> *vma,
>>  
>>  /*
>>   * If we're also updating the vma->vm_next->vm_start, if the new
>> - * vm_next->vm_start isn't page aligned and it could previously
>> + * vm_next->vm_start isn't hpage aligned and it could previously
>>   * contain an hugepage: check if we need to split an huge pmd.
>>   */
>>  if (adjust_next > 0) {
>>  struct vm_area_struct *next = vma->vm_next;
>>  unsigned long nstart = next->vm_start;
>> -nstart += adjust_next << PAGE_SHIFT;
>> +nstart += adjust_next;
>>  if (nstart & ~HPAGE_PMD_MASK &&
>>  (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
>>  (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
>> diff --git a/mm/mmap.c b/mm/mmap.c
>> index 90b1298d4222..e4c9bbfd4103 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -758,7 +758,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned 
>> long start,
>>   * vma expands, overlapping part of the next:
>>   * mprotect case 5 shifting the boundary up.
>>   */
>> -adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
>> +adjust_next = (end - next->vm_start);
>>  exporter = next;
>>  importer = vma;
>>  VM_WARN_ON(expand != importer);
>> @@ -768,7 +768,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned 
>> long start,
>>   * split_vma inserting another: so it must be
>>   * mprotect case 4 shifting the boundary down.
>>   */
>> -adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
>> +adjust_next = -(vma->vm_end - end);
>>  exporter = vma;
>>  importer = next;
>>  VM_WARN_ON(expand != importer);
>> @@ -840,8 +840,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned 
>> long start,
>>  }
>>  vma->vm_pgoff = pgoff;
>>  if (adjust_next) {
>> -next->vm_start += adjust_next << PAGE_SHIFT;
>> -next->vm_pgoff += adjust_next;
>> +next->vm_start += adjust_next;
>> +next->vm_pgoff += adjust_next >> PAGE_SHIFT;
>>  }
>>  
>>  if (root) {
>> 

-- 
Wei Yang
Help you, Help me


Re: [PATCH v18 00/32] per memcg lru_lock: reviews

2020-09-08 Thread Wei Yang
On Tue, Sep 08, 2020 at 04:41:00PM -0700, Hugh Dickins wrote:
[...]
>[PATCH v18 06/32] mm/thp: narrow lru locking
>Why? What part does this play in the series? "narrow lru locking" can
>also be described as "widen page cache locking": you are changing the
>lock ordering, and not giving any reason to do so. This may be an
>excellent change, or it may be a terrible change: I find that usually
>lock ordering is forced upon us, and it's rare to meet an instance like
>this that could go either way, and I don't know myself how to judge it.
>
>I do want this commit to go in, partly because it has been present in
>all the testing we have done, and partly because I *can at last* see a
>logical advantage to it - it also nests lru_lock inside memcg->move_lock,
>allowing lock_page_memcg() to be used to stabilize page->mem_cgroup when
>getting per-memcg lru_lock - though only in one place, starting in v17,
>do you actually use that (and, warning: it's not used correctly there).
>
>I'm not very bothered by how the local_irq_disable() looks to RT: THP
>seems a very bad idea in an RT kernel.  Earlier I asked you to run this
>past Kirill and Matthew and Johannes: you did so, thank you, and Kirill
>has blessed it, and no one has nacked it, and I have not noticed any
>disadvantage from this change in lock ordering (documented in 23/32),
>so I'm now going to say
>
>Acked-by: Hugh Dickins 
>
>But I wish you could give some reason for it in the commit message!
>
>Signed-off-by: Wei Yang 
>Is that correct? Or Wei Yang suggested some part of it perhaps?
>

If my memory is correct, we had some offline discussion about this change.

-- 
Wei Yang
Help you, Help me


[Patch v4 6/7] mm/hugetlb: narrow the hugetlb_lock protection area during preparing huge page

2020-08-31 Thread Wei Yang
set_hugetlb_cgroup_[rsvd] just manipulates page-local data, which does not
need to be protected by hugetlb_lock.

Let's move it out of the lock.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c9b292e664c4..7b3357c1dcec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1493,9 +1493,9 @@ static void prep_new_huge_page(struct hstate *h, struct 
page *page, int nid)
 {
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-   spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
+   spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
-- 
2.20.1 (Apple Git-117)



[Patch v4 5/7] mm/hugetlb: a page from buddy is not on any list

2020-08-31 Thread Wei Yang
The page allocated from the buddy allocator is not on any list, so using
list_add() is enough.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 441b7f7c623e..c9b292e664c4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2405,7 +2405,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
-   list_move(&page->lru, &h->hugepage_activelist);
+   list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
-- 
2.20.1 (Apple Git-117)



[Patch v4 3/7] mm/hugetlb: use list_splice to merge two list at once

2020-08-31 Thread Wei Yang
Instead of adding the allocated file_regions to region_cache one by one, we
can use list_splice() to merge the two lists at once.

Also, since we know the number of entries in the list, increase the count
directly.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbaf49bc1d26..a02bf430de6f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -443,11 +443,8 @@ static int allocate_file_region_entries(struct resv_map 
*resv,
 
spin_lock(&resv->lock);
 
-   list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
-   list_del(&rg->link);
-   list_add(&rg->link, &resv->region_cache);
-   resv->region_cache_count++;
-   }
+   list_splice(&allocated_regions, &resv->region_cache);
+   resv->region_cache_count += to_allocate;
}
 
return 0;
-- 
2.20.1 (Apple Git-117)



[Patch v4 7/7] mm/hugetlb: take the free hpage during the iteration directly

2020-08-31 Thread Wei Yang
Function dequeue_huge_page_node_exact() iterates the free list and returns
the first valid free hpage.

Instead of breaking out and checking the loop variable, we can return from
the loop directly. This removes some redundant checks.

Signed-off-by: Wei Yang 

[mike.krav...@oracle.com: points out a logic error]
---
 mm/hugetlb.c | 22 +-
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7b3357c1dcec..82ba4cad2704 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1040,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct 
hstate *h, int nid)
if (nocma && is_migrate_cma_page(page))
continue;
 
-   if (!PageHWPoison(page))
-   break;
+   if (PageHWPoison(page))
+   continue;
+
+   list_move(&page->lru, &h->hugepage_activelist);
+   set_page_refcounted(page);
+   h->free_huge_pages--;
+   h->free_huge_pages_node[nid]--;
+   return page;
}
 
-   /*
-* if 'non-isolated free hugepage' not found on the list,
-* the allocation fails.
-*/
-   if (&h->hugepage_freelists[nid] == &page->lru)
-   return NULL;
-   list_move(&page->lru, &h->hugepage_activelist);
-   set_page_refcounted(page);
-   h->free_huge_pages--;
-   h->free_huge_pages_node[nid]--;
-   return page;
+   return NULL;
 }
 
 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t 
gfp_mask, int nid,
-- 
2.20.1 (Apple Git-117)



[Patch v4 2/7] mm/hugetlb: remove VM_BUG_ON(!nrg) in get_file_region_entry_from_cache()

2020-08-31 Thread Wei Yang
We are sure to get a valid file_region here; otherwise the
VM_BUG_ON(resv->region_cache_count <= 0) at the very beginning would have
been triggered.

Let's remove the redundant check.

Signed-off-by: Wei Yang 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db6af2654f12..fbaf49bc1d26 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, 
long from, long to)
 
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
-   VM_BUG_ON(!nrg);
list_del(&nrg->link);
 
nrg->from = from;
-- 
2.20.1 (Apple Git-117)



[Patch v4 4/7] mm/hugetlb: count file_region to be added when regions_needed != NULL

2020-08-31 Thread Wei Yang
There are only two use cases of add_reservation_in_range():

* count the file_regions needed and return the number in regions_needed
* do the real list operation without counting

This means it is not necessary to have two parameters to distinguish these
two cases.

Just use regions_needed to separate them.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a02bf430de6f..441b7f7c623e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -321,16 +321,17 @@ static void coalesce_file_region(struct resv_map *resv, 
struct file_region *rg)
}
 }
 
-/* Must be called with resv->lock held. Calling this with count_only == true
- * will count the number of pages to be added but will not modify the linked
- * list. If regions_needed != NULL and count_only == true, then regions_needed
- * will indicate the number of file_regions needed in the cache to carry out to
- * add the regions for this range.
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
  */
 static long add_reservation_in_range(struct resv_map *resv, long f, long t,
 struct hugetlb_cgroup *h_cg,
-struct hstate *h, long *regions_needed,
-bool count_only)
+struct hstate *h, long *regions_needed)
 {
long add = 0;
struct list_head *head = &resv->regions;
@@ -366,14 +367,14 @@ static long add_reservation_in_range(struct resv_map 
*resv, long f, long t,
 */
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
-   if (!count_only) {
+   if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
-   } else if (regions_needed)
+   } else
*regions_needed += 1;
}
 
@@ -385,13 +386,13 @@ static long add_reservation_in_range(struct resv_map 
*resv, long f, long t,
 */
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
-   if (!count_only) {
+   if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
-   } else if (regions_needed)
+   } else
*regions_needed += 1;
}
 
@@ -484,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long 
t,
 retry:
 
/* Count how many regions are actually needed to execute this add. */
-   add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
-true);
+   add_reservation_in_range(resv, f, t, NULL, NULL,
+&actual_regions_needed);
 
/*
 * Check for sufficient descriptors in the cache to accommodate
@@ -513,7 +514,7 @@ static long region_add(struct resv_map *resv, long f, long 
t,
goto retry;
}
 
-   add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+   add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
 
resv->adds_in_progress -= in_regions_needed;
 
@@ -549,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long 
t,
 
spin_lock(&resv->lock);
 
-   /* Count how many hugepages in this range are NOT respresented. */
+   /* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
-  out_regions_needed, true);
+  out_regions_needed);
 
if (*out_regions_needed == 0)
*out_regions_needed = 1;
-- 
2.20.1 (Apple Git-117)



[Patch v4 0/7] mm/hugetlb: code refine and simplification

2020-08-31 Thread Wei Yang
The following are some cleanups for hugetlb.

A simple test with tools/testing/selftests/vm/map_hugetlb passes.
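
For reference, I ran it roughly as follows: reserve some hugepages first
(e.g. via /proc/sys/vm/nr_hugepages), build the vm selftests with
make -C tools/testing/selftests/vm, and then run the map_hugetlb binary
directly.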

v4:
  * fix a logic error for patch 7, thanks Mike
v3:
  * rebase on v5.9-rc2 which adjust the last patch a little
v2:
  * drop 5/6/10 since similar patches are merged or under review.
  * adjust 2 based on comment from Mike Kravetz

Wei Yang (7):
  mm/hugetlb: not necessary to coalesce regions recursively
  mm/hugetlb: remove VM_BUG_ON(!nrg) in
get_file_region_entry_from_cache()
  mm/hugetlb: use list_splice to merge two list at once
  mm/hugetlb: count file_region to be added when regions_needed != NULL
  mm/hugetlb: a page from buddy is not on any list
  mm/hugetlb: narrow the hugetlb_lock protection area during preparing
huge page
  mm/hugetlb: take the free hpage during the iteration directly

 mm/hugetlb.c | 73 ++--
 1 file changed, 31 insertions(+), 42 deletions(-)

-- 
2.20.1 (Apple Git-117)



[Patch v4 1/7] mm/hugetlb: not necessary to coalesce regions recursively

2020-08-31 Thread Wei Yang
Per my understanding, we keep the regions ordered and always coalesce them
properly. So all that is needed to keep this property is to coalesce a
region with its immediate neighbours.

Let's simplify this.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a301c2d672bf..db6af2654f12 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -309,8 +309,7 @@ static void coalesce_file_region(struct resv_map *resv, 
struct file_region *rg)
list_del(&rg->link);
kfree(rg);
 
-   coalesce_file_region(resv, prg);
-   return;
+   rg = prg;
}
 
nrg = list_next_entry(rg, link);
@@ -320,9 +319,6 @@ static void coalesce_file_region(struct resv_map *resv, 
struct file_region *rg)
 
list_del(>link);
kfree(rg);
-
-   coalesce_file_region(resv, nrg);
-   return;
}
 }
 
-- 
2.20.1 (Apple Git-117)



Re: [Patch v3 7/7] mm/hugetlb: take the free hpage during the iteration directly

2020-08-31 Thread Wei Yang
On Mon, Aug 31, 2020 at 04:06:54PM -0700, Mike Kravetz wrote:
>On 8/30/20 7:23 PM, Wei Yang wrote:
>> Function dequeue_huge_page_node_exact() iterates the free list and
>> return the first valid free hpage.
>> 
>> Instead of break and check the loop variant, we could return in the loop
>> directly. This could reduce some redundant check.
>> 
>> Signed-off-by: Wei Yang 
>> ---
>>  mm/hugetlb.c | 20 
>>  1 file changed, 8 insertions(+), 12 deletions(-)
>> 
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 7b3357c1dcec..709be7ab65af 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1040,21 +1040,17 @@ static struct page 
>> *dequeue_huge_page_node_exact(struct hstate *h, int nid)
>>  if (nocma && is_migrate_cma_page(page))
>>  continue;
>>  
>> -if (!PageHWPoison(page))
>> +if (PageHWPoison(page))
>>  break;
>
>Previously, when encountering a PageHWPoison(page) the loop would continue
>and check the next page in the list.  It now breaks the loop and returns
>NULL.  Is not this a change in behavior?  Perhaps you want to change that
>break to a continue.  Or, restructure in some other way.

Shame on me. You are right. I should change it to continue.

Andrew,

Sorry for the inconvenience.

>-- 
>Mike Kravetz
>
>> +
>> +list_move(&page->lru, &h->hugepage_activelist);
>> +set_page_refcounted(page);
>> +h->free_huge_pages--;
>> +h->free_huge_pages_node[nid]--;
>> +return page;
>>  }
>>  
>> -/*
>> - * if 'non-isolated free hugepage' not found on the list,
>> - * the allocation fails.
>> - */
>> -if (&h->hugepage_freelists[nid] == &page->lru)
>> -return NULL;
>> -list_move(&page->lru, &h->hugepage_activelist);
>> -set_page_refcounted(page);
>> -h->free_huge_pages--;
>> -h->free_huge_pages_node[nid]--;
>> -return page;
>> +return NULL;
>>  }
>>  
>>  static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t 
>> gfp_mask, int nid,
>> 

-- 
Wei Yang
Help you, Help me


[PATCH 2/6] ftrace: use fls() to get the bits for dup_hash()

2020-08-30 Thread Wei Yang
The effect here is to get the number of bits, so let's use fls() to do
the job.
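
For example (assuming size > 0): with size = 6 the old loop gives bits = 2
(size/2 = 3 -> 1 -> 0, two iterations), and fls(6) - 1 = 2 as well; with
size = 8 both give 3.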

Signed-off-by: Wei Yang 
---
 kernel/trace/ftrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 12cb535769bc..9021e16fa079 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1370,8 +1370,9 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash 
*src, int size)
/*
 * Make the hash size about 1/2 the # found
 */
-   for (size /= 2; size; size >>= 1)
-   bits++;
+   bits = fls(size);
+   if (bits)
+   bits--;
 
/* Don't allocate too much */
if (bits > FTRACE_HASH_MAX_BITS)
-- 
2.20.1 (Apple Git-117)



[PATCH 1/6] ftrace: define seq_file only for FMODE_READ

2020-08-30 Thread Wei Yang
The purpose of the operation is to get the ftrace_iterator, which is embedded
in the file or in the seq_file for FMODE_WRITE/FMODE_READ respectively. Since
we don't have a seq_file in the FMODE_WRITE case, it is meaningless to cast
file->private_data to a seq_file there.

Let's move the definition to the branch where there is a valid seq_file.

Signed-off-by: Wei Yang 
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index edc233122598..12cb535769bc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5558,7 +5558,6 @@ static void __init set_ftrace_early_filters(void)
 
 int ftrace_regex_release(struct inode *inode, struct file *file)
 {
-   struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
struct trace_parser *parser;
@@ -5566,6 +5565,7 @@ int ftrace_regex_release(struct inode *inode, struct file 
*file)
int ret;
 
if (file->f_mode & FMODE_READ) {
+   struct seq_file *m = file->private_data;
iter = m->private;
seq_release(inode, file);
} else
-- 
2.20.1 (Apple Git-117)



[PATCH 3/6] ftrace: simplify the dyn_ftrace->flags macro

2020-08-30 Thread Wei Yang
All three macros are defined only to be used by ftrace_rec_count(). The same
effect can be achieved with (flags & FTRACE_REF_MAX) directly.

Since no other place uses those macros, remove them for clarity.

Also fix a typo in the comment.

Signed-off-by: Wei Yang 
---
 include/linux/ftrace.h | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 3e3fe779a8c2..23c4d6526998 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -433,7 +433,7 @@ bool is_ftrace_trampoline(unsigned long addr);
  *  DIRECT   - there is a direct function to call
  *
  * When a new ftrace_ops is registered and wants a function to save
- * pt_regs, the rec->flag REGS is set. When the function has been
+ * pt_regs, the rec->flags REGS is set. When the function has been
  * set up to save regs, the REG_EN flag is set. Once a function
  * starts saving regs it will do so until all ftrace_ops are removed
  * from tracing that function.
@@ -451,12 +451,9 @@ enum {
 };
 
 #define FTRACE_REF_MAX_SHIFT   23
-#define FTRACE_FL_BITS 9
-#define FTRACE_FL_MASKED_BITS  ((1UL << FTRACE_FL_BITS) - 1)
-#define FTRACE_FL_MASK (FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT)
 #define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1)
 
-#define ftrace_rec_count(rec)  ((rec)->flags & ~FTRACE_FL_MASK)
+#define ftrace_rec_count(rec)  ((rec)->flags & FTRACE_REF_MAX)
 
 struct dyn_ftrace {
unsigned long   ip; /* address of mcount call-site */
-- 
2.20.1 (Apple Git-117)
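
A quick user-space cross-check of the simplification, using the constants
from the hunk above. It is only a sketch and assumes, as the kernel does,
that ->flags never sets bits above bit 31, i.e. only the reference count
(bits 0-22) and the flag bits (23-31) are used.

#include <stdio.h>
#include <assert.h>

#define FTRACE_REF_MAX_SHIFT	23
#define FTRACE_FL_BITS		9
#define FTRACE_FL_MASKED_BITS	((1UL << FTRACE_FL_BITS) - 1)
#define FTRACE_FL_MASK		(FTRACE_FL_MASKED_BITS << FTRACE_REF_MAX_SHIFT)
#define FTRACE_REF_MAX		((1UL << FTRACE_REF_MAX_SHIFT) - 1)

int main(void)
{
	unsigned long count, flag_bits;

	for (count = 0; count <= FTRACE_REF_MAX; count += 4099) {
		for (flag_bits = 0; flag_bits <= FTRACE_FL_MASKED_BITS;
		     flag_bits += 7) {
			unsigned long flags =
				count | (flag_bits << FTRACE_REF_MAX_SHIFT);

			/* old: mask the flag bits out; new: keep only the
			 * count bits -- identical as long as no bits above
			 * bit 31 are ever set */
			assert((flags & ~FTRACE_FL_MASK) ==
			       (flags & FTRACE_REF_MAX));
		}
	}
	printf("old and new ftrace_rec_count() expressions agree\n");
	return 0;
}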



[PATCH 4/6] ftrace: simplify the calculation of page number for ftrace_page->records

2020-08-30 Thread Wei Yang
Based on the following two reasons, we can simplify the calculation:

  - If the rounded-up page count is not a power of 2, we would
definitely have more than one empty page with the higher order.
  - get_count_order() returns the rounded-up order, so one order lower
could still meet the requirement.

So the calculation can be simplified by lowering the order by one level
when the number of pages is not a power of 2.

Signed-off-by: Wei Yang 
---
 kernel/trace/ftrace.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9021e16fa079..15fcfa16895d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3127,18 +3127,19 @@ static int ftrace_update_code(struct module *mod, 
struct ftrace_page *new_pgs)
 static int ftrace_allocate_records(struct ftrace_page *pg, int count)
 {
int order;
-   int cnt;
+   int pages, cnt;
 
if (WARN_ON(!count))
return -EINVAL;
 
-   order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
+   pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
+   order = get_count_order(pages);
 
/*
 * We want to fill as much as possible. No more than a page
 * may be empty.
 */
-   while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
+   if (!is_power_of_2(pages))
order--;
 
  again:
-- 
2.20.1 (Apple Git-117)
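
Below is a small user-space harness (a sketch with assumed values:
PAGE_SIZE 4096 and ENTRY_SIZE 16, roughly two unsigned longs per dyn_ftrace
on 64-bit; get_count_order() and is_power_of_2() are re-implemented
locally). It compares the order chosen by the old while loop with the one
chosen by the new is_power_of_2() test and reports any count where they
differ.

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE	4096UL
#define ENTRY_SIZE	16UL	/* assumed sizeof(struct dyn_ftrace) */
#define ENTRIES_PER_PAGE	(PAGE_SIZE / ENTRY_SIZE)
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* smallest order with 2^order >= count, like the kernel helper */
static int get_count_order(unsigned long count)
{
	int order = 0;

	while ((1UL << order) < count)
		order++;
	return order;
}

int main(void)
{
	unsigned long count;
	int mismatches = 0;

	for (count = 1; count <= 64 * ENTRIES_PER_PAGE; count++) {
		unsigned long pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);

		/* old: start at the rounded-up order, shrink while more
		 * than one whole page would stay empty */
		int old_order = get_count_order(pages);

		while ((PAGE_SIZE << old_order) / ENTRY_SIZE >=
		       count + ENTRIES_PER_PAGE)
			old_order--;

		/* new: drop one order whenever pages is not a power of 2 */
		int new_order = get_count_order(pages);

		if (!is_power_of_2(pages))
			new_order--;

		if (old_order != new_order) {
			printf("count=%lu: old=%d new=%d\n",
			       count, old_order, new_order);
			mismatches++;
		}
	}
	printf("%d mismatches\n", mismatches);
	return 0;
}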



[PATCH 5/6] ftrace: replace do_for_each_ftrace_rec() with for_ftrace_rec_iter()

2020-08-30 Thread Wei Yang
Now we have two similar pieces of infrastructure to iterate over
ftrace_page and dyn_ftrace:

  * do_for_each_ftrace_rec()
  * for_ftrace_rec_iter()

The 2nd one, for_ftrace_rec_iter(), looks more generic, so preserve it
and replace do_for_each_ftrace_rec() with it.

Signed-off-by: Wei Yang 
---
 kernel/trace/ftrace.c | 94 ---
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 15fcfa16895d..4def668f45ba 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1501,21 +1501,6 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long 
ip, void *regs)
return ret;
 }
 
-/*
- * This is a double for. Do not use 'break' to break out of the loop,
- * you must use a goto.
- */
-#define do_for_each_ftrace_rec(pg, rec)
\
-   for (pg = ftrace_pages_start; pg; pg = pg->next) {  \
-   int _i; \
-   for (_i = 0; _i < pg->index; _i++) {\
-   rec = &pg->records[_i];
-
-#define while_for_each_ftrace_rec()\
-   }   \
-   }
-
-
 static int ftrace_cmp_recs(const void *a, const void *b)
 {
const struct dyn_ftrace *key = a;
@@ -1638,7 +1623,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops 
*ops,
 {
struct ftrace_hash *hash;
struct ftrace_hash *other_hash;
-   struct ftrace_page *pg;
+   struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
bool update = false;
int count = 0;
@@ -1676,11 +1661,13 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops 
*ops,
return false;
}
 
-   do_for_each_ftrace_rec(pg, rec) {
+   for_ftrace_rec_iter(iter) {
int in_other_hash = 0;
int in_hash = 0;
int match = 0;
 
+   rec = ftrace_rec_iter_record(iter);
+
if (rec->flags & FTRACE_FL_DISABLED)
continue;
 
@@ -1797,7 +1784,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops 
*ops,
/* Shortcut, if we handled all records, we are done. */
if (!all && count == hash->count)
return update;
-   } while_for_each_ftrace_rec();
+   }
 
return update;
 }
@@ -1862,7 +1849,7 @@ static int __ftrace_hash_update_ipmodify(struct 
ftrace_ops *ops,
 struct ftrace_hash *old_hash,
 struct ftrace_hash *new_hash)
 {
-   struct ftrace_page *pg;
+   struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec, *end = NULL;
int in_old, in_new;
 
@@ -1881,7 +1868,8 @@ static int __ftrace_hash_update_ipmodify(struct 
ftrace_ops *ops,
return -EINVAL;
 
/* Update rec->flags */
-   do_for_each_ftrace_rec(pg, rec) {
+   for_ftrace_rec_iter(iter) {
+   rec = ftrace_rec_iter_record(iter);
 
if (rec->flags & FTRACE_FL_DISABLED)
continue;
@@ -1899,7 +1887,7 @@ static int __ftrace_hash_update_ipmodify(struct 
ftrace_ops *ops,
rec->flags |= FTRACE_FL_IPMODIFY;
} else /* Removed entry */
rec->flags &= ~FTRACE_FL_IPMODIFY;
-   } while_for_each_ftrace_rec();
+   }
 
return 0;
 
@@ -1907,7 +1895,8 @@ static int __ftrace_hash_update_ipmodify(struct 
ftrace_ops *ops,
end = rec;
 
/* Roll back what we did above */
-   do_for_each_ftrace_rec(pg, rec) {
+   for_ftrace_rec_iter(iter) {
+   rec = ftrace_rec_iter_record(iter);
 
if (rec->flags & FTRACE_FL_DISABLED)
continue;
@@ -1924,7 +1913,7 @@ static int __ftrace_hash_update_ipmodify(struct 
ftrace_ops *ops,
rec->flags &= ~FTRACE_FL_IPMODIFY;
else
rec->flags |= FTRACE_FL_IPMODIFY;
-   } while_for_each_ftrace_rec();
+   }
 
 err_out:
return -EBUSY;
@@ -2517,8 +2506,8 @@ __ftrace_replace_code(struct dyn_ftrace *rec, bool enable)
 
 void __weak ftrace_replace_code(int mod_flags)
 {
+   struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
-   struct ftrace_page *pg;
bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL;
int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL;
int failed;
@@ -2526,7 +2515,8 @@ void __weak ftrace_replace_code(int mod_flags)
if (unlikely(ftrace_disabled))
return;
 
-   do_for_each_ftrace_rec(pg, rec) {
+   for_ftrace_rec_iter(iter) {
+   rec = ftrace_rec_iter_record(iter);
 
if (rec->flags & FTRACE_FL_DISABLED)
 

[PATCH 6/6] ftrace: ftrace_global_list is renamed to ftrace_ops_list

2020-08-30 Thread Wei Yang
Fix the comment to match the code.

Signed-off-by: Wei Yang 
---
 include/linux/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 23c4d6526998..8e1fd97343c6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -218,11 +218,11 @@ extern struct ftrace_ops __rcu *ftrace_ops_list;
 extern struct ftrace_ops ftrace_list_end;
 
 /*
- * Traverse the ftrace_global_list, invoking all entries.  The reason that we
+ * Traverse the ftrace_ops_list, invoking all entries.  The reason that we
  * can use rcu_dereference_raw_check() is that elements removed from this list
  * are simply leaked, so there is no need to interact with a grace-period
  * mechanism.  The rcu_dereference_raw_check() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
+ * concurrent insertions into the ftrace_ops_list.
  *
  * Silly Alpha and silly pointer-speculation compiler optimizations!
  */
-- 
2.20.1 (Apple Git-117)



[PATCH 0/6] ftrace: trivial cleanup

2020-08-30 Thread Wei Yang
Trivial cleanups related to ftrace.

Wei Yang (6):
  ftrace: define seq_file only for FMODE_READ
  ftrace: use fls() to get the bits for dup_hash()
  ftrace: simplify the dyn_ftrace->flags macro
  ftrace: simplify the calculation of page number for
ftrace_page->records
  ftrace: replace do_for_each_ftrace_rec() with for_ftrace_rec_iter()
  ftrace: ftrace_global_list is renamed to ftrace_ops_list

 include/linux/ftrace.h |  11 ++---
 kernel/trace/ftrace.c  | 108 -
 2 files changed, 56 insertions(+), 63 deletions(-)

-- 
2.20.1 (Apple Git-117)



[Patch v3 4/7] mm/hugetlb: count file_region to be added when regions_needed != NULL

2020-08-30 Thread Wei Yang
There are only two cases for function add_reservation_in_range():

* count the file_regions needed and return the number in regions_needed
* do the real list operation without counting

This means it is not necessary to have two parameters to distinguish
these two cases.

Just use regions_needed to separate them.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a02bf430de6f..441b7f7c623e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -321,16 +321,17 @@ static void coalesce_file_region(struct resv_map *resv, 
struct file_region *rg)
}
 }
 
-/* Must be called with resv->lock held. Calling this with count_only == true
- * will count the number of pages to be added but will not modify the linked
- * list. If regions_needed != NULL and count_only == true, then regions_needed
- * will indicate the number of file_regions needed in the cache to carry out to
- * add the regions for this range.
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
  */
 static long add_reservation_in_range(struct resv_map *resv, long f, long t,
 struct hugetlb_cgroup *h_cg,
-struct hstate *h, long *regions_needed,
-bool count_only)
+struct hstate *h, long *regions_needed)
 {
long add = 0;
struct list_head *head = &resv->regions;
@@ -366,14 +367,14 @@ static long add_reservation_in_range(struct resv_map 
*resv, long f, long t,
 */
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
-   if (!count_only) {
+   if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
-   } else if (regions_needed)
+   } else
*regions_needed += 1;
}
 
@@ -385,13 +386,13 @@ static long add_reservation_in_range(struct resv_map 
*resv, long f, long t,
 */
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
-   if (!count_only) {
+   if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
-   } else if (regions_needed)
+   } else
*regions_needed += 1;
}
 
@@ -484,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long 
t,
 retry:
 
/* Count how many regions are actually needed to execute this add. */
-   add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
-true);
+   add_reservation_in_range(resv, f, t, NULL, NULL,
+&actual_regions_needed);
 
/*
 * Check for sufficient descriptors in the cache to accommodate
@@ -513,7 +514,7 @@ static long region_add(struct resv_map *resv, long f, long 
t,
goto retry;
}
 
-   add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+   add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
 
resv->adds_in_progress -= in_regions_needed;
 
@@ -549,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long 
t,
 
spin_lock(&resv->lock);
 
-   /* Count how many hugepages in this range are NOT respresented. */
+   /* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
-  out_regions_needed, true);
+  out_regions_needed);
 
if (*out_regions_needed == 0)
*out_regions_needed = 1;
-- 
2.20.1 (Apple Git-117)
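
The calling convention being introduced, shown as a stand-alone sketch with
hypothetical names (not hugetlb code): passing a non-NULL output pointer
turns the call into a dry run that only counts, while passing NULL makes it
perform the real work.

#include <stdio.h>

/*
 * Hypothetical example: a non-NULL "needed" pointer makes the call a dry
 * run that only counts, a NULL pointer makes it perform the operation.
 */
static long fill_missing(int *slots, int nr, long *needed)
{
	long touched = 0;
	int i;

	for (i = 0; i < nr; i++) {
		if (slots[i] != 0)
			continue;

		touched++;
		if (!needed)
			slots[i] = -1;	/* really fill the hole */
		else
			*needed += 1;	/* dry run: just count */
	}
	return touched;
}

int main(void)
{
	int slots[] = { 1, 0, 3, 0, 0, 6 };
	long needed = 0;

	/* pass 1: count how many entries we would have to allocate */
	fill_missing(slots, 6, &needed);
	printf("would need %ld entries\n", needed);

	/* pass 2: actually modify the array */
	printf("filled %ld entries\n", fill_missing(slots, 6, NULL));
	return 0;
}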



[Patch v3 2/7] mm/hugetlb: remove VM_BUG_ON(!nrg) in get_file_region_entry_from_cache()

2020-08-30 Thread Wei Yang
We are sure to get a valid file_region here, otherwise the
VM_BUG_ON(resv->region_cache_count <= 0) at the very beginning would
already have been triggered.

Let's remove the redundant check.

Signed-off-by: Wei Yang 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db6af2654f12..fbaf49bc1d26 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, 
long from, long to)
 
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
-   VM_BUG_ON(!nrg);
list_del(&nrg->link);
 
nrg->from = from;
-- 
2.20.1 (Apple Git-117)



[Patch v3 3/7] mm/hugetlb: use list_splice to merge two list at once

2020-08-30 Thread Wei Yang
Instead of adding the allocated file_regions to region_cache one by one,
we could use list_splice to merge the two lists at once.

Also, since we know the number of entries in the list, we can increase
the counter directly.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbaf49bc1d26..a02bf430de6f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -443,11 +443,8 @@ static int allocate_file_region_entries(struct resv_map 
*resv,
 
spin_lock(&resv->lock);
 
-   list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
-   list_del(&rg->link);
-   list_add(&rg->link, &resv->region_cache);
-   resv->region_cache_count++;
-   }
+   list_splice(&allocated_regions, &resv->region_cache);
+   resv->region_cache_count += to_allocate;
}
 
return 0;
-- 
2.20.1 (Apple Git-117)
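
A compact user-space re-implementation of the two helpers (a sketch; the
real ones live in include/linux/list.h) showing why a single list_splice()
plus a counter update can replace the per-entry list_del()/list_add() loop.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* simplified list_splice(): move every entry of @list after @head */
static void list_splice(struct list_head *list, struct list_head *head)
{
	struct list_head *first = list->next, *last = list->prev;

	if (first == list)		/* empty source list */
		return;
	first->prev = head;
	last->next = head->next;
	head->next->prev = last;
	head->next = first;
}

static int list_count(struct list_head *head)
{
	struct list_head *pos;
	int n = 0;

	for (pos = head->next; pos != head; pos = pos->next)
		n++;
	return n;
}

int main(void)
{
	struct list_head cache = LIST_HEAD_INIT(cache);
	struct list_head fresh = LIST_HEAD_INIT(fresh);
	struct list_head nodes[5];
	int i, cache_count = 2;

	for (i = 0; i < 2; i++)
		list_add(&nodes[i], &cache);	/* pre-existing cache */
	for (i = 2; i < 5; i++)
		list_add(&nodes[i], &fresh);	/* newly allocated batch */

	/* one splice replaces moving the three entries one by one,
	 * and we already know how many entries we added */
	list_splice(&fresh, &cache);
	cache_count += 3;

	printf("cache holds %d entries (counter says %d)\n",
	       list_count(&cache), cache_count);
	return 0;
}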



[Patch v3 0/7] mm/hugetlb: code refine and simplification

2020-08-30 Thread Wei Yang
Following are some cleanups for hugetlb.

A simple test with tools/testing/selftests/vm/map_hugetlb passes.

v3:
  * rebase on v5.9-rc2 which adjust the last patch a little
v2:
  * drop 5/6/10 since similar patches are merged or under review.
  * adjust 2 based on comment from Mike Kravetz

Wei Yang (7):
  mm/hugetlb: not necessary to coalesce regions recursively
  mm/hugetlb: remove VM_BUG_ON(!nrg) in
get_file_region_entry_from_cache()
  mm/hugetlb: use list_splice to merge two list at once
  mm/hugetlb: count file_region to be added when regions_needed != NULL
  mm/hugetlb: a page from buddy is not on any list
  mm/hugetlb: narrow the hugetlb_lock protection area during preparing
huge page
  mm/hugetlb: take the free hpage during the iteration directly

 mm/hugetlb.c | 71 ++--
 1 file changed, 30 insertions(+), 41 deletions(-)

-- 
2.20.1 (Apple Git-117)



[Patch v3 7/7] mm/hugetlb: take the free hpage during the iteration directly

2020-08-30 Thread Wei Yang
Function dequeue_huge_page_node_exact() iterates the free list and
returns the first valid free hpage.

Instead of breaking out of the loop and re-checking the loop cursor, we
could return from within the loop directly. This removes a redundant
check.

Signed-off-by: Wei Yang 
---
 mm/hugetlb.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7b3357c1dcec..709be7ab65af 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1040,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct 
hstate *h, int nid)
if (nocma && is_migrate_cma_page(page))
continue;
 
-   if (!PageHWPoison(page))
+   if (PageHWPoison(page))
break;
+
+   list_move(&page->lru, &h->hugepage_activelist);
+   set_page_refcounted(page);
+   h->free_huge_pages--;
+   h->free_huge_pages_node[nid]--;
+   return page;
}
 
-   /*
-* if 'non-isolated free hugepage' not found on the list,
-* the allocation fails.
-*/
-   if (&h->hugepage_freelists[nid] == &page->lru)
-   return NULL;
-   list_move(&page->lru, &h->hugepage_activelist);
-   set_page_refcounted(page);
-   h->free_huge_pages--;
-   h->free_huge_pages_node[nid]--;
-   return page;
+   return NULL;
 }
 
 static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t 
gfp_mask, int nid,
-- 
2.20.1 (Apple Git-117)
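
The control-flow shape, as a stand-alone sketch with hypothetical names
rather than the hugetlb code: skip unusable entries with continue, return
the first match from inside the loop, and fall through to a single NULL
return. Note that, as pointed out in the review reply archived earlier, the
PageHWPoison() test in the hunk above should itself be a continue rather
than a break; the sketch follows that corrected form.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the free-list walk in the patch. */
struct item {
	bool poisoned;
	bool reserved;
	int  id;
};

static struct item *find_usable(struct item *items, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct item *it = &items[i];

		if (it->reserved)
			continue;
		if (it->poisoned)
			continue;	/* skip, keep searching */

		/* claim it and return from inside the loop */
		it->reserved = true;
		return it;
	}

	/* nothing usable found */
	return NULL;
}

int main(void)
{
	struct item items[] = {
		{ .poisoned = true,  .id = 0 },
		{ .poisoned = false, .id = 1 },
		{ .poisoned = false, .id = 2 },
	};
	struct item *it = find_usable(items, 3);

	printf("picked item %d\n", it ? it->id : -1);
	return 0;
}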



[Patch v3 5/7] mm/hugetlb: a page from buddy is not on any list

2020-08-30 Thread Wei Yang
A page allocated from the buddy allocator is not on any list, so just
using list_add() is enough.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 441b7f7c623e..c9b292e664c4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2405,7 +2405,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
-   list_move(&page->lru, &h->hugepage_activelist);
+   list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
-- 
2.20.1 (Apple Git-117)



[Patch v3 6/7] mm/hugetlb: narrow the hugetlb_lock protection area during preparing huge page

2020-08-30 Thread Wei Yang
set_hugetlb_cgroup[_rsvd]() just manipulates page-local data, which does
not need to be protected by hugetlb_lock.

Let's move it out of the locked region.

Signed-off-by: Wei Yang 
Reviewed-by: Baoquan He 
Reviewed-by: Mike Kravetz 
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c9b292e664c4..7b3357c1dcec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1493,9 +1493,9 @@ static void prep_new_huge_page(struct hstate *h, struct 
page *page, int nid)
 {
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-   spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
+   spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
-- 
2.20.1 (Apple Git-117)
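
The same locking idea as a minimal, stand-alone pthread sketch
(illustrative only, not hugetlb code): setup that touches only the new,
not-yet-visible object happens before taking the lock; only the shared
counters stay inside the critical section.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long nr_objects;		/* shared state guarded by pool_lock */

struct obj {
	int   id;
	void *priv;
};

static struct obj *prep_new_obj(int id)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return NULL;

	/* object-local setup: nobody else can see 'o' yet, so no lock */
	o->id = id;
	o->priv = NULL;

	/* only the shared counter update needs the lock */
	pthread_mutex_lock(&pool_lock);
	nr_objects++;
	pthread_mutex_unlock(&pool_lock);

	return o;
}

int main(void)
{
	struct obj *o = prep_new_obj(1);

	printf("objects: %ld (new id %d)\n", nr_objects, o ? o->id : -1);
	free(o);
	return 0;
}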


