Re: [PATCH v2 1/5] virtio-mem: Probe THP size to determine default block size

2020-09-29 Thread Pankaj Gupta
> >> Let's allow a minimum block size of 1 MiB in all configurations. Select
> >> the default block size based on
> >> - The page size of the memory backend.
> >> - The THP size if the memory backend size corresponds to the real hsot
> >
> > s/hsot/host
>
> thanks!
>
> >>   page size.
> >> - The global minimum of 1 MiB.
> >> and warn if something smaller is configured by the user.
> >>
> >> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
> >> THP size unconditionally.
> >>
> >> For now we only support virtio-mem on x86-64 - there isn't a user-visiable
> >
> > s/visiable/visible
>
> thanks!
>
> >> change (x86-64 only supports 2 MiB THP on the PMD level) - the default
> >> was, and will be 2 MiB.
> >>
> >> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
> >> expect to have a trigger to explicitly opt-in for the new THP granularity.
> >>
> >> Cc: "Michael S. Tsirkin" 
> >> Cc: Wei Yang 
> >> Cc: Dr. David Alan Gilbert 
> >> Cc: Igor Mammedov 
> >> Cc: Pankaj Gupta 
> >> Signed-off-by: David Hildenbrand 
> >> ---
> >>  hw/virtio/virtio-mem.c | 105 +++--
> >>  1 file changed, 101 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> >> index 8fbec77ccc..9b1461cf9d 100644
> >> --- a/hw/virtio/virtio-mem.c
> >> +++ b/hw/virtio/virtio-mem.c
> >> @@ -33,10 +33,83 @@
> >>  #include "trace.h"
> >>
> >>  /*
> >> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
> >> - * memory (e.g., 2MB on x86_64).
> >> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the 
> >> tracking
> >> + * bitmap small.
> >>   */
> >> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
> >> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
> >> +
> >> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
> >> +defined(__powerpc64__)
> >> +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
> >> +#else
> >> +/* fallback to 1 MiB (e.g., the THP size on s390x) */
> >> +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
> >> +#endif
> >> +
> >> +/*
> >> + * We want to have a reasonable default block size such that
> >> + * 1. We avoid splitting THPs when unplugging memory, which degrades
> >> + *performance.
> >> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged
> >> + *blocks.
> >> + *
> >> + * The actual THP size might differ between Linux kernels, so we try to 
> >> probe
> >> + * it. In the future (if we ever run into issues regarding 2.), we might 
> >> want
> >> + * to disable THP in case we fail to properly probe the THP size, or if 
> >> the
> >> + * block size is configured smaller than the THP size.
> >> + */
> >> +static uint32_t thp_size;
> >> +
> >> +#define HPAGE_PMD_SIZE_PATH 
> >> "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> >> +static uint32_t virtio_mem_thp_size(void)
> >> +{
> >> +gchar *content = NULL;
> >> +const char *endptr;
> >> +uint64_t tmp;
> >> +
> >> +if (thp_size) {
> >> +return thp_size;
> >> +}
> >> +
> >> +/*
> >> + * Try to probe the actual THP size, fallback to (sane but eventually
> >> + * incorrect) default sizes.
> >> + */
> >> +if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
> >> +!qemu_strtou64(content, &endptr, 0, &tmp) &&
> >> +(!endptr || *endptr == '\n')) {
> >> +/*
> >> + * Sanity-check the value, if it's too big (e.g., aarch64 with 
> >> 64k base
> >> + * pages) or weird, fallback to something smaller.
> >> + */
> >> +if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
> >> +warn_report("Read unsupported THP size: %" PRIx64, tmp);
> >> +} else {
> >> +thp_size = tmp;
> >> +}
> >> +}
> >> +
> >> +if (!thp_size) {
> >> +thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
> >> +warn_report("Could not detect THP size, falling back to %" PRIx64
> >> +"  MiB.", thp_size / MiB);
> >> +}
> >> +
> >> +g_free(content);
> >> +return thp_size;
> >> +}
> >> +
> >> +static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
> >> +{
> >> +const uint64_t page_size = qemu_ram_pagesize(rb);
> >> +
> >> +/* We can have hugetlbfs with a page size smaller than the THP size. 
> >> */
> >> +if (page_size == qemu_real_host_page_size) {
> >> +return MAX(page_size, virtio_mem_thp_size());
> >> +}
> >
> > This condition is special, can think of hugetlbfs smaller in size than THP 
> > size
> > configured.
>
> Yeah, there are weird architectures, most prominently arm64:
>
> https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html
>
> Assume you're on 64K base pages with a probed 512MB THP size
> (currently). You could have hugetlbfs with 2MB page size via "CONT PTE"
> bits.

Ok. I understand now. Thanks for explain

Re: [PATCH v2 1/5] virtio-mem: Probe THP size to determine default block size

2020-09-29 Thread David Hildenbrand
On 29.09.20 15:54, Pankaj Gupta wrote:
>> Let's allow a minimum block size of 1 MiB in all configurations. Select
>> the default block size based on
>> - The page size of the memory backend.
>> - The THP size if the memory backend size corresponds to the real hsot
> 
> s/hsot/host

thanks!

>>   page size.
>> - The global minimum of 1 MiB.
>> and warn if something smaller is configured by the user.
>>
>> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
>> THP size unconditionally.
>>
>> For now we only support virtio-mem on x86-64 - there isn't a user-visiable
> 
> s/visiable/visible

thanks!

>> change (x86-64 only supports 2 MiB THP on the PMD level) - the default
>> was, and will be 2 MiB.
>>
>> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
>> expect to have a trigger to explicitly opt-in for the new THP granularity.
>>
>> Cc: "Michael S. Tsirkin" 
>> Cc: Wei Yang 
>> Cc: Dr. David Alan Gilbert 
>> Cc: Igor Mammedov 
>> Cc: Pankaj Gupta 
>> Signed-off-by: David Hildenbrand 
>> ---
>>  hw/virtio/virtio-mem.c | 105 +++--
>>  1 file changed, 101 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
>> index 8fbec77ccc..9b1461cf9d 100644
>> --- a/hw/virtio/virtio-mem.c
>> +++ b/hw/virtio/virtio-mem.c
>> @@ -33,10 +33,83 @@
>>  #include "trace.h"
>>
>>  /*
>> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
>> - * memory (e.g., 2MB on x86_64).
>> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the 
>> tracking
>> + * bitmap small.
>>   */
>> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
>> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
>> +
>> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
>> +defined(__powerpc64__)
>> +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
>> +#else
>> +/* fallback to 1 MiB (e.g., the THP size on s390x) */
>> +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
>> +#endif
>> +
>> +/*
>> + * We want to have a reasonable default block size such that
>> + * 1. We avoid splitting THPs when unplugging memory, which degrades
>> + *performance.
>> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged
>> + *blocks.
>> + *
>> + * The actual THP size might differ between Linux kernels, so we try to 
>> probe
>> + * it. In the future (if we ever run into issues regarding 2.), we might 
>> want
>> + * to disable THP in case we fail to properly probe the THP size, or if the
>> + * block size is configured smaller than the THP size.
>> + */
>> +static uint32_t thp_size;
>> +
>> +#define HPAGE_PMD_SIZE_PATH 
>> "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
>> +static uint32_t virtio_mem_thp_size(void)
>> +{
>> +gchar *content = NULL;
>> +const char *endptr;
>> +uint64_t tmp;
>> +
>> +if (thp_size) {
>> +return thp_size;
>> +}
>> +
>> +/*
>> + * Try to probe the actual THP size, fallback to (sane but eventually
>> + * incorrect) default sizes.
>> + */
>> +if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
>> +!qemu_strtou64(content, &endptr, 0, &tmp) &&
>> +(!endptr || *endptr == '\n')) {
>> +/*
>> + * Sanity-check the value, if it's too big (e.g., aarch64 with 64k 
>> base
>> + * pages) or weird, fallback to something smaller.
>> + */
>> +if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
>> +warn_report("Read unsupported THP size: %" PRIx64, tmp);
>> +} else {
>> +thp_size = tmp;
>> +}
>> +}
>> +
>> +if (!thp_size) {
>> +thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
>> +warn_report("Could not detect THP size, falling back to %" PRIx64
>> +"  MiB.", thp_size / MiB);
>> +}
>> +
>> +g_free(content);
>> +return thp_size;
>> +}
>> +
>> +static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
>> +{
>> +const uint64_t page_size = qemu_ram_pagesize(rb);
>> +
>> +/* We can have hugetlbfs with a page size smaller than the THP size. */
>> +if (page_size == qemu_real_host_page_size) {
>> +return MAX(page_size, virtio_mem_thp_size());
>> +}
> 
> This condition is special, can think of hugetlbfs smaller in size than THP 
> size
> configured.

Yeah, there are weird architectures, most prominently arm64:

https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html

Assume you're on 64K base pages with a probed 512MB THP size
(currently). You could have hugetlbfs with 2MB page size via "CONT PTE"
bits.

>> +return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
> 
> Do we still need this? Or we can have only one return for both the cases?
> Probably, I am missing something here.

We still need it. Assume somebody would have 64K hugetlbfs on arm64
(with 4k base pages), we want to mak

Re: [PATCH v2 1/5] virtio-mem: Probe THP size to determine default block size

2020-09-29 Thread Pankaj Gupta
> Let's allow a minimum block size of 1 MiB in all configurations. Select
> the default block size based on
> - The page size of the memory backend.
> - The THP size if the memory backend size corresponds to the real hsot

s/hsot/host
>   page size.
> - The global minimum of 1 MiB.
> and warn if something smaller is configured by the user.
>
> VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
> THP size unconditionally.
>
> For now we only support virtio-mem on x86-64 - there isn't a user-visiable

s/visiable/visible
> change (x86-64 only supports 2 MiB THP on the PMD level) - the default
> was, and will be 2 MiB.
>
> If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
> expect to have a trigger to explicitly opt-in for the new THP granularity.
>
> Cc: "Michael S. Tsirkin" 
> Cc: Wei Yang 
> Cc: Dr. David Alan Gilbert 
> Cc: Igor Mammedov 
> Cc: Pankaj Gupta 
> Signed-off-by: David Hildenbrand 
> ---
>  hw/virtio/virtio-mem.c | 105 +++--
>  1 file changed, 101 insertions(+), 4 deletions(-)
>
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index 8fbec77ccc..9b1461cf9d 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -33,10 +33,83 @@
>  #include "trace.h"
>
>  /*
> - * Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
> - * memory (e.g., 2MB on x86_64).
> + * Let's not allow blocks smaller than 1 MiB, for example, to keep the 
> tracking
> + * bitmap small.
>   */
> -#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN)
> +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
> +
> +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
> +defined(__powerpc64__)
> +#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
> +#else
> +/* fallback to 1 MiB (e.g., the THP size on s390x) */
> +#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
> +#endif
> +
> +/*
> + * We want to have a reasonable default block size such that
> + * 1. We avoid splitting THPs when unplugging memory, which degrades
> + *performance.
> + * 2. We avoid placing THPs for plugged blocks that also cover unplugged
> + *blocks.
> + *
> + * The actual THP size might differ between Linux kernels, so we try to probe
> + * it. In the future (if we ever run into issues regarding 2.), we might want
> + * to disable THP in case we fail to properly probe the THP size, or if the
> + * block size is configured smaller than the THP size.
> + */
> +static uint32_t thp_size;
> +
> +#define HPAGE_PMD_SIZE_PATH 
> "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> +static uint32_t virtio_mem_thp_size(void)
> +{
> +gchar *content = NULL;
> +const char *endptr;
> +uint64_t tmp;
> +
> +if (thp_size) {
> +return thp_size;
> +}
> +
> +/*
> + * Try to probe the actual THP size, fallback to (sane but eventually
> + * incorrect) default sizes.
> + */
> +if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
> +!qemu_strtou64(content, &endptr, 0, &tmp) &&
> +(!endptr || *endptr == '\n')) {
> +/*
> + * Sanity-check the value, if it's too big (e.g., aarch64 with 64k 
> base
> + * pages) or weird, fallback to something smaller.
> + */
> +if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
> +warn_report("Read unsupported THP size: %" PRIx64, tmp);
> +} else {
> +thp_size = tmp;
> +}
> +}
> +
> +if (!thp_size) {
> +thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
> +warn_report("Could not detect THP size, falling back to %" PRIx64
> +"  MiB.", thp_size / MiB);
> +}
> +
> +g_free(content);
> +return thp_size;
> +}
> +
> +static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
> +{
> +const uint64_t page_size = qemu_ram_pagesize(rb);
> +
> +/* We can have hugetlbfs with a page size smaller than the THP size. */
> +if (page_size == qemu_real_host_page_size) {
> +return MAX(page_size, virtio_mem_thp_size());
> +}

This condition is special, can think of hugetlbfs smaller in size than THP size
configured.
> +return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);

Do we still need this? Or we can have only one return for both the cases?
Probably, I am missing something here.
> +}
> +
>  /*
>   * Size the usable region bigger than the requested size if possible. Esp.
>   * Linux guests will only add (aligned) memory blocks in case they fully
> @@ -437,10 +510,23 @@ static void virtio_mem_device_realize(DeviceState *dev, 
> Error **errp)
>  rb = vmem->memdev->mr.ram_block;
>  page_size = qemu_ram_pagesize(rb);
>
> +/*
> + * If the block size wasn't configured by the user, use a sane default. 
> This
> + * allows using hugetlbfs backends of any page size without manual
> + * intervention.
> + */
> +if (!vmem->bloc