virtio-mem: Probe THP size to determine default block size

Let's allow a minimum block size of 1 MiB in all configurations. Select
the default block size based on
- The page size of the memory backend.
- The THP size if the memory backend size corresponds to the real host
  page size.
- The global minimum of 1 MiB.
and warn if something smaller is configured by the user.

VIRTIO_MEM only supports Linux (depends on LINUX), so we can probe the
THP size unconditionally.

For now we only support virtio-mem on x86-64 - there isn't a user-visible
change (x86-64 only supports 2 MiB THP on the PMD level) - the default
was, and will be 2 MiB.

If we ever have THP on the PUD level (e.g., 1 GiB THP on x86-64), we
expect it to be more transparent - e.g., to only optimize fully populated
ranges unless explicitly told /configured otherwise (in contrast to PMD
THP).

Reviewed-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20201008083029.9504-4-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This commit is contained in:
David Hildenbrand 2020-10-08 10:30:26 +02:00 committed by Michael S. Tsirkin
parent 0aed280061
commit 228957fea3
1 changed files with 101 additions and 4 deletions

View File

@ -33,10 +33,83 @@
#include "trace.h" #include "trace.h"
/* /*
* Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
* memory (e.g., 2MB on x86_64). * bitmap small.
*/ */
#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)QEMU_VMALLOC_ALIGN) #define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || \
defined(__powerpc64__)
#define VIRTIO_MEM_DEFAULT_THP_SIZE ((uint32_t)(2 * MiB))
#else
/* fallback to 1 MiB (e.g., the THP size on s390x) */
#define VIRTIO_MEM_DEFAULT_THP_SIZE VIRTIO_MEM_MIN_BLOCK_SIZE
#endif
/*
* We want to have a reasonable default block size such that
* 1. We avoid splitting THPs when unplugging memory, which degrades
* performance.
* 2. We avoid placing THPs for plugged blocks that also cover unplugged
* blocks.
*
* The actual THP size might differ between Linux kernels, so we try to probe
* it. In the future (if we ever run into issues regarding 2.), we might want
* to disable THP in case we fail to properly probe the THP size, or if the
* block size is configured smaller than the THP size.
*/
static uint32_t thp_size;
#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
static uint32_t virtio_mem_thp_size(void)
{
gchar *content = NULL;
const char *endptr;
uint64_t tmp;
if (thp_size) {
return thp_size;
}
/*
* Try to probe the actual THP size, fallback to (sane but eventually
* incorrect) default sizes.
*/
if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
!qemu_strtou64(content, &endptr, 0, &tmp) &&
(!endptr || *endptr == '\n')) {
/*
* Sanity-check the value, if it's too big (e.g., aarch64 with 64k base
* pages) or weird, fallback to something smaller.
*/
if (!tmp || !is_power_of_2(tmp) || tmp > 16 * MiB) {
warn_report("Read unsupported THP size: %" PRIx64, tmp);
} else {
thp_size = tmp;
}
}
if (!thp_size) {
thp_size = VIRTIO_MEM_DEFAULT_THP_SIZE;
warn_report("Could not detect THP size, falling back to %" PRIx64
" MiB.", thp_size / MiB);
}
g_free(content);
return thp_size;
}
static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
{
const uint64_t page_size = qemu_ram_pagesize(rb);
/* We can have hugetlbfs with a page size smaller than the THP size. */
if (page_size == qemu_real_host_page_size) {
return MAX(page_size, virtio_mem_thp_size());
}
return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
}
/* /*
* Size the usable region bigger than the requested size if possible. Esp. * Size the usable region bigger than the requested size if possible. Esp.
* Linux guests will only add (aligned) memory blocks in case they fully * Linux guests will only add (aligned) memory blocks in case they fully
@ -443,10 +516,23 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
rb = vmem->memdev->mr.ram_block; rb = vmem->memdev->mr.ram_block;
page_size = qemu_ram_pagesize(rb); page_size = qemu_ram_pagesize(rb);
/*
* If the block size wasn't configured by the user, use a sane default. This
* allows using hugetlbfs backends of any page size without manual
* intervention.
*/
if (!vmem->block_size) {
vmem->block_size = virtio_mem_default_block_size(rb);
}
if (vmem->block_size < page_size) { if (vmem->block_size < page_size) {
error_setg(errp, "'%s' property has to be at least the page size (0x%" error_setg(errp, "'%s' property has to be at least the page size (0x%"
PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size); PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
return; return;
} else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
warn_report("'%s' property is smaller than the default block size (%"
PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
virtio_mem_default_block_size(rb) / MiB);
} else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) { } else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
")", VIRTIO_MEM_REQUESTED_SIZE_PROP, ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
@ -742,6 +828,18 @@ static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
const VirtIOMEM *vmem = VIRTIO_MEM(obj); const VirtIOMEM *vmem = VIRTIO_MEM(obj);
uint64_t value = vmem->block_size; uint64_t value = vmem->block_size;
/*
* If not configured by the user (and we're not realized yet), use the
* default block size we would use with the current memory backend.
*/
if (!value) {
if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
} else {
value = virtio_mem_thp_size();
}
}
visit_type_size(v, name, &value, errp); visit_type_size(v, name, &value, errp);
} }
@ -821,7 +919,6 @@ static void virtio_mem_instance_init(Object *obj)
{ {
VirtIOMEM *vmem = VIRTIO_MEM(obj); VirtIOMEM *vmem = VIRTIO_MEM(obj);
vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
notifier_list_init(&vmem->size_change_notifiers); notifier_list_init(&vmem->size_change_notifiers);
vmem->precopy_notifier.notify = virtio_mem_precopy_notify; vmem->precopy_notifier.notify = virtio_mem_precopy_notify;