diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 589858cb3203..5499fc06ed5a 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -285,25 +285,6 @@ int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, return 0; } -static inline int papr_scm_node(int node) -{ - int min_dist = INT_MAX, dist; - int nid, min_node; - - if ((node == NUMA_NO_NODE) || node_online(node)) - return node; - - min_node = first_online_node; - for_each_online_node(nid) { - dist = node_distance(node, nid); - if (dist < min_dist) { - min_dist = dist; - min_node = nid; - } - } - return min_node; -} - static int papr_scm_nvdimm_init(struct papr_scm_priv *p) { struct device *dev = &p->pdev->dev; @@ -349,7 +330,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) memset(&ndr_desc, 0, sizeof(ndr_desc)); target_nid = dev_to_node(&p->pdev->dev); - online_nid = papr_scm_node(target_nid); + online_nid = numa_map_to_online_node(target_nid); ndr_desc.numa_node = online_nid; ndr_desc.target_node = target_nid; ndr_desc.res = &p->res; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index beea77046f9b..d4e446daf457 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1664,6 +1664,7 @@ config X86_PMEM_LEGACY depends on PHYS_ADDR_T_64BIT depends on BLK_DEV select X86_PMEM_LEGACY_DEVICE + select NUMA_KEEP_MEMINFO if NUMA select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 99f7a68738f0..59ba008504dc 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -25,11 +25,8 @@ nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static struct numa_meminfo numa_meminfo -#ifndef CONFIG_MEMORY_HOTPLUG -__initdata -#endif -; +static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; @@ -168,6 +165,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -237,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - /* make sure all blocks are inside the limits */ + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty or non-exist block */ - if (bi->start >= bi->end || - !memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) + /* and there's no empty block */ + if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); } @@ -881,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) { - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; int i; for (i = 0; i < mi->nr_blks; i++) if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(phys_addr_t start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. + */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index eadbf90e65d1..47b4969d9b93 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -72,47 +72,6 @@ int acpi_map_pxm_to_node(int pxm) } EXPORT_SYMBOL(acpi_map_pxm_to_node); -/** - * acpi_map_pxm_to_online_node - Map proximity ID to online node - * @pxm: ACPI proximity ID - * - * This is similar to acpi_map_pxm_to_node(), but always returns an online - * node. When the mapped node from a given proximity ID is offline, it - * looks up the node distance table and returns the nearest online node. - * - * ACPI device drivers, which are called after the NUMA initialization has - * completed in the kernel, can call this interface to obtain their device - * NUMA topology from ACPI tables. Such drivers do not have to deal with - * offline nodes. A node may be offline when a device proximity ID is - * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. - * "numa=off" on x86. - */ -int acpi_map_pxm_to_online_node(int pxm) -{ - int node, min_node; - - node = acpi_map_pxm_to_node(pxm); - - if (node == NUMA_NO_NODE) - node = 0; - - min_node = node; - if (!node_online(node)) { - int min_dist = INT_MAX, dist, n; - - for_each_online_node(n) { - dist = node_distance(node, n); - if (dist < min_dist) { - min_dist = dist; - min_node = n; - } - } - } - - return min_node; -} -EXPORT_SYMBOL(acpi_map_pxm_to_online_node); - static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c index e02f60ad6c99..4cd18be9d0e9 100644 --- a/drivers/nvdimm/e820.c +++ b/drivers/nvdimm/e820.c @@ -7,6 +7,7 @@ #include #include #include +#include static int e820_pmem_remove(struct platform_device *pdev) { @@ -16,27 +17,16 @@ static int e820_pmem_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_MEMORY_HOTPLUG -static int e820_range_to_nid(resource_size_t addr) -{ - return memory_add_physaddr_to_nid(addr); -} -#else -static int e820_range_to_nid(resource_size_t addr) -{ - return NUMA_NO_NODE; -} -#endif - static int e820_register_one(struct resource *res, void *data) { struct nd_region_desc ndr_desc; struct nvdimm_bus *nvdimm_bus = data; + int nid = phys_to_target_node(res->start); memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.res = res; - ndr_desc.numa_node = e820_range_to_nid(res->start); - ndr_desc.target_node = ndr_desc.numa_node; + ndr_desc.numa_node = numa_map_to_online_node(nid); + ndr_desc.target_node = nid; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) return -ENXIO; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 0f24d701fbdc..3839363081f3 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -416,9 +416,30 @@ extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA -int acpi_map_pxm_to_online_node(int pxm); int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); + +/** + * acpi_map_pxm_to_online_node - Map proximity ID to online node + * @pxm: ACPI proximity ID + * + * This is similar to acpi_map_pxm_to_node(), but always returns an online + * node. When the mapped node from a given proximity ID is offline, it + * looks up the node distance table and returns the nearest online node. + * + * ACPI device drivers, which are called after the NUMA initialization has + * completed in the kernel, can call this interface to obtain their device + * NUMA topology from ACPI tables. Such drivers do not have to deal with + * offline nodes. A node may be offline when a device proximity ID is + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. + * "numa=off" on x86. + */ +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + int node = acpi_map_pxm_to_node(pxm); + + return numa_map_to_online_node(node); +} #else static inline int acpi_map_pxm_to_online_node(int pxm) { diff --git a/include/linux/numa.h b/include/linux/numa.h index 110b0e5d0fb0..a42df804679e 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H - +#include #ifdef CONFIG_NODES_SHIFT #define NODES_SHIFT CONFIG_NODES_SHIFT @@ -13,4 +13,32 @@ #define NUMA_NO_NODE (-1) +/* optionally keep NUMA memory info available post init */ +#ifdef CONFIG_NUMA_KEEP_MEMINFO +#define __initdata_or_meminfo +#else +#define __initdata_or_meminfo __initdata +#endif + +#ifdef CONFIG_NUMA +/* Generic implementation available */ +int numa_map_to_online_node(int node); + +/* + * Optional architecture specific implementation, users need a "depends + * on $ARCH" + */ +int phys_to_target_node(phys_addr_t addr); +#else +static inline int numa_map_to_online_node(int node) +{ + return NUMA_NO_NODE; +} + +static inline int phys_to_target_node(phys_addr_t addr) +{ + return NUMA_NO_NODE; +} +#endif + #endif /* _LINUX_NUMA_H */ diff --git a/mm/Kconfig b/mm/Kconfig index ab80933be65f..328268473fec 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -139,6 +139,10 @@ config HAVE_FAST_GUP config ARCH_KEEP_MEMBLOCK bool +# Keep arch NUMA mapping infrastructure post-init. +config NUMA_KEEP_MEMINFO + bool + config MEMORY_ISOLATION bool @@ -154,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE def_bool y diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 977c641f78cf..19f7e71945a7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -127,6 +127,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/** + * numa_map_to_online_node - Find closest online node + * @nid: Node id to start the search + * + * Lookup the next closest node by distance if @nid is not online. + */ +int numa_map_to_online_node(int node) +{ + int min_dist = INT_MAX, dist, n, min_node; + + if (node == NUMA_NO_NODE || node_online(node)) + return node; + + min_node = node; + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(numa_map_to_online_node); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy;