2008-01-30 20:30:37 +08:00
|
|
|
/*
|
2005-04-17 06:20:36 +08:00
|
|
|
* Generic VM initialization for x86-64 NUMA setups.
|
|
|
|
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
|
2008-01-30 20:30:37 +08:00
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/bootmem.h>
|
2010-08-26 04:39:17 +08:00
|
|
|
#include <linux/memblock.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/ctype.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/nodemask.h>
|
2008-01-30 20:33:11 +08:00
|
|
|
#include <linux/sched.h>
|
2011-02-16 19:13:06 +08:00
|
|
|
#include <linux/acpi.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/e820.h>
|
|
|
|
#include <asm/proto.h>
|
|
|
|
#include <asm/dma.h>
|
|
|
|
#include <asm/numa.h>
|
|
|
|
#include <asm/acpi.h>
|
2010-09-18 00:03:43 +08:00
|
|
|
#include <asm/amd_nb.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-02-17 00:11:08 +08:00
|
|
|
/*
 * One contiguous range of physical memory owned by a single node.
 * The range is half-open: @start is the first byte address covered and
 * @end is the first byte address past it.
 */
struct numa_memblk {
	u64 start;
	u64 end;
	int nid;
};

/*
 * A table of memory blocks.  Only the first @nr_blks entries of @blk are
 * in use.
 */
struct numa_meminfo {
	int nr_blks;
	struct numa_memblk blk[NR_NODE_MEMBLKS];
};
|
|
|
|
|
2005-09-07 06:17:45 +08:00
|
|
|
/* Per-node pglist_data pointers; an entry is filled in by setup_node_bootmem(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/*
 * Starting node mask; numa_set_distance() and numa_register_memblks()
 * copy it and then add every node that owns memory in numa_meminfo.
 */
nodemask_t numa_nodes_parsed __initdata;

/* Backing object of the memnodemap; its embedded_map is used when the map fits. */
struct memnode memnode;

/*
 * Physical address and byte size of a separately allocated memnodemap.
 * Both are reset to 0 when that allocation fails.
 */
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

/* Memory blocks registered through numa_add_memblk(). */
static struct numa_meminfo numa_meminfo __initdata;

/*
 * Node distance table, stored flat and indexed as
 * numa_distance[from * numa_distance_cnt + to].
 */
static int numa_distance_cnt;
static u8 *numa_distance;

#ifdef CONFIG_NUMA_EMU
/* When set, __node_distance() defers to acpi_emu_node_distance(). */
static bool numa_emu_dist;
#endif
|
|
|
|
|
2005-11-06 00:25:54 +08:00
|
|
|
/*
|
|
|
|
* Given a shift value, try to populate memnodemap[]
|
|
|
|
* Returns :
|
|
|
|
* 1 if OK
|
|
|
|
* 0 if memnodmap[] too small (of shift too small)
|
|
|
|
* -1 if node overlap or lost ram (shift too big)
|
|
|
|
*/
|
2011-02-17 00:11:08 +08:00
|
|
|
static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-11-06 00:25:54 +08:00
|
|
|
unsigned long addr, end;
|
2008-01-30 20:30:37 +08:00
|
|
|
int i, res = -1;
|
2005-07-29 12:15:38 +08:00
|
|
|
|
2008-01-30 20:33:25 +08:00
|
|
|
memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
|
2011-02-17 00:11:08 +08:00
|
|
|
for (i = 0; i < mi->nr_blks; i++) {
|
|
|
|
addr = mi->blk[i].start;
|
|
|
|
end = mi->blk[i].end;
|
2005-11-06 00:25:54 +08:00
|
|
|
if (addr >= end)
|
2005-07-29 12:15:38 +08:00
|
|
|
continue;
|
2007-02-13 20:26:19 +08:00
|
|
|
if ((end >> shift) >= memnodemapsize)
|
2005-11-06 00:25:54 +08:00
|
|
|
return 0;
|
|
|
|
do {
|
2008-01-30 20:33:25 +08:00
|
|
|
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
|
2005-07-29 12:15:38 +08:00
|
|
|
return -1;
|
2011-02-17 00:11:08 +08:00
|
|
|
memnodemap[addr >> shift] = mi->blk[i].nid;
|
2007-02-13 20:26:19 +08:00
|
|
|
addr += (1UL << shift);
|
2005-11-06 00:25:54 +08:00
|
|
|
} while (addr < end);
|
|
|
|
res = 1;
|
2008-01-30 20:30:37 +08:00
|
|
|
}
|
2005-11-06 00:25:54 +08:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2007-02-13 20:26:19 +08:00
|
|
|
/*
 * Point memnodemap at storage large enough for memnodemapsize entries.
 *
 * The map embedded in 'memnode' is used when it is big enough; otherwise
 * a cache-line aligned region is found and reserved through memblock.
 *
 * Returns 0 on success, -1 if no memory could be found.
 */
static int __init allocate_cachealigned_memnodemap(void)
{
	/* lowest physical address considered for the allocation */
	const unsigned long search_start = 0x8000;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = memblock_find_in_range(search_start, get_max_mapped(),
					      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr != MEMBLOCK_ERROR) {
		memnodemap = phys_to_virt(nodemap_addr);
		memblock_x86_reserve_range(nodemap_addr,
					   nodemap_addr + nodemap_size,
					   "MEMNODEMAP");

		printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
		       nodemap_addr, nodemap_addr + nodemap_size);
		return 0;
	}

	printk(KERN_ERR
	       "NUMA: Unable to allocate Memory to Node hash map\n");
	nodemap_size = 0;
	nodemap_addr = 0;
	return -1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The LSB of all start and end addresses in the node map is the value of the
|
|
|
|
* maximum possible shift.
|
|
|
|
*/
|
2011-02-17 00:11:08 +08:00
|
|
|
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
|
2005-11-06 00:25:54 +08:00
|
|
|
{
|
2007-02-13 20:26:20 +08:00
|
|
|
int i, nodes_used = 0;
|
2007-02-13 20:26:19 +08:00
|
|
|
unsigned long start, end;
|
|
|
|
unsigned long bitfield = 0, memtop = 0;
|
|
|
|
|
2011-02-17 00:11:08 +08:00
|
|
|
for (i = 0; i < mi->nr_blks; i++) {
|
|
|
|
start = mi->blk[i].start;
|
|
|
|
end = mi->blk[i].end;
|
2007-02-13 20:26:19 +08:00
|
|
|
if (start >= end)
|
|
|
|
continue;
|
2007-02-13 20:26:20 +08:00
|
|
|
bitfield |= start;
|
|
|
|
nodes_used++;
|
2007-02-13 20:26:19 +08:00
|
|
|
if (end > memtop)
|
|
|
|
memtop = end;
|
|
|
|
}
|
2007-02-13 20:26:20 +08:00
|
|
|
if (nodes_used <= 1)
|
|
|
|
i = 63;
|
|
|
|
else
|
|
|
|
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
|
2007-02-13 20:26:19 +08:00
|
|
|
memnodemapsize = (memtop >> i)+1;
|
|
|
|
return i;
|
|
|
|
}
|
2005-11-06 00:25:54 +08:00
|
|
|
|
2011-02-17 00:11:08 +08:00
|
|
|
/*
 * Pick the hash shift for @mi, allocate the memnodemap and populate it.
 *
 * Returns the shift on success, -1 if the map could not be allocated or
 * populated.
 */
static int __init compute_hash_shift(const struct numa_meminfo *mi)
{
	const int shift = extract_lsb_from_nodes(mi);

	if (allocate_cachealigned_memnodemap() != 0)
		return -1;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

	if (populate_memnodemap(mi, shift) == 1)
		return shift;

	printk(KERN_INFO "Your memory is not aligned you need to "
	       "rebuild your kernel with a bigger NODEMAPSIZE "
	       "shift=%d\n", shift);
	return -1;
}
|
|
|
|
|
mm: clean up for early_pfn_to_nid()
What's happening is that the assertion in mm/page_alloc.c:move_freepages()
is triggering:
BUG_ON(page_zone(start_page) != page_zone(end_page));
Once I knew this is what was happening, I added some annotations:
if (unlikely(page_zone(start_page) != page_zone(end_page))) {
printk(KERN_ERR "move_freepages: Bogus zones: "
"start_page[%p] end_page[%p] zone[%p]\n",
start_page, end_page, zone);
printk(KERN_ERR "move_freepages: "
"start_zone[%p] end_zone[%p]\n",
page_zone(start_page), page_zone(end_page));
printk(KERN_ERR "move_freepages: "
"start_pfn[0x%lx] end_pfn[0x%lx]\n",
page_to_pfn(start_page), page_to_pfn(end_page));
printk(KERN_ERR "move_freepages: "
"start_nid[%d] end_nid[%d]\n",
page_to_nid(start_page), page_to_nid(end_page));
...
And here's what I got:
move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00]
move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00]
move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff]
move_freepages: start_nid[1] end_nid[0]
My memory layout on this box is:
[ 0.000000] Zone PFN ranges:
[ 0.000000] Normal 0x00000000 -> 0x0081ff5d
[ 0.000000] Movable zone start PFN for each node
[ 0.000000] early_node_map[8] active PFN ranges
[ 0.000000] 0: 0x00000000 -> 0x00020000
[ 0.000000] 1: 0x00800000 -> 0x0081f7ff
[ 0.000000] 1: 0x0081f800 -> 0x0081fe50
[ 0.000000] 1: 0x0081fed1 -> 0x0081fed8
[ 0.000000] 1: 0x0081feda -> 0x0081fedb
[ 0.000000] 1: 0x0081fedd -> 0x0081fee5
[ 0.000000] 1: 0x0081fee7 -> 0x0081ff51
[ 0.000000] 1: 0x0081ff59 -> 0x0081ff5d
So it's a block move in that 0x81f600-->0x81f7ff region which triggers
the problem.
This patch:
Declaration of early_pfn_to_nid() is scattered over per-arch include
files, and it seems it's complicated to know when the declaration is used.
I think it makes fix-for-memmap-init not easy.
This patch moves all declaration to include/linux/mm.h
After this,
if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-> Use static definition in include/linux/mm.h
else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-> Use generic definition in mm/page_alloc.c
else
-> per-arch back end function will be called.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: David Miller <davem@davemlloft.net>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@kernel.org> [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-02-19 06:48:32 +08:00
|
|
|
/* Map a page frame number to its node id via phys_to_nid(). */
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	const unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
|
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
/*
 * Find @size bytes (aligned to @align) for node @nodeid.
 *
 * The search first looks inside [@start, @end) on the node itself, with
 * @start pushed above the DMA limit and, when the range straddles it,
 * above the DMA32 limit.  If that fails the search is widened to all
 * mapped memory above the DMA limit.
 *
 * Returns the virtual address of the region found, or NULL.  The region
 * is only located here, not reserved.
 */
static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long found;

	/*
	 * Place it as high as possible; other things will go next to
	 * NODE_DATA.
	 */
	if (start < (MAX_DMA_PFN << PAGE_SHIFT))
		start = MAX_DMA_PFN << PAGE_SHIFT;
	if (start < (MAX_DMA32_PFN << PAGE_SHIFT) &&
	    end > (MAX_DMA32_PFN << PAGE_SHIFT))
		start = MAX_DMA32_PFN << PAGE_SHIFT;

	found = memblock_x86_find_in_range_node(nodeid, start, end,
						size, align);
	if (found == MEMBLOCK_ERROR) {
		/* widen the search to everything mapped above the DMA limit */
		start = MAX_DMA_PFN << PAGE_SHIFT;
		end = max_pfn_mapped << PAGE_SHIFT;
		found = memblock_find_in_range(start, end, size, align);
	}

	if (found == MEMBLOCK_ERROR) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}

	return __va(found);
}
|
|
|
|
|
2011-02-17 00:11:10 +08:00
|
|
|
/*
 * Append the block [@start, @end) for node @nid to @mi.
 *
 * Zero-length blocks are silently ignored; blocks with start > end or an
 * out-of-range @nid are warned about and ignored.  Both cases return 0.
 * Returns -EINVAL only when @mi has no free slot left.
 */
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	struct numa_memblk *slot;

	/* nothing to record for an empty range */
	if (start == end)
		return 0;

	/* complain about, then drop, malformed requests */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
			   nid, start, end);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: too many memblk ranges\n");
		return -EINVAL;
	}

	slot = &mi->blk[mi->nr_blks++];
	slot->start = start;
	slot->end = end;
	slot->nid = nid;
	return 0;
}
|
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
/*
 * Delete entry @idx from @mi, sliding the following entries down by one
 * so the used part of the table stays contiguous.
 */
static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	struct numa_memblk *victim = &mi->blk[idx];
	int tail;

	mi->nr_blks--;
	/* number of entries that followed the removed one */
	tail = mi->nr_blks - idx;
	memmove(victim, victim + 1, tail * sizeof(mi->blk[0]));
}
|
|
|
|
|
2011-02-17 00:11:10 +08:00
|
|
|
/*
 * Record the block [@start, @end) for node @nid in the global
 * numa_meminfo table.  Returns whatever numa_add_memblk_to() returns.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	struct numa_meminfo *mi = &numa_meminfo;

	return numa_add_memblk_to(nid, start, end, mi);
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Initialize bootmem allocator for a node */
|
2009-05-16 04:59:37 +08:00
|
|
|
void __init
|
|
|
|
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
|
2008-01-30 20:30:37 +08:00
|
|
|
{
|
2010-02-10 17:20:20 +08:00
|
|
|
unsigned long start_pfn, last_pfn, nodedata_phys;
|
2009-05-16 04:59:37 +08:00
|
|
|
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
|
2008-03-19 03:52:37 +08:00
|
|
|
int nid;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-04-23 05:19:27 +08:00
|
|
|
if (!end)
|
|
|
|
return;
|
|
|
|
|
2009-05-16 04:59:37 +08:00
|
|
|
/*
|
|
|
|
* Don't confuse VM with a node that doesn't have the
|
|
|
|
* minimum amount of memory:
|
|
|
|
*/
|
|
|
|
if (end && (end - start) < NODE_MIN_SIZE)
|
|
|
|
return;
|
|
|
|
|
2008-07-25 22:48:58 +08:00
|
|
|
start = roundup(start, ZONE_ALIGN);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-02-10 17:20:20 +08:00
|
|
|
printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
|
2008-01-30 20:30:37 +08:00
|
|
|
start, end);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
start_pfn = start >> PAGE_SHIFT;
|
2008-05-12 21:43:36 +08:00
|
|
|
last_pfn = end >> PAGE_SHIFT;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-02 00:49:41 +08:00
|
|
|
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
|
|
|
|
SMP_CACHE_BYTES);
|
2006-04-08 01:49:21 +08:00
|
|
|
if (node_data[nodeid] == NULL)
|
|
|
|
return;
|
|
|
|
nodedata_phys = __pa(node_data[nodeid]);
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
|
2008-02-04 23:47:56 +08:00
|
|
|
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
|
|
|
|
nodedata_phys + pgdat_size - 1);
|
2010-02-10 17:20:15 +08:00
|
|
|
nid = phys_to_nid(nodedata_phys);
|
|
|
|
if (nid != nodeid)
|
|
|
|
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
|
2010-02-10 17:20:20 +08:00
|
|
|
NODE_DATA(nodeid)->node_id = nodeid;
|
2005-04-17 06:20:36 +08:00
|
|
|
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
|
2008-05-12 21:43:36 +08:00
|
|
|
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
node_set_online(nodeid);
|
2008-01-30 20:30:37 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
/*
 * Sanitize @mi: clamp every block to [0, max_pfn), drop blocks that are
 * empty afterwards, reject overlaps between different nodes and merge
 * blocks of the same node when the merged range does not touch memory of
 * another node.  Unused table slots are cleared at the end.
 *
 * Returns 0 on success, -EINVAL if blocks of two different nodes overlap.
 */
static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = (u64)max_pfn << PAGE_SHIFT;
	int i, j, k;

	/*
	 * First pass: trim every entry to the limits.  This has to finish
	 * before any overlap/merge decision is taken, otherwise block i
	 * would be compared against later blocks that still carry
	 * untrimmed ranges.
	 */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* make sure all blocks are inside the limits */
		bi->start = max(bi->start, low);
		bi->end = min(bi->end, high);

		/*
		 * Drop blocks that became empty.  A block lying entirely
		 * above 'high' ends up with start > end after clamping, so
		 * test with >= rather than ==.
		 */
		if (bi->start >= bi->end)
			numa_remove_memblk_from(i--, mi);
	}

	/* Second pass: detect overlaps and merge neighbouring blocks. */
	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			unsigned long start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
					       bi->nid, bi->start, bi->end,
					       bj->nid, bj->start, bj->end);
					return -EINVAL;
				}
				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
					   bi->nid, bi->start, bi->end,
					   bj->start, bj->end);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			/* both blocks are already trimmed to [low, high] */
			start = min(bi->start, bj->start);
			end = max(bi->end, bj->end);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			/* the merged span would swallow another node's memory */
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
			       bi->nid, bi->start, bi->end, bj->start, bj->end,
			       start, end);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	/* clear the unused tail of the table */
	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
|
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
/*
|
|
|
|
* Set nodes, which have memory in @mi, in *@nodemask.
|
|
|
|
*/
|
|
|
|
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
|
|
|
|
const struct numa_meminfo *mi)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
|
|
|
|
if (mi->blk[i].start != mi->blk[i].end &&
|
|
|
|
mi->blk[i].nid != NUMA_NO_NODE)
|
|
|
|
node_set(mi->blk[i].nid, *nodemask);
|
|
|
|
}
|
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
/*
|
|
|
|
* Reset distance table. The current table is freed. The next
|
|
|
|
* numa_set_distance() call will create a new one.
|
|
|
|
*/
|
|
|
|
static void __init numa_reset_distance(void)
|
|
|
|
{
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
size = numa_distance_cnt * sizeof(numa_distance[0]);
|
|
|
|
memblock_x86_free_range(__pa(numa_distance),
|
|
|
|
__pa(numa_distance) + size);
|
|
|
|
numa_distance = NULL;
|
|
|
|
numa_distance_cnt = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set the distance between node @from to @to to @distance. If distance
|
|
|
|
* table doesn't exist, one which is large enough to accomodate all the
|
|
|
|
* currently known nodes will be created.
|
|
|
|
*/
|
|
|
|
void __init numa_set_distance(int from, int to, int distance)
|
|
|
|
{
|
|
|
|
if (!numa_distance) {
|
|
|
|
nodemask_t nodes_parsed;
|
|
|
|
size_t size;
|
|
|
|
int i, j, cnt = 0;
|
|
|
|
u64 phys;
|
|
|
|
|
|
|
|
/* size the new table and allocate it */
|
|
|
|
nodes_parsed = numa_nodes_parsed;
|
|
|
|
numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
|
|
|
|
|
|
|
|
for_each_node_mask(i, nodes_parsed)
|
|
|
|
cnt = i;
|
|
|
|
size = ++cnt * sizeof(numa_distance[0]);
|
|
|
|
|
|
|
|
phys = memblock_find_in_range(0,
|
|
|
|
(u64)max_pfn_mapped << PAGE_SHIFT,
|
|
|
|
size, PAGE_SIZE);
|
|
|
|
if (phys == MEMBLOCK_ERROR) {
|
|
|
|
pr_warning("NUMA: Warning: can't allocate distance table!\n");
|
|
|
|
/* don't retry until explicitly reset */
|
|
|
|
numa_distance = (void *)1LU;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
|
|
|
|
|
|
|
|
numa_distance = __va(phys);
|
|
|
|
numa_distance_cnt = cnt;
|
|
|
|
|
|
|
|
/* fill with the default distances */
|
|
|
|
for (i = 0; i < cnt; i++)
|
|
|
|
for (j = 0; j < cnt; j++)
|
|
|
|
numa_distance[i * cnt + j] = i == j ?
|
|
|
|
LOCAL_DISTANCE : REMOTE_DISTANCE;
|
|
|
|
printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
|
|
|
|
printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
|
|
|
|
from, to, distance);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((u8)distance != distance ||
|
|
|
|
(from == to && distance != LOCAL_DISTANCE)) {
|
|
|
|
pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
|
|
|
|
from, to, distance);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
numa_distance[from * numa_distance_cnt + to] = distance;
|
|
|
|
}
|
|
|
|
|
|
|
|
int __node_distance(int from, int to)
|
|
|
|
{
|
|
|
|
#if defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA_EMU)
|
|
|
|
if (numa_emu_dist)
|
|
|
|
return acpi_emu_node_distance(from, to);
|
|
|
|
#endif
|
|
|
|
if (from >= numa_distance_cnt || to >= numa_distance_cnt)
|
|
|
|
return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
|
|
|
|
return numa_distance[from * numa_distance_cnt + to];
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__node_distance);
|
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
/*
|
|
|
|
* Sanity check to catch more bad NUMA configurations (they are amazingly
|
|
|
|
* common). Make sure the nodes cover all memory.
|
|
|
|
*/
|
2011-02-17 00:11:09 +08:00
|
|
|
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
|
2011-02-17 00:11:09 +08:00
|
|
|
{
|
|
|
|
unsigned long numaram, e820ram;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
numaram = 0;
|
2011-02-17 00:11:09 +08:00
|
|
|
for (i = 0; i < mi->nr_blks; i++) {
|
|
|
|
unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
|
|
|
|
unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
|
2011-02-17 00:11:09 +08:00
|
|
|
numaram += e - s;
|
2011-02-17 00:11:09 +08:00
|
|
|
numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
|
2011-02-17 00:11:09 +08:00
|
|
|
if ((long)numaram < 0)
|
|
|
|
numaram = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
e820ram = max_pfn - (memblock_x86_hole_size(0,
|
|
|
|
max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
|
|
|
|
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
|
|
|
|
if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
|
|
|
|
printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
|
|
|
|
(numaram << PAGE_SHIFT) >> 20,
|
|
|
|
(e820ram << PAGE_SHIFT) >> 20);
|
2011-02-17 00:11:09 +08:00
|
|
|
return false;
|
2011-02-17 00:11:09 +08:00
|
|
|
}
|
2011-02-17 00:11:09 +08:00
|
|
|
return true;
|
2011-02-17 00:11:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Turn the block table @mi into live node state: build node_possible_map,
 * compute the memnodemap hash shift, register the active regions and
 * finally set up every possible node that has memory.
 *
 * Returns 0 on success and -EINVAL when no node is known, no hash shift
 * can be found, or the blocks do not cover memory.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int pass, b, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	memnode_shift = compute_hash_shift(mi);
	if (memnode_shift < 0) {
		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
		return -EINVAL;
	}

	for (b = 0; b < mi->nr_blks; b++) {
		const struct numa_memblk *blk = &mi->blk[b];

		memblock_x86_register_active_regions(blk->nid,
						     blk->start >> PAGE_SHIFT,
						     blk->end >> PAGE_SHIFT);
	}

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	init_memory_mapping_high();

	/*
	 * Finally register nodes.  Do it twice in case setup_node_bootmem
	 * missed one due to missing bootmem.
	 */
	for (pass = 0; pass < 2; pass++) {
		for_each_node_mask(nid, node_possible_map) {
			/* lowest start / highest end over this node's blocks */
			u64 lo = (u64)max_pfn << PAGE_SHIFT;
			u64 hi = 0;

			if (node_online(nid))
				continue;

			for (b = 0; b < mi->nr_blks; b++) {
				const struct numa_memblk *blk = &mi->blk[b];

				if (blk->nid != nid)
					continue;
				if (blk->start < lo)
					lo = blk->start;
				if (blk->end > hi)
					hi = blk->end;
			}

			/* nodes without any block keep lo >= hi and are skipped */
			if (lo < hi)
				setup_node_bootmem(nid, lo, hi);
		}
	}

	return 0;
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_NUMA_EMU
|
2007-02-13 20:26:22 +08:00
|
|
|
/* Numa emulation */

/* Address ranges of the emulated nodes; filled in by setup_node_range(). */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
/* Address range of each physical node; rebuilt by setup_physnodes(). */
static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
/* Emulation command-line string saved by numa_emu_cmdline(). */
static char *emu_cmdline __initdata;

/*
 * Remember @str as the emulation command line.  Only the pointer is
 * stored; the string itself is not copied.
 */
void __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
}
|
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
/*
 * Return the node id of the first block in numa_meminfo that contains
 * @addr, or NUMA_NO_NODE if no block does.
 */
int __init find_node_by_addr(unsigned long addr)
{
	const struct numa_meminfo *mi = &numa_meminfo;
	int n;

	for (n = 0; n < mi->nr_blks; n++) {
		const struct numa_memblk *blk = &mi->blk[n];

		/*
		 * Find the real node that this emulated node appears on.  For
		 * the sake of simplicity, we only use a real node's starting
		 * address to determine which emulated node it appears on.
		 */
		if (addr < blk->start || addr >= blk->end)
			continue;
		return blk->nid;
	}

	return NUMA_NO_NODE;
}
|
|
|
|
|
2011-02-16 19:13:07 +08:00
|
|
|
/*
 * Rebuild physnodes[] from numa_meminfo and clip every node to
 * [@start, @end].
 *
 * Each node's extent is the span from its lowest block start to its
 * highest block end.  Nodes entirely outside the window are emptied.
 * Returns the number of nodes left with memory; if there is none, entry 0
 * is set to the whole window and 1 is returned.
 */
static int __init setup_physnodes(unsigned long start, unsigned long end)
{
	const struct numa_meminfo *mi = &numa_meminfo;
	int found = 0;
	int n;

	memset(physnodes, 0, sizeof(physnodes));

	/* collapse all blocks of a node into one [min start, max end) span */
	for (n = 0; n < mi->nr_blks; n++) {
		const struct numa_memblk *blk = &mi->blk[n];
		struct bootnode *pn = &physnodes[blk->nid];

		if (pn->start == pn->end) {
			/* first block seen for this node */
			pn->start = blk->start;
			pn->end = blk->end;
			continue;
		}
		pn->start = min(pn->start, blk->start);
		pn->end = max(pn->end, blk->end);
	}

	/*
	 * Basic sanity checking on the physical node map: there may be errors
	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
	 * kernel parameter is used.
	 */
	for (n = 0; n < MAX_NUMNODES; n++) {
		struct bootnode *pn = &physnodes[n];

		if (pn->start == pn->end)
			continue;

		/* completely above the window: empty the node */
		if (pn->start > end) {
			pn->end = pn->start;
			continue;
		}
		/* completely below the window: empty the node */
		if (pn->end < start) {
			pn->start = pn->end;
			continue;
		}

		if (pn->start < start)
			pn->start = start;
		if (pn->end > end)
			pn->end = end;
		found++;
	}

	/*
	 * If no physical topology was detected, a single node is faked to cover
	 * the entire address space.
	 */
	if (!found) {
		physnodes[0].start = start;
		physnodes[0].end = end;
		found = 1;
	}

	return found;
}
|
|
|
|
|
2010-12-23 09:23:51 +08:00
|
|
|
/*
 * Hand the @nr_nodes emulated nodes to the ACPI or AMD fake-node hook,
 * depending on which of @acpi / @amd is set (setting both is a bug).
 * When neither is set, every cpu is assigned to node 0 instead.
 */
static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
{
	int cpu;

	BUG_ON(acpi && amd);
#ifdef CONFIG_ACPI_NUMA
	if (acpi)
		acpi_fake_nodes(nodes, nr_nodes);
#endif
#ifdef CONFIG_AMD_NUMA
	if (amd)
		amd_fake_nodes(nodes, nr_nodes);
#endif
	if (acpi || amd)
		return;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
		numa_set_node(cpu, 0);
}
|
|
|
|
|
2007-02-13 20:26:22 +08:00
|
|
|
/*
|
2008-01-30 20:30:37 +08:00
|
|
|
* Setups up nid to range from addr to addr + size. If the end
|
|
|
|
* boundary is greater than max_addr, then max_addr is used instead.
|
|
|
|
* The return value is 0 if there is additional memory left for
|
|
|
|
* allocation past addr and -1 otherwise. addr is adjusted to be at
|
|
|
|
* the end of the node.
|
2007-02-13 20:26:22 +08:00
|
|
|
*/
|
2009-09-26 06:20:09 +08:00
|
|
|
static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
|
2007-02-13 20:26:22 +08:00
|
|
|
{
|
2007-05-03 01:27:09 +08:00
|
|
|
int ret = 0;
|
|
|
|
nodes[nid].start = *addr;
|
|
|
|
*addr += size;
|
|
|
|
if (*addr >= max_addr) {
|
|
|
|
*addr = max_addr;
|
|
|
|
ret = -1;
|
|
|
|
}
|
|
|
|
nodes[nid].end = *addr;
|
2007-05-03 01:27:20 +08:00
|
|
|
node_set(nid, node_possible_map);
|
2007-05-03 01:27:09 +08:00
|
|
|
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
|
|
|
|
nodes[nid].start, nodes[nid].end,
|
|
|
|
(nodes[nid].end - nodes[nid].start) >> 20);
|
|
|
|
return ret;
|
2007-02-13 20:26:22 +08:00
|
|
|
}
|
|
|
|
|
2009-09-26 06:20:09 +08:00
|
|
|
/*
|
|
|
|
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
|
|
|
|
* to max_addr. The return value is the number of nodes allocated.
|
|
|
|
*/
|
2010-12-23 09:23:54 +08:00
|
|
|
static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
|
2009-09-26 06:20:09 +08:00
|
|
|
{
|
|
|
|
nodemask_t physnode_mask = NODE_MASK_NONE;
|
|
|
|
u64 size;
|
|
|
|
int big;
|
|
|
|
int ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (nr_nodes <= 0)
|
|
|
|
return -1;
|
|
|
|
if (nr_nodes > MAX_NUMNODES) {
|
|
|
|
pr_info("numa=fake=%d too large, reducing to %d\n",
|
|
|
|
nr_nodes, MAX_NUMNODES);
|
|
|
|
nr_nodes = MAX_NUMNODES;
|
|
|
|
}
|
|
|
|
|
2010-08-26 04:39:17 +08:00
|
|
|
size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
|
2009-09-26 06:20:09 +08:00
|
|
|
/*
|
|
|
|
* Calculate the number of big nodes that can be allocated as a result
|
|
|
|
* of consolidating the remainder.
|
|
|
|
*/
|
2010-02-16 05:43:25 +08:00
|
|
|
big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
|
2009-09-26 06:20:09 +08:00
|
|
|
FAKE_NODE_MIN_SIZE;
|
|
|
|
|
|
|
|
size &= FAKE_NODE_MIN_HASH_MASK;
|
|
|
|
if (!size) {
|
|
|
|
pr_err("Not enough memory for each node. "
|
|
|
|
"NUMA emulation disabled.\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2010-12-23 09:23:54 +08:00
|
|
|
for (i = 0; i < MAX_NUMNODES; i++)
|
2009-09-26 06:20:09 +08:00
|
|
|
if (physnodes[i].start != physnodes[i].end)
|
|
|
|
node_set(i, physnode_mask);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Continue to fill physical nodes with fake nodes until there is no
|
|
|
|
* memory left on any of them.
|
|
|
|
*/
|
|
|
|
while (nodes_weight(physnode_mask)) {
|
|
|
|
for_each_node_mask(i, physnode_mask) {
|
|
|
|
u64 end = physnodes[i].start + size;
|
|
|
|
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
|
|
|
|
|
|
|
|
if (ret < big)
|
|
|
|
end += FAKE_NODE_MIN_SIZE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Continue to add memory to this fake node if its
|
|
|
|
* non-reserved memory is less than the per-node size.
|
|
|
|
*/
|
|
|
|
while (end - physnodes[i].start -
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_hole_size(physnodes[i].start, end) < size) {
|
2009-09-26 06:20:09 +08:00
|
|
|
end += FAKE_NODE_MIN_SIZE;
|
|
|
|
if (end > physnodes[i].end) {
|
|
|
|
end = physnodes[i].end;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there won't be at least FAKE_NODE_MIN_SIZE of
|
|
|
|
* non-reserved memory in ZONE_DMA32 for the next node,
|
|
|
|
* this one must extend to the boundary.
|
|
|
|
*/
|
|
|
|
if (end < dma32_end && dma32_end - end -
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
|
2009-09-26 06:20:09 +08:00
|
|
|
end = dma32_end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there won't be enough non-reserved memory for the
|
|
|
|
* next node, this one must extend to the end of the
|
|
|
|
* physical node.
|
|
|
|
*/
|
|
|
|
if (physnodes[i].end - end -
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_hole_size(end, physnodes[i].end) < size)
|
2009-09-26 06:20:09 +08:00
|
|
|
end = physnodes[i].end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid allocating more nodes than requested, which can
|
|
|
|
* happen as a result of rounding down each node's size
|
|
|
|
* to FAKE_NODE_MIN_SIZE.
|
|
|
|
*/
|
|
|
|
if (nodes_weight(physnode_mask) + ret >= nr_nodes)
|
|
|
|
end = physnodes[i].end;
|
|
|
|
|
|
|
|
if (setup_node_range(ret++, &physnodes[i].start,
|
|
|
|
end - physnodes[i].start,
|
|
|
|
physnodes[i].end) < 0)
|
|
|
|
node_clear(i, physnode_mask);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-02-16 05:43:30 +08:00
|
|
|
/*
|
|
|
|
* Returns the end address of a node so that there is at least `size' amount of
|
|
|
|
* non-reserved memory or `max_addr' is reached.
|
|
|
|
*/
|
|
|
|
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
|
|
|
|
{
|
|
|
|
u64 end = start + size;
|
|
|
|
|
2010-08-26 04:39:17 +08:00
|
|
|
while (end - start - memblock_x86_hole_size(start, end) < size) {
|
2010-02-16 05:43:30 +08:00
|
|
|
end += FAKE_NODE_MIN_SIZE;
|
|
|
|
if (end > max_addr) {
|
|
|
|
end = max_addr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return end;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
|
|
|
|
* `addr' to `max_addr'. The return value is the number of nodes allocated.
|
|
|
|
*/
|
|
|
|
static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
|
|
|
|
{
|
|
|
|
nodemask_t physnode_mask = NODE_MASK_NONE;
|
|
|
|
u64 min_size;
|
|
|
|
int ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!size)
|
|
|
|
return -1;
|
|
|
|
/*
|
|
|
|
* The limit on emulated nodes is MAX_NUMNODES, so the size per node is
|
|
|
|
* increased accordingly if the requested size is too small. This
|
|
|
|
* creates a uniform distribution of node sizes across the entire
|
|
|
|
* machine (but not necessarily over physical nodes).
|
|
|
|
*/
|
2010-08-26 04:39:17 +08:00
|
|
|
min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
|
2010-02-16 05:43:30 +08:00
|
|
|
MAX_NUMNODES;
|
|
|
|
min_size = max(min_size, FAKE_NODE_MIN_SIZE);
|
|
|
|
if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
|
|
|
|
min_size = (min_size + FAKE_NODE_MIN_SIZE) &
|
|
|
|
FAKE_NODE_MIN_HASH_MASK;
|
|
|
|
if (size < min_size) {
|
|
|
|
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
|
|
|
|
size >> 20, min_size >> 20);
|
|
|
|
size = min_size;
|
|
|
|
}
|
|
|
|
size &= FAKE_NODE_MIN_HASH_MASK;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++)
|
|
|
|
if (physnodes[i].start != physnodes[i].end)
|
|
|
|
node_set(i, physnode_mask);
|
|
|
|
/*
|
|
|
|
* Fill physical nodes with fake nodes of size until there is no memory
|
|
|
|
* left on any of them.
|
|
|
|
*/
|
|
|
|
while (nodes_weight(physnode_mask)) {
|
|
|
|
for_each_node_mask(i, physnode_mask) {
|
|
|
|
u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
|
|
|
|
u64 end;
|
|
|
|
|
|
|
|
end = find_end_of_node(physnodes[i].start,
|
|
|
|
physnodes[i].end, size);
|
|
|
|
/*
|
|
|
|
* If there won't be at least FAKE_NODE_MIN_SIZE of
|
|
|
|
* non-reserved memory in ZONE_DMA32 for the next node,
|
|
|
|
* this one must extend to the boundary.
|
|
|
|
*/
|
|
|
|
if (end < dma32_end && dma32_end - end -
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
|
2010-02-16 05:43:30 +08:00
|
|
|
end = dma32_end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there won't be enough non-reserved memory for the
|
|
|
|
* next node, this one must extend to the end of the
|
|
|
|
* physical node.
|
|
|
|
*/
|
|
|
|
if (physnodes[i].end - end -
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_hole_size(end, physnodes[i].end) < size)
|
2010-02-16 05:43:30 +08:00
|
|
|
end = physnodes[i].end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup the fake node that will be allocated as bootmem
|
|
|
|
* later. If setup_node_range() returns non-zero, there
|
|
|
|
* is no more memory available on this physical node.
|
|
|
|
*/
|
|
|
|
if (setup_node_range(ret++, &physnodes[i].start,
|
|
|
|
end - physnodes[i].start,
|
|
|
|
physnodes[i].end) < 0)
|
|
|
|
node_clear(i, physnode_mask);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-05-03 01:27:09 +08:00
|
|
|
/*
|
2008-05-12 21:43:36 +08:00
|
|
|
* Sets up the system RAM area from start_pfn to last_pfn according to the
|
2007-05-03 01:27:09 +08:00
|
|
|
* numa=fake command-line option.
|
|
|
|
*/
|
2011-02-17 00:11:10 +08:00
|
|
|
static int __init numa_emulation(int acpi, int amd)
|
2007-05-03 01:27:09 +08:00
|
|
|
{
|
2011-02-17 00:11:08 +08:00
|
|
|
static struct numa_meminfo ei __initdata;
|
2011-02-17 00:11:10 +08:00
|
|
|
const u64 max_addr = max_pfn << PAGE_SHIFT;
|
2010-02-16 05:43:33 +08:00
|
|
|
int num_nodes;
|
|
|
|
int i;
|
2007-05-03 01:27:09 +08:00
|
|
|
|
2010-02-16 05:43:30 +08:00
|
|
|
/*
|
|
|
|
* If the numa=fake command-line contains a 'M' or 'G', it represents
|
2010-02-16 05:43:33 +08:00
|
|
|
* the fixed node size. Otherwise, if it is just a single number N,
|
|
|
|
* split the system RAM into N fake nodes.
|
2010-02-16 05:43:30 +08:00
|
|
|
*/
|
2011-02-17 00:11:10 +08:00
|
|
|
if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
|
2010-02-16 05:43:33 +08:00
|
|
|
u64 size;
|
|
|
|
|
2011-02-17 00:11:10 +08:00
|
|
|
size = memparse(emu_cmdline, &emu_cmdline);
|
|
|
|
num_nodes = split_nodes_size_interleave(0, max_addr, size);
|
2010-02-16 05:43:33 +08:00
|
|
|
} else {
|
|
|
|
unsigned long n;
|
2010-02-16 05:43:30 +08:00
|
|
|
|
2011-02-17 00:11:10 +08:00
|
|
|
n = simple_strtoul(emu_cmdline, NULL, 0);
|
|
|
|
num_nodes = split_nodes_interleave(0, max_addr, n);
|
2007-05-03 01:27:09 +08:00
|
|
|
}
|
|
|
|
|
2010-02-16 05:43:33 +08:00
|
|
|
if (num_nodes < 0)
|
|
|
|
return num_nodes;
|
2011-02-17 00:11:08 +08:00
|
|
|
|
2011-02-17 00:11:08 +08:00
|
|
|
ei.nr_blks = num_nodes;
|
|
|
|
for (i = 0; i < ei.nr_blks; i++) {
|
|
|
|
ei.blk[i].start = nodes[i].start;
|
|
|
|
ei.blk[i].end = nodes[i].end;
|
|
|
|
ei.blk[i].nid = i;
|
|
|
|
}
|
2011-02-17 00:11:08 +08:00
|
|
|
|
2011-02-17 00:11:08 +08:00
|
|
|
memnode_shift = compute_hash_shift(&ei);
|
2007-05-03 01:27:09 +08:00
|
|
|
if (memnode_shift < 0) {
|
|
|
|
memnode_shift = 0;
|
|
|
|
printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
|
|
|
|
"disabled.\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2009-09-26 06:20:09 +08:00
|
|
|
* We need to vacate all active ranges that may have been registered for
|
|
|
|
* the e820 memory map.
|
2007-05-03 01:27:09 +08:00
|
|
|
*/
|
|
|
|
remove_all_active_ranges();
|
x86-64, numa: Put pgtable to local node memory
Introduce init_memory_mapping_high(), and use it with 64bit.
It will go with every memory segment above 4g to create page table to the
memory range itself.
before this patch all page tables was on one node.
with this patch, one RED-PEN is killed
debug out for 8 sockets system after patch
[ 0.000000] initial memory mapped : 0 - 20000000
[ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[ 0.000000] 0000000000 - 007f600000 page 2M
[ 0.000000] 007f600000 - 007f750000 page 4k
[ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff]
[ 0.000000] RAMDISK: 7bc84000 - 7f745000
....
[ 0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used
[ 0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used
[ 0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used
[ 0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used
[ 0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used
[ 0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used
[ 0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used
[ 0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used
[ 0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used
[ 0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used
[ 0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff]
[ 0.000000] 0100000000 - 1080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff]
[ 0.000000] 1080000000 - 2080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff]
[ 0.000000] 2080000000 - 3080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff]
[ 0.000000] 3080000000 - 4080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff]
[ 0.000000] 4080000000 - 5080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff]
[ 0.000000] 5080000000 - 6080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff]
[ 0.000000] 6080000000 - 7080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff]
[ 0.000000] 7080000000 - 8080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff] PGTABLE
[ 0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff]
[ 0.000000] NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff]
[ 0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff]
[ 0.000000] NODE_DATA [0x0000207ffbb000-0x0000207ffbffff]
[ 0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff]
[ 0.000000] NODE_DATA [0x0000307ffbb000-0x0000307ffbffff]
[ 0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff]
[ 0.000000] NODE_DATA [0x0000407ffbb000-0x0000407ffbffff]
[ 0.000000] Initmem setup node 4 [0000004080000000-000000507fffffff]
[ 0.000000] NODE_DATA [0x0000507ffbb000-0x0000507ffbffff]
[ 0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff]
[ 0.000000] NODE_DATA [0x0000607ffbb000-0x0000607ffbffff]
[ 0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff]
[ 0.000000] NODE_DATA [0x0000707ffbb000-0x0000707ffbffff]
[ 0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff]
[ 0.000000] NODE_DATA [0x0000807ffba000-0x0000807ffbefff]
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D1933D1.9020609@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-28 08:48:17 +08:00
|
|
|
for_each_node_mask(i, node_possible_map)
|
2010-08-26 04:39:17 +08:00
|
|
|
memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
|
2006-09-27 16:49:52 +08:00
|
|
|
nodes[i].end >> PAGE_SHIFT);
|
x86-64, numa: Put pgtable to local node memory
Introduce init_memory_mapping_high(), and use it with 64bit.
It will go with every memory segment above 4g to create page table to the
memory range itself.
before this patch all page tables was on one node.
with this patch, one RED-PEN is killed
debug out for 8 sockets system after patch
[ 0.000000] initial memory mapped : 0 - 20000000
[ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[ 0.000000] 0000000000 - 007f600000 page 2M
[ 0.000000] 007f600000 - 007f750000 page 4k
[ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff]
[ 0.000000] RAMDISK: 7bc84000 - 7f745000
....
[ 0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used
[ 0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used
[ 0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used
[ 0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used
[ 0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used
[ 0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used
[ 0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used
[ 0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used
[ 0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used
[ 0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used
[ 0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff]
[ 0.000000] 0100000000 - 1080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff]
[ 0.000000] 1080000000 - 2080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff]
[ 0.000000] 2080000000 - 3080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff]
[ 0.000000] 3080000000 - 4080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff]
[ 0.000000] 4080000000 - 5080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff]
[ 0.000000] 5080000000 - 6080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff]
[ 0.000000] 6080000000 - 7080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff] PGTABLE
[ 0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff]
[ 0.000000] 7080000000 - 8080000000 page 2M
[ 0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff]
[ 0.000000] memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff] PGTABLE
[ 0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff]
[ 0.000000] NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff]
[ 0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff]
[ 0.000000] NODE_DATA [0x0000207ffbb000-0x0000207ffbffff]
[ 0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff]
[ 0.000000] NODE_DATA [0x0000307ffbb000-0x0000307ffbffff]
[ 0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff]
[ 0.000000] NODE_DATA [0x0000407ffbb000-0x0000407ffbffff]
[ 0.000000] Initmem setup node 4 [0000004080000000-000000507fffffff]
[ 0.000000] NODE_DATA [0x0000507ffbb000-0x0000507ffbffff]
[ 0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff]
[ 0.000000] NODE_DATA [0x0000607ffbb000-0x0000607ffbffff]
[ 0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff]
[ 0.000000] NODE_DATA [0x0000707ffbb000-0x0000707ffbffff]
[ 0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff]
[ 0.000000] NODE_DATA [0x0000807ffba000-0x0000807ffbefff]
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4D1933D1.9020609@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-28 08:48:17 +08:00
|
|
|
init_memory_mapping_high();
|
|
|
|
for_each_node_mask(i, node_possible_map)
|
2008-01-30 20:30:37 +08:00
|
|
|
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
|
2011-02-17 00:11:10 +08:00
|
|
|
setup_physnodes(0, max_addr);
|
2010-12-23 09:23:51 +08:00
|
|
|
fake_physnodes(acpi, amd, num_nodes);
|
2008-01-30 20:30:37 +08:00
|
|
|
numa_init_array();
|
2011-02-17 00:11:09 +08:00
|
|
|
numa_emu_dist = true;
|
2008-01-30 20:30:37 +08:00
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-05-03 01:27:09 +08:00
|
|
|
#endif /* CONFIG_NUMA_EMU */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-02-16 19:13:06 +08:00
|
|
|
static int dummy_numa_init(void)
|
|
|
|
{
|
2005-04-17 06:20:36 +08:00
|
|
|
printk(KERN_INFO "%s\n",
|
|
|
|
numa_off ? "NUMA turned off" : "No NUMA configuration found");
|
2008-01-30 20:30:37 +08:00
|
|
|
printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
|
2011-02-16 19:13:06 +08:00
|
|
|
0LU, max_pfn << PAGE_SHIFT);
|
2011-02-16 19:13:06 +08:00
|
|
|
|
2011-02-17 00:11:09 +08:00
|
|
|
node_set(0, numa_nodes_parsed);
|
2011-02-17 00:11:08 +08:00
|
|
|
numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
|
2011-02-16 19:13:07 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-02-16 19:13:06 +08:00
|
|
|
/*
 * Try the NUMA init methods in slot order and stop at the first one that
 * succeeds all the way through: slot 0 is x86_acpi_numa_init and slot 1 is
 * amd_numa_init (each only when configured and !numa_off), slot 2 is
 * dummy_numa_init.  Before every attempt the apicid->node table, the parsed/
 * possible/online node masks, numa_meminfo, the active ranges and the
 * distance table are reset.  BUG()s if every method fails.
 */
void __init initmem_init(void)
{
	int (*numa_init[])(void) = { [2] = dummy_numa_init };
	int method, apicid, cpu;

	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		numa_init[0] = x86_acpi_numa_init;
#endif
#ifdef CONFIG_AMD_NUMA
		numa_init[1] = amd_numa_init;
#endif
	}

	for (method = 0; method < ARRAY_SIZE(numa_init); method++) {
		if (!numa_init[method])
			continue;

		/* Start every attempt from a clean slate. */
		for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++)
			set_apicid_to_node(apicid, NUMA_NO_NODE);

		nodes_clear(numa_nodes_parsed);
		nodes_clear(node_possible_map);
		nodes_clear(node_online_map);
		memset(&numa_meminfo, 0, sizeof(numa_meminfo));
		remove_all_active_ranges();
		numa_reset_distance();

		/* Cleanup is only attempted when the method itself succeeded. */
		if (numa_init[method]() < 0 ||
		    numa_cleanup_meminfo(&numa_meminfo) < 0)
			continue;

#ifdef CONFIG_NUMA_EMU
		setup_physnodes(0, max_pfn << PAGE_SHIFT);
		/* Slot 0 is the ACPI method, slot 1 the AMD one. */
		if (emu_cmdline && !numa_emulation(method == 0, method == 1))
			return;
		/* No emulation in effect: redo the physnode setup and masks. */
		setup_physnodes(0, max_pfn << PAGE_SHIFT);
		nodes_clear(node_possible_map);
		nodes_clear(node_online_map);
#endif
		if (numa_register_memblks(&numa_meminfo) < 0)
			continue;

		/* Clear the node of any cpu whose node is not online. */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			int nid = early_cpu_to_node(cpu);

			if (nid != NUMA_NO_NODE && !node_online(nid))
				numa_clear_node(cpu);
		}
		numa_init_array();
		return;
	}
	BUG();
}
|
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
/*
 * Calls free_all_bootmem_node() for every online node, then
 * free_all_memory_core_early(MAX_NUMNODES), and returns the sum of the
 * values those calls return.
 */
unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long total = 0;
	int nid;

	for_each_online_node(nid)
		total += free_all_bootmem_node(NODE_DATA(nid));

	total += free_all_memory_core_early(MAX_NUMNODES);

	return total;
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-23 21:37:39 +08:00
|
|
|
/*
 * Look up the node recorded in __apicid_to_node[] for @cpu's early per-cpu
 * APIC id.  Returns NUMA_NO_NODE when that APIC id is BAD_APICID.
 */
int __cpuinit numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid == BAD_APICID)
		return NUMA_NO_NODE;

	return __apicid_to_node[apicid];
}
|
|
|
|
|
2006-01-12 05:45:36 +08:00
|
|
|
/*
|
2011-01-23 21:37:41 +08:00
|
|
|
* UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
|
|
|
|
* of 64bit specific data structures. The distinction is artificial and
|
|
|
|
* should be removed. numa_{add|remove}_cpu() are implemented in numa.c
|
|
|
|
* for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
|
|
|
|
* enabled.
|
2006-01-12 05:45:36 +08:00
|
|
|
*
|
2011-01-23 21:37:41 +08:00
|
|
|
* NUMA emulation is planned to be made generic and the following and other
|
|
|
|
* related code should be moved to numa.c.
|
2006-01-12 05:45:36 +08:00
|
|
|
*/
|
2011-01-23 21:37:41 +08:00
|
|
|
#ifdef CONFIG_NUMA_EMU
|
|
|
|
# ifndef CONFIG_DEBUG_PER_CPU_MAPS
|
2010-12-23 09:23:54 +08:00
|
|
|
/*
 * Add @cpu to the cpumask of every online (emulated) node whose start
 * address lies in the same physnodes[] range as the start address of the
 * cpu's own node.
 *
 * The cpu's node comes from numa_cpu_node(), falling back to
 * early_cpu_to_node(); it must be valid and online (BUG otherwise).
 */
void __cpuinit numa_add_cpu(int cpu)
{
	unsigned long addr;
	int physnid, nid;

	nid = numa_cpu_node(cpu);
	if (nid == NUMA_NO_NODE)
		nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	/*
	 * Use the starting address of the emulated node to find which physical
	 * node it is allocated on.
	 */
	addr = node_start_pfn(nid) << PAGE_SHIFT;
	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
		if (addr >= physnodes[physnid].start &&
		    addr < physnodes[physnid].end)
			break;

	/*
	 * No physnodes[] range contains addr.  Using physnid below would read
	 * physnodes[MAX_NUMNODES], one past the entries this file ever
	 * indexes, so map the cpu to its own node only.
	 */
	if (physnid == MAX_NUMNODES) {
		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
		return;
	}

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid) {
		addr = node_start_pfn(nid) << PAGE_SHIFT;
		if (addr >= physnodes[physnid].start &&
		    addr < physnodes[physnid].end)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
	}
}
|
|
|
|
|
|
|
|
/* Clear @cpu from the cpumask of every online node. */
void __cpuinit numa_remove_cpu(int cpu)
{
	int nid;

	for_each_online_node(nid) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
	}
}
|
2011-01-23 21:37:41 +08:00
|
|
|
# else /* !CONFIG_DEBUG_PER_CPU_MAPS */
|
2009-01-27 11:56:47 +08:00
|
|
|
/*
 * Debug-build helper behind numa_add_cpu()/numa_remove_cpu(): for every
 * online node whose start address lies inside physnodes[node], where node is
 * early_cpu_to_node(@cpu), set (@enable != 0) or clear (@enable == 0) @cpu in
 * the mask returned by debug_cpumask_set_cpu().
 *
 * NOTE(review): debug_cpumask_set_cpu() is not passed the loop node `i', so
 * whether the returned mask belongs to node i cannot be told from here -
 * confirm against its definition.
 */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
	int node = early_cpu_to_node(cpu);
	struct cpumask *mask;
	int i;

	/* early_cpu_to_node() already emits a warning and trace */
	if (node == NUMA_NO_NODE)
		return;

	for_each_online_node(i) {
		unsigned long addr = node_start_pfn(i) << PAGE_SHIFT;

		if (addr >= physnodes[node].start &&
		    addr < physnodes[node].end) {
			mask = debug_cpumask_set_cpu(cpu, enable);
			if (!mask)
				return;

			if (enable)
				cpumask_set_cpu(cpu, mask);
			else
				cpumask_clear_cpu(cpu, mask);
		}
	}
}
|
|
|
|
|
|
|
|
/* Debug-build variant: delegate to numa_set_cpumask() with enable set. */
void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, 1);
}
|
|
|
|
|
|
|
|
/* Debug-build variant: delegate to numa_set_cpumask() with enable cleared. */
void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, 0);
}
|
2011-01-23 21:37:41 +08:00
|
|
|
# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
|
|
|
|
#endif /* CONFIG_NUMA_EMU */
|