linux_old1/arch/x86/mm/numa_64.c

990 lines
25 KiB
C
Raw Normal View History

/*
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
x86: Use memblock to replace early_res 1. replace find_e820_area with memblock_find_in_range 2. replace reserve_early with memblock_x86_reserve_range 3. replace free_early with memblock_x86_free_range. 4. NO_BOOTMEM will switch to use memblock too. 5. use _e820, _early wrap in the patch, in following patch, will replace them all 6. because memblock_x86_free_range support partial free, we can remove some special care 7. Need to make sure that memblock_find_in_range() is called after memblock_x86_fill() so adjust some calling later in setup.c::setup_arch() -- corruption_check and mptable_update -v2: Move reserve_brk() early Before fill_memblock_area, to avoid overlap between brk and memblock_find_in_range() that could happen We have more then 128 RAM entry in E820 tables, and memblock_x86_fill() could use memblock_find_in_range() to find a new place for memblock.memory.region array. and We don't need to use extend_brk() after fill_memblock_area() So move reserve_brk() early before fill_memblock_area(). -v3: Move find_smp_config early To make sure memblock_find_in_range not find wrong place, if BIOS doesn't put mptable in right place. -v4: Treat RESERVED_KERN as RAM in memblock.memory. and they are already in memblock.reserved already.. use __NOT_KEEP_MEMBLOCK to make sure memblock related code could be freed later. -v5: Generic version __memblock_find_in_range() is going from high to low, and for 32bit active_region for 32bit does include high pages need to replace the limit with memblock.default_alloc_limit, aka get_max_mapped() -v6: Use current_limit instead -v7: check with MEMBLOCK_ERROR instead of -1ULL or -1L -v8: Set memblock_can_resize early to handle EFI with more RAM entries -v9: update after kmemleak changes in mainline Suggested-by: David S. Miller <davem@davemloft.net> Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-08-26 04:39:17 +08:00
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/acpi.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/amd_nb.h>
struct numa_memblk {
u64 start;
u64 end;
int nid;
};
struct numa_meminfo {
int nr_blks;
struct numa_memblk blk[NR_NODE_MEMBLKS];
};
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
nodemask_t cpu_nodes_parsed __initdata;
nodemask_t mem_nodes_parsed __initdata;
struct memnode memnode;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;
static struct numa_meminfo numa_meminfo __initdata;
struct bootnode numa_nodes[MAX_NUMNODES] __initdata;
/*
* Given a shift value, try to populate memnodemap[]
* Returns :
* 1 if OK
* 0 if memnodmap[] too small (of shift too small)
* -1 if node overlap or lost ram (shift too big)
*/
static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
{
unsigned long addr, end;
int i, res = -1;
memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
for (i = 0; i < mi->nr_blks; i++) {
addr = mi->blk[i].start;
end = mi->blk[i].end;
if (addr >= end)
continue;
if ((end >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
return -1;
memnodemap[addr >> shift] = mi->blk[i].nid;
addr += (1UL << shift);
} while (addr < end);
res = 1;
}
return res;
}
static int __init allocate_cachealigned_memnodemap(void)
{
x86_64: make bootmap_start page align v6 boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! it turns out some variables near end of bss are corrupted already. in System.map we have ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end and setup_node_bootmem() will use that page 0xd40000 for bootmap Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() makes NODE_DATA cacheline aligned, and bootmap is page-aligned. the patch updates find_e820_area() to make sure we can meet the alignment constraints. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-02 00:49:41 +08:00
unsigned long addr;
memnodemap = memnode.embedded_map;
if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
return 0;
x86_64: make bootmap_start page align v6 boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! it turns out some variables near end of bss are corrupted already. in System.map we have ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end and setup_node_bootmem() will use that page 0xd40000 for bootmap Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() makes NODE_DATA cacheline aligned, and bootmap is page-aligned. the patch updates find_e820_area() to make sure we can meet the alignment constraints. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-02 00:49:41 +08:00
addr = 0x8000;
nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
x86_64: make bootmap_start page align v6 boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! it turns out some variables near end of bss are corrupted already. in System.map we have ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end and setup_node_bootmem() will use that page 0xd40000 for bootmap Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() makes NODE_DATA cacheline aligned, and bootmap is page-aligned. the patch updates find_e820_area() to make sure we can meet the alignment constraints. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-02 00:49:41 +08:00
nodemap_size, L1_CACHE_BYTES);
if (nodemap_addr == MEMBLOCK_ERROR) {
printk(KERN_ERR
"NUMA: Unable to allocate Memory to Node hash map\n");
nodemap_addr = nodemap_size = 0;
return -1;
}
x86_64: make bootmap_start page align v6 boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! it turns out some variables near end of bss are corrupted already. in System.map we have ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end and setup_node_bootmem() will use that page 0xd40000 for bootmap Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() makes NODE_DATA cacheline aligned, and bootmap is page-aligned. the patch updates find_e820_area() to make sure we can meet the alignment constraints. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-02 00:49:41 +08:00
memnodemap = phys_to_virt(nodemap_addr);
memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
nodemap_addr, nodemap_addr + nodemap_size);
return 0;
}
/*
* The LSB of all start and end addresses in the node map is the value of the
* maximum possible shift.
*/
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
{
int i, nodes_used = 0;
unsigned long start, end;
unsigned long bitfield = 0, memtop = 0;
for (i = 0; i < mi->nr_blks; i++) {
start = mi->blk[i].start;
end = mi->blk[i].end;
if (start >= end)
continue;
bitfield |= start;
nodes_used++;
if (end > memtop)
memtop = end;
}
if (nodes_used <= 1)
i = 63;
else
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
memnodemapsize = (memtop >> i)+1;
return i;
}
static int __init compute_hash_shift(const struct numa_meminfo *mi)
{
int shift;
shift = extract_lsb_from_nodes(mi);
if (allocate_cachealigned_memnodemap())
return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
if (populate_memnodemap(mi, shift) != 1) {
printk(KERN_INFO "Your memory is not aligned you need to "
"rebuild your kernel with a bigger NODEMAPSIZE "
"shift=%d\n", shift);
return -1;
}
return shift;
}
mm: clean up for early_pfn_to_nid() What's happening is that the assertion in mm/page_alloc.c:move_freepages() is triggering: BUG_ON(page_zone(start_page) != page_zone(end_page)); Once I knew this is what was happening, I added some annotations: if (unlikely(page_zone(start_page) != page_zone(end_page))) { printk(KERN_ERR "move_freepages: Bogus zones: " "start_page[%p] end_page[%p] zone[%p]\n", start_page, end_page, zone); printk(KERN_ERR "move_freepages: " "start_zone[%p] end_zone[%p]\n", page_zone(start_page), page_zone(end_page)); printk(KERN_ERR "move_freepages: " "start_pfn[0x%lx] end_pfn[0x%lx]\n", page_to_pfn(start_page), page_to_pfn(end_page)); printk(KERN_ERR "move_freepages: " "start_nid[%d] end_nid[%d]\n", page_to_nid(start_page), page_to_nid(end_page)); ... And here's what I got: move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00] move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00] move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff] move_freepages: start_nid[1] end_nid[0] My memory layout on this box is: [ 0.000000] Zone PFN ranges: [ 0.000000] Normal 0x00000000 -> 0x0081ff5d [ 0.000000] Movable zone start PFN for each node [ 0.000000] early_node_map[8] active PFN ranges [ 0.000000] 0: 0x00000000 -> 0x00020000 [ 0.000000] 1: 0x00800000 -> 0x0081f7ff [ 0.000000] 1: 0x0081f800 -> 0x0081fe50 [ 0.000000] 1: 0x0081fed1 -> 0x0081fed8 [ 0.000000] 1: 0x0081feda -> 0x0081fedb [ 0.000000] 1: 0x0081fedd -> 0x0081fee5 [ 0.000000] 1: 0x0081fee7 -> 0x0081ff51 [ 0.000000] 1: 0x0081ff59 -> 0x0081ff5d So it's a block move in that 0x81f600-->0x81f7ff region which triggers the problem. This patch: Declaration of early_pfn_to_nid() is scattered over per-arch include files, and it seems it's complicated to know when the declaration is used. I think it makes fix-for-memmap-init not easy. This patch moves all declaration to include/linux/mm.h After this, if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use static definition in include/linux/mm.h else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use generic definition in mm/page_alloc.c else -> per-arch back end function will be called. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reported-by: David Miller <davem@davemlloft.net> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: <stable@kernel.org> [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-02-19 06:48:32 +08:00
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
return phys_to_nid(pfn << PAGE_SHIFT);
}
static void * __init early_node_mem(int nodeid, unsigned long start,
x86_64: make bootmap_start page align v6 boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! it turns out some variables near end of bss are corrupted already. in System.map we have ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end and setup_node_bootmem() will use that page 0xd40000 for bootmap Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() makes NODE_DATA cacheline aligned, and bootmap is page-aligned. the patch updates find_e820_area() to make sure we can meet the alignment constraints. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-02 00:49:41 +08:00
unsigned long end, unsigned long size,
unsigned long align)
{
x86: Make early_node_mem get mem > 4 GB if possible So we could put pgdata for the node high, and later sparse vmmap will get the section nr that need. With this patch will make <4 GB ram not use a sparse vmmap. before this patch, will get, before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [10 - 96] [ 0.000000] free [b12 - 1000] [ 0.000000] free [359f - 38a3] [ 0.000000] free [38b5 - 3a00] [ 0.000000] free [41e01 - 42000] [ 0.000000] free [73dde - 73e00] [ 0.000000] free [73fdd - 74000] [ 0.000000] free [741dd - 74200] [ 0.000000] free [743dd - 74400] [ 0.000000] free [745dd - 74600] [ 0.000000] free [747dd - 74800] [ 0.000000] free [749dd - 74a00] [ 0.000000] free [74bdd - 74c00] [ 0.000000] free [74ddd - 74e00] [ 0.000000] free [74fdd - 75000] [ 0.000000] free [751dd - 75200] [ 0.000000] free [753dd - 75400] [ 0.000000] free [755dd - 75600] [ 0.000000] free [757dd - 75800] [ 0.000000] free [759dd - 75a00] [ 0.000000] free [75bdd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100000 - 2080000] [ 0.000000] total free 1f87170 [ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000 [ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000 with this patch will get: before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [a - 96] [ 0.000000] free [702 - 1000] [ 0.000000] free [359f - 3600] [ 0.000000] free [37de - 3800] [ 0.000000] free [39dd - 3a00] [ 0.000000] free [3bdd - 3c00] [ 0.000000] free [3ddd - 3e00] [ 0.000000] free [3fdd - 4000] [ 0.000000] free [41dd - 4200] [ 0.000000] free [43dd - 4400] [ 0.000000] free [45dd - 4600] [ 0.000000] free [47dd - 4800] [ 0.000000] free [49dd - 4a00] [ 0.000000] free [4bdd - 4c00] [ 0.000000] free [4ddd - 4e00] [ 0.000000] free [4fdd - 5000] [ 0.000000] free [51dd - 5200] [ 0.000000] free [53dd - 5400] [ 0.000000] free [55dd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100428 - 100600] [ 0.000000] free [13ea01 - 13ec00] [ 0.000000] free [170800 - 2080000] [ 0.000000] total free 1f87170 [ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB) [ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000 [ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000 so will get enough space below 4G, aka pfn 0x100000 Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-15-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-02-10 17:20:18 +08:00
unsigned long mem;
x86: Make early_node_mem get mem > 4 GB if possible So we could put pgdata for the node high, and later sparse vmmap will get the section nr that need. With this patch will make <4 GB ram not use a sparse vmmap. before this patch, will get, before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [10 - 96] [ 0.000000] free [b12 - 1000] [ 0.000000] free [359f - 38a3] [ 0.000000] free [38b5 - 3a00] [ 0.000000] free [41e01 - 42000] [ 0.000000] free [73dde - 73e00] [ 0.000000] free [73fdd - 74000] [ 0.000000] free [741dd - 74200] [ 0.000000] free [743dd - 74400] [ 0.000000] free [745dd - 74600] [ 0.000000] free [747dd - 74800] [ 0.000000] free [749dd - 74a00] [ 0.000000] free [74bdd - 74c00] [ 0.000000] free [74ddd - 74e00] [ 0.000000] free [74fdd - 75000] [ 0.000000] free [751dd - 75200] [ 0.000000] free [753dd - 75400] [ 0.000000] free [755dd - 75600] [ 0.000000] free [757dd - 75800] [ 0.000000] free [759dd - 75a00] [ 0.000000] free [75bdd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100000 - 2080000] [ 0.000000] total free 1f87170 [ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000 [ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000 with this patch will get: before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [a - 96] [ 0.000000] free [702 - 1000] [ 0.000000] free [359f - 3600] [ 0.000000] free [37de - 3800] [ 0.000000] free [39dd - 3a00] [ 0.000000] free [3bdd - 3c00] [ 0.000000] free [3ddd - 3e00] [ 0.000000] free [3fdd - 4000] [ 0.000000] free [41dd - 4200] [ 0.000000] free [43dd - 4400] [ 0.000000] free [45dd - 4600] [ 0.000000] free [47dd - 4800] [ 0.000000] free [49dd - 4a00] [ 0.000000] free [4bdd - 4c00] [ 0.000000] free [4ddd - 4e00] [ 0.000000] free [4fdd - 5000] [ 0.000000] free [51dd - 5200] [ 0.000000] free [53dd - 5400] [ 0.000000] free [55dd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100428 - 100600] [ 0.000000] free [13ea01 - 13ec00] [ 0.000000] free [170800 - 2080000] [ 0.000000] total free 1f87170 [ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB) [ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000 [ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000 so will get enough space below 4G, aka pfn 0x100000 Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-15-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-02-10 17:20:18 +08:00
/*
* put it on high as possible
* something will go with NODE_DATA
*/
if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
start = MAX_DMA_PFN<<PAGE_SHIFT;
if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
end > (MAX_DMA32_PFN<<PAGE_SHIFT))
start = MAX_DMA32_PFN<<PAGE_SHIFT;
x86: Use memblock to replace early_res 1. replace find_e820_area with memblock_find_in_range 2. replace reserve_early with memblock_x86_reserve_range 3. replace free_early with memblock_x86_free_range. 4. NO_BOOTMEM will switch to use memblock too. 5. use _e820, _early wrap in the patch, in following patch, will replace them all 6. because memblock_x86_free_range support partial free, we can remove some special care 7. Need to make sure that memblock_find_in_range() is called after memblock_x86_fill() so adjust some calling later in setup.c::setup_arch() -- corruption_check and mptable_update -v2: Move reserve_brk() early Before fill_memblock_area, to avoid overlap between brk and memblock_find_in_range() that could happen We have more then 128 RAM entry in E820 tables, and memblock_x86_fill() could use memblock_find_in_range() to find a new place for memblock.memory.region array. and We don't need to use extend_brk() after fill_memblock_area() So move reserve_brk() early before fill_memblock_area(). -v3: Move find_smp_config early To make sure memblock_find_in_range not find wrong place, if BIOS doesn't put mptable in right place. -v4: Treat RESERVED_KERN as RAM in memblock.memory. and they are already in memblock.reserved already.. use __NOT_KEEP_MEMBLOCK to make sure memblock related code could be freed later. -v5: Generic version __memblock_find_in_range() is going from high to low, and for 32bit active_region for 32bit does include high pages need to replace the limit with memblock.default_alloc_limit, aka get_max_mapped() -v6: Use current_limit instead -v7: check with MEMBLOCK_ERROR instead of -1ULL or -1L -v8: Set memblock_can_resize early to handle EFI with more RAM entries -v9: update after kmemleak changes in mainline Suggested-by: David S. Miller <davem@davemloft.net> Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-08-26 04:39:17 +08:00
mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
if (mem != MEMBLOCK_ERROR)
return __va(mem);
x86: Make early_node_mem get mem > 4 GB if possible So we could put pgdata for the node high, and later sparse vmmap will get the section nr that need. With this patch will make <4 GB ram not use a sparse vmmap. before this patch, will get, before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [10 - 96] [ 0.000000] free [b12 - 1000] [ 0.000000] free [359f - 38a3] [ 0.000000] free [38b5 - 3a00] [ 0.000000] free [41e01 - 42000] [ 0.000000] free [73dde - 73e00] [ 0.000000] free [73fdd - 74000] [ 0.000000] free [741dd - 74200] [ 0.000000] free [743dd - 74400] [ 0.000000] free [745dd - 74600] [ 0.000000] free [747dd - 74800] [ 0.000000] free [749dd - 74a00] [ 0.000000] free [74bdd - 74c00] [ 0.000000] free [74ddd - 74e00] [ 0.000000] free [74fdd - 75000] [ 0.000000] free [751dd - 75200] [ 0.000000] free [753dd - 75400] [ 0.000000] free [755dd - 75600] [ 0.000000] free [757dd - 75800] [ 0.000000] free [759dd - 75a00] [ 0.000000] free [75bdd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100000 - 2080000] [ 0.000000] total free 1f87170 [ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000 [ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000 with this patch will get: before swiotlb try get bootmem [ 0.000000] nid=1 start=0 end=2080000 aligned=1 [ 0.000000] free [a - 96] [ 0.000000] free [702 - 1000] [ 0.000000] free [359f - 3600] [ 0.000000] free [37de - 3800] [ 0.000000] free [39dd - 3a00] [ 0.000000] free [3bdd - 3c00] [ 0.000000] free [3ddd - 3e00] [ 0.000000] free [3fdd - 4000] [ 0.000000] free [41dd - 4200] [ 0.000000] free [43dd - 4400] [ 0.000000] free [45dd - 4600] [ 0.000000] free [47dd - 4800] [ 0.000000] free [49dd - 4a00] [ 0.000000] free [4bdd - 4c00] [ 0.000000] free [4ddd - 4e00] [ 0.000000] free [4fdd - 5000] [ 0.000000] free [51dd - 5200] [ 0.000000] free [53dd - 5400] [ 0.000000] free [55dd - 7bf5f] [ 0.000000] free [7f730 - 7f750] [ 0.000000] free [100428 - 100600] [ 0.000000] free [13ea01 - 13ec00] [ 0.000000] free [170800 - 2080000] [ 0.000000] total free 1f87170 [ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB) [ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000 [ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000 so will get enough space below 4G, aka pfn 0x100000 Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <1265793639-15071-15-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-02-10 17:20:18 +08:00
/* extend the search scope */
end = max_pfn_mapped << PAGE_SHIFT;
start = MAX_DMA_PFN << PAGE_SHIFT;
mem = memblock_find_in_range(start, end, size, align);
x86: Use memblock to replace early_res 1. replace find_e820_area with memblock_find_in_range 2. replace reserve_early with memblock_x86_reserve_range 3. replace free_early with memblock_x86_free_range. 4. NO_BOOTMEM will switch to use memblock too. 5. use _e820, _early wrap in the patch, in following patch, will replace them all 6. because memblock_x86_free_range support partial free, we can remove some special care 7. Need to make sure that memblock_find_in_range() is called after memblock_x86_fill() so adjust some calling later in setup.c::setup_arch() -- corruption_check and mptable_update -v2: Move reserve_brk() early Before fill_memblock_area, to avoid overlap between brk and memblock_find_in_range() that could happen We have more then 128 RAM entry in E820 tables, and memblock_x86_fill() could use memblock_find_in_range() to find a new place for memblock.memory.region array. and We don't need to use extend_brk() after fill_memblock_area() So move reserve_brk() early before fill_memblock_area(). -v3: Move find_smp_config early To make sure memblock_find_in_range not find wrong place, if BIOS doesn't put mptable in right place. -v4: Treat RESERVED_KERN as RAM in memblock.memory. and they are already in memblock.reserved already.. use __NOT_KEEP_MEMBLOCK to make sure memblock related code could be freed later. -v5: Generic version __memblock_find_in_range() is going from high to low, and for 32bit active_region for 32bit does include high pages need to replace the limit with memblock.default_alloc_limit, aka get_max_mapped() -v6: Use current_limit instead -v7: check with MEMBLOCK_ERROR instead of -1ULL or -1L -v8: Set memblock_can_resize early to handle EFI with more RAM entries -v9: update after kmemleak changes in mainline Suggested-by: David S. Miller <davem@davemloft.net> Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-08-26 04:39:17 +08:00
if (mem != MEMBLOCK_ERROR)
return __va(mem);
printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
size, nodeid);
return NULL;
}
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
struct numa_meminfo *mi = &numa_meminfo;
/* ignore zero length blks */
if (start == end)
return 0;
/* whine about and ignore invalid blks */
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
nid, start, end);
return 0;
}
if (mi->nr_blks >= NR_NODE_MEMBLKS) {
pr_err("NUMA: too many memblk ranges\n");
return -EINVAL;
}
mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
mi->nr_blks++;
return 0;
}
static void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
mi->nr_blks--;
memmove(&mi->blk[idx], &mi->blk[idx + 1],
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
unsigned long start_pfn, last_pfn, nodedata_phys;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
if (!end)
return;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
if (end && (end - start) < NODE_MIN_SIZE)
return;
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
start_pfn = start >> PAGE_SHIFT;
last_pfn = end >> PAGE_SHIFT;
x86_64: make bootmap_start page align v6 boot oopses when a system has 64 or 128 GB of RAM installed: Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711() BUG: unable to handle kernel NULL pointer dereference at 000000000000005f IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f PGD 0 Oops: 0000 [1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6 RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP: 0000:ffff810824c57e60 EFLAGS: 00010246 RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08 RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460 RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80 R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000) Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5 Call Trace: [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34 [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711 [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1 [<ffffffff8020ccf8>] ? child_rip+0xa/0x12 [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1 [<ffffffff8020ccee>] ? child_rip+0x0/0x12 Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0 RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f RSP <ffff810824c57e60> CR2: 000000000000005f ---[ end trace 02c2d78def82877a ]--- Kernel panic - not syncing: Attempted to kill init! it turns out some variables near end of bss are corrupted already. in System.map we have ffffffff80d40420 b rsi_table ffffffff80d40620 B krb5_seq_lock ffffffff80d40628 b i.20437 ffffffff80d40630 b xprt_rdma_inline_write_padding ffffffff80d40638 b sunrpc_table_header ffffffff80d40640 b zero ffffffff80d40644 b min_memreg ffffffff80d40648 b rpcrdma_tk_lock_g ffffffff80d40650 B sctp_assocs_id_lock ffffffff80d40658 B proc_net_sctp ffffffff80d40660 B sctp_assocs_id ffffffff80d40680 B sysctl_sctp_mem ffffffff80d40690 B sysctl_sctp_rmem ffffffff80d406a0 B sysctl_sctp_wmem ffffffff80d406b0 b sctp_ctl_socket ffffffff80d406b8 b sctp_pf_inet6_specific ffffffff80d406c0 b sctp_pf_inet_specific ffffffff80d406c8 b sctp_af_v4_specific ffffffff80d406d0 b sctp_af_v6_specific ffffffff80d406d8 b sctp_rand.33270 ffffffff80d406dc b sctp_memory_pressure ffffffff80d406e0 b sctp_sockets_allocated ffffffff80d406e4 b sctp_memory_allocated ffffffff80d406e8 b sctp_sysctl_header ffffffff80d406f0 b zero ffffffff80d406f4 A __bss_stop ffffffff80d406f4 A _end and setup_node_bootmem() will use that page 0xd40000 for bootmap Bootmem setup node 0 0000000000000000-0000000828000000 NODE_DATA [000000000008a485 - 0000000000091484] bootmap [0000000000d406f4 - 0000000000e456f3] pages 105 Bootmem setup node 1 0000000828000000-0000001028000000 NODE_DATA [0000000828000000 - 0000000828006fff] bootmap [0000000828007000 - 0000000828106fff] pages 100 Bootmem setup node 2 0000001028000000-0000001828000000 NODE_DATA [0000001028000000 - 0000001028006fff] bootmap [0000001028007000 - 0000001028106fff] pages 100 Bootmem setup node 3 0000001828000000-0000002028000000 NODE_DATA [0000001828000000 - 0000001828006fff] bootmap [0000001828007000 - 0000001828106fff] pages 100 setup_node_bootmem() makes NODE_DATA cacheline aligned, and bootmap is page-aligned. the patch updates find_e820_area() to make sure we can meet the alignment constraints. Signed-off-by: Yinghai Lu <yinghai.lu@sun.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-02 00:49:41 +08:00
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
SMP_CACHE_BYTES);
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
nid = phys_to_nid(nodedata_phys);
if (nid != nodeid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->node_id = nodeid;
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
node_set_online(nodeid);
}
static int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
const u64 low = 0;
const u64 high = (u64)max_pfn << PAGE_SHIFT;
int i, j, k;
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
/* make sure all blocks are inside the limits */
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
/* and there's no empty block */
if (bi->start == bi->end) {
numa_remove_memblk_from(i--, mi);
continue;
}
for (j = i + 1; j < mi->nr_blks; j++) {
struct numa_memblk *bj = &mi->blk[j];
unsigned long start, end;
/*
* See whether there are overlapping blocks. Whine
* about but allow overlaps of the same nid. They
* will be merged below.
*/
if (bi->end > bj->start && bi->start < bj->end) {
if (bi->nid != bj->nid) {
pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
bi->nid, bi->start, bi->end,
bj->nid, bj->start, bj->end);
return -EINVAL;
}
pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
bi->nid, bi->start, bi->end,
bj->start, bj->end);
}
/*
* Join together blocks on the same node, holes
* between which don't overlap with memory on other
* nodes.
*/
if (bi->nid != bj->nid)
continue;
start = max(min(bi->start, bj->start), low);
end = min(max(bi->end, bj->end), high);
for (k = 0; k < mi->nr_blks; k++) {
struct numa_memblk *bk = &mi->blk[k];
if (bi->nid == bk->nid)
continue;
if (start < bk->end && end > bk->start)
break;
}
if (k < mi->nr_blks)
continue;
printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
bi->nid, bi->start, bi->end, bj->start, bj->end,
start, end);
bi->start = start;
bi->end = end;
numa_remove_memblk_from(j--, mi);
}
}
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
mi->blk[i].start = mi->blk[i].end = 0;
mi->blk[i].nid = NUMA_NO_NODE;
}
return 0;
}
/*
* Sanity check to catch more bad NUMA configurations (they are amazingly
* common). Make sure the nodes cover all memory.
*/
static int __init nodes_cover_memory(const struct bootnode *nodes)
{
unsigned long numaram, e820ram;
int i;
numaram = 0;
for_each_node_mask(i, mem_nodes_parsed) {
unsigned long s = nodes[i].start >> PAGE_SHIFT;
unsigned long e = nodes[i].end >> PAGE_SHIFT;
numaram += e - s;
numaram -= __absent_pages_in_range(i, s, e);
if ((long)numaram < 0)
numaram = 0;
}
e820ram = max_pfn - (memblock_x86_hole_size(0,
max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
(numaram << PAGE_SHIFT) >> 20,
(e820ram << PAGE_SHIFT) >> 20);
return 0;
}
return 1;
}
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
int i;
/* Account for nodes with cpus and no memory */
nodes_or(node_possible_map, mem_nodes_parsed, cpu_nodes_parsed);
if (WARN_ON(nodes_empty(node_possible_map)))
return -EINVAL;
memnode_shift = compute_hash_shift(mi);
if (memnode_shift < 0) {
printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
return -EINVAL;
}
for (i = 0; i < mi->nr_blks; i++)
memblock_x86_register_active_regions(mi->blk[i].nid,
mi->blk[i].start >> PAGE_SHIFT,
mi->blk[i].end >> PAGE_SHIFT);
/* for out of order entries */
sort_node_map();
if (!nodes_cover_memory(numa_nodes))
return -EINVAL;
init_memory_mapping_high();
/* Finally register nodes. */
for_each_node_mask(i, node_possible_map)
setup_node_bootmem(i, numa_nodes[i].start, numa_nodes[i].end);
/*
* Try again in case setup_node_bootmem missed one due to missing
* bootmem.
*/
for_each_node_mask(i, node_possible_map)
if (!node_online(i))
setup_node_bootmem(i, numa_nodes[i].start,
numa_nodes[i].end);
return 0;
}
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
static struct bootnode nodes[MAX_NUMNODES] __initdata;
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
static char *cmdline __initdata;
void __init numa_emu_cmdline(char *str)
{
cmdline = str;
}
int __init find_node_by_addr(unsigned long addr)
{
int ret = NUMA_NO_NODE;
int i;
for_each_node_mask(i, mem_nodes_parsed) {
/*
* Find the real node that this emulated node appears on. For
* the sake of simplicity, we only use a real node's starting
* address to determine which emulated node it appears on.
*/
if (addr >= numa_nodes[i].start && addr < numa_nodes[i].end) {
ret = i;
break;
}
}
return ret;
}
static int __init setup_physnodes(unsigned long start, unsigned long end)
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
{
int ret = 0;
int i;
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
memset(physnodes, 0, sizeof(physnodes));
for_each_node_mask(i, mem_nodes_parsed) {
physnodes[i].start = numa_nodes[i].start;
physnodes[i].end = numa_nodes[i].end;
}
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
/*
* Basic sanity checking on the physical node map: there may be errors
* if the SRAT or AMD code incorrectly reported the topology or the mem=
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
* kernel parameter is used.
*/
x86, numa: Fix cpu to node mapping for sparse node ids NUMA boot code assumes that physical node ids start at 0, but the DIMMs that the apic id represents may not be reachable. If this is the case, node 0 is never online and cpus never end up getting appropriately assigned to a node. This causes the cpumask of all online nodes to be empty and machines crash with kernel code assuming online nodes have valid cpus. The fix is to appropriately map all the address ranges for physical nodes and ensure the cpu to node mapping function checks all possible nodes (up to MAX_NUMNODES) instead of simply checking nodes 0-N, where N is the number of physical nodes, for valid address ranges. This requires no longer "compressing" the address ranges of nodes in the physical node map from 0-N, but rather leave indices in physnodes[] to represent the actual node id of the physical node. Accordingly, the topology exported by both amd_get_nodes() and acpi_get_nodes() no longer must return the number of nodes to iterate through; all such iterations will now be to MAX_NUMNODES. This change also passes the end address of system RAM (which may be different from normal operation if mem= is specified on the command line) before the physnodes[] array is populated. ACPI parsed nodes are truncated to fit within the address range that respect the mem= boundaries and even some physical nodes may become unreachable in such cases. When NUMA emulation does succeed, any apicid to node mapping that exists for unreachable nodes are given default values so that proximity domains can still be assigned. This is important for node_distance() to function as desired. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221702090.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:56 +08:00
for (i = 0; i < MAX_NUMNODES; i++) {
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
if (physnodes[i].start == physnodes[i].end)
continue;
if (physnodes[i].start > end) {
physnodes[i].end = physnodes[i].start;
continue;
}
if (physnodes[i].end < start) {
physnodes[i].start = physnodes[i].end;
continue;
}
if (physnodes[i].start < start)
physnodes[i].start = start;
if (physnodes[i].end > end)
physnodes[i].end = end;
ret++;
}
/*
* If no physical topology was detected, a single node is faked to cover
* the entire address space.
*/
if (!ret) {
physnodes[ret].start = start;
physnodes[ret].end = end;
ret = 1;
}
return ret;
}
static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
{
int i;
BUG_ON(acpi && amd);
#ifdef CONFIG_ACPI_NUMA
if (acpi)
acpi_fake_nodes(nodes, nr_nodes);
#endif
#ifdef CONFIG_AMD_NUMA
if (amd)
amd_fake_nodes(nodes, nr_nodes);
#endif
if (!acpi && !amd)
for (i = 0; i < nr_cpu_ids; i++)
numa_set_node(i, 0);
}
/*
* Setups up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
* The return value is 0 if there is additional memory left for
* allocation past addr and -1 otherwise. addr is adjusted to be at
* the end of the node.
*/
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
{
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
int ret = 0;
nodes[nid].start = *addr;
*addr += size;
if (*addr >= max_addr) {
*addr = max_addr;
ret = -1;
}
nodes[nid].end = *addr;
node_set(nid, node_possible_map);
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
nodes[nid].start, nodes[nid].end,
(nodes[nid].end - nodes[nid].start) >> 20);
return ret;
}
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
/*
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
int big;
int ret = 0;
int i;
if (nr_nodes <= 0)
return -1;
if (nr_nodes > MAX_NUMNODES) {
pr_info("numa=fake=%d too large, reducing to %d\n",
nr_nodes, MAX_NUMNODES);
nr_nodes = MAX_NUMNODES;
}
size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the remainder.
*/
big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
FAKE_NODE_MIN_SIZE;
size &= FAKE_NODE_MIN_HASH_MASK;
if (!size) {
pr_err("Not enough memory for each node. "
"NUMA emulation disabled.\n");
return -1;
}
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
for (i = 0; i < MAX_NUMNODES; i++)
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
/*
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 end = physnodes[i].start + size;
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
if (ret < big)
end += FAKE_NODE_MIN_SIZE;
/*
* Continue to add memory to this fake node if its
* non-reserved memory is less than the per-node size.
*/
while (end - physnodes[i].start -
memblock_x86_hole_size(physnodes[i].start, end) < size) {
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
end += FAKE_NODE_MIN_SIZE;
if (end > physnodes[i].end) {
end = physnodes[i].end;
break;
}
}
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
end = dma32_end;
/*
* If there won't be enough non-reserved memory for the
* next node, this one must extend to the end of the
* physical node.
*/
if (physnodes[i].end - end -
memblock_x86_hole_size(end, physnodes[i].end) < size)
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
end = physnodes[i].end;
/*
* Avoid allocating more nodes than requested, which can
* happen as a result of rounding down each node's size
* to FAKE_NODE_MIN_SIZE.
*/
if (nodes_weight(physnode_mask) + ret >= nr_nodes)
end = physnodes[i].end;
if (setup_node_range(ret++, &physnodes[i].start,
end - physnodes[i].start,
physnodes[i].end) < 0)
node_clear(i, physnode_mask);
}
}
return ret;
}
/*
* Returns the end address of a node so that there is at least `size' amount of
* non-reserved memory or `max_addr' is reached.
*/
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
u64 end = start + size;
while (end - start - memblock_x86_hole_size(start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
end = max_addr;
break;
}
}
return end;
}
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
* `addr' to `max_addr'. The return value is the number of nodes allocated.
*/
static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 min_size;
int ret = 0;
int i;
if (!size)
return -1;
/*
* The limit on emulated nodes is MAX_NUMNODES, so the size per node is
* increased accordingly if the requested size is too small. This
* creates a uniform distribution of node sizes across the entire
* machine (but not necessarily over physical nodes).
*/
min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
MAX_NUMNODES;
min_size = max(min_size, FAKE_NODE_MIN_SIZE);
if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
min_size = (min_size + FAKE_NODE_MIN_SIZE) &
FAKE_NODE_MIN_HASH_MASK;
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
size >> 20, min_size >> 20);
size = min_size;
}
size &= FAKE_NODE_MIN_HASH_MASK;
for (i = 0; i < MAX_NUMNODES; i++)
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
/*
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
u64 end;
end = find_end_of_node(physnodes[i].start,
physnodes[i].end, size);
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
* If there won't be enough non-reserved memory for the
* next node, this one must extend to the end of the
* physical node.
*/
if (physnodes[i].end - end -
memblock_x86_hole_size(end, physnodes[i].end) < size)
end = physnodes[i].end;
/*
* Setup the fake node that will be allocated as bootmem
* later. If setup_node_range() returns non-zero, there
* is no more memory available on this physical node.
*/
if (setup_node_range(ret++, &physnodes[i].start,
end - physnodes[i].start,
physnodes[i].end) < 0)
node_clear(i, physnode_mask);
}
}
return ret;
}
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
/*
* Sets up the system RAM area from start_pfn to last_pfn according to the
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
* numa=fake command-line option.
*/
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
static int __init numa_emulation(unsigned long start_pfn,
unsigned long last_pfn, int acpi, int amd)
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
{
static struct numa_meminfo ei __initdata;
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
int num_nodes;
int i;
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
* split the system RAM into N fake nodes.
*/
if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
u64 size;
size = memparse(cmdline, &cmdline);
num_nodes = split_nodes_size_interleave(addr, max_addr, size);
} else {
unsigned long n;
n = simple_strtoul(cmdline, NULL, 0);
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
num_nodes = split_nodes_interleave(addr, max_addr, n);
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
}
if (num_nodes < 0)
return num_nodes;
ei.nr_blks = num_nodes;
for (i = 0; i < ei.nr_blks; i++) {
ei.blk[i].start = nodes[i].start;
ei.blk[i].end = nodes[i].end;
ei.blk[i].nid = i;
}
memnode_shift = compute_hash_shift(&ei);
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
if (memnode_shift < 0) {
memnode_shift = 0;
printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
"disabled.\n");
return -1;
}
/*
x86: Interleave emulated nodes over physical nodes Add interleaved NUMA emulation support This patch interleaves emulated nodes over the system's physical nodes. This is required for interleave optimizations since mempolicies, for example, operate by iterating over a nodemask and act without knowledge of node distances. It can also be used for testing memory latencies and NUMA bugs in the kernel. There're a couple of ways to do this: - divide the number of emulated nodes by the number of physical nodes and allocate the result on each physical node, or - allocate each successive emulated node on a different physical node until all memory is exhausted. The disadvantage of the first option is, depending on the asymmetry in node capacities of each physical node, emulated nodes may substantially differ in size on a particular physical node compared to another. The disadvantage of the second option is, also depending on the asymmetry in node capacities of each physical node, there may be more emulated nodes allocated on a single physical node as another. This patch implements the second option; we sacrifice the possibility that we may have slightly more emulated nodes on a particular physical node compared to another in lieu of node size asymmetry. [ Note that "node capacity" of a physical node is not only a function of its addressable range, but also is affected by subtracting out the amount of reserved memory over that range. NUMA emulation only deals with available, non-reserved memory quantities. ] We ensure there is at least a minimal amount of available memory allocated to each node. We also make sure that at least this amount of available memory is available in ZONE_DMA32 for any node that includes both ZONE_DMA32 and ZONE_NORMAL. This patch also cleans the emulation code up by no longer passing the statically allocated struct bootnode array among the various functions. This init.data array is not allocated on the stack since it may be very large and thus it may be accessed at file scope. The WARN_ON() for nodes_cover_memory() when faking proximity domains is removed since it relies on successive nodes always having greater start addresses than previous nodes; with interleaving this is no longer always true. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Ankita Garg <ankita@in.ibm.com> Cc: Len Brown <len.brown@intel.com> LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-26 06:20:09 +08:00
* We need to vacate all active ranges that may have been registered for
* the e820 memory map.
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
*/
remove_all_active_ranges();
x86-64, numa: Put pgtable to local node memory Introduce init_memory_mapping_high(), and use it with 64bit. It will go with every memory segment above 4g to create page table to the memory range itself. before this patch all page tables was on one node. with this patch, one RED-PEN is killed debug out for 8 sockets system after patch [ 0.000000] initial memory mapped : 0 - 20000000 [ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff] [ 0.000000] 0000000000 - 007f600000 page 2M [ 0.000000] 007f600000 - 007f750000 page 4k [ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff] [ 0.000000] RAMDISK: 7bc84000 - 7f745000 .... [ 0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used [ 0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used [ 0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used [ 0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used [ 0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used [ 0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used [ 0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used [ 0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used [ 0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used [ 0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used [ 0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff] [ 0.000000] 0100000000 - 1080000000 page 2M [ 0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff] [ 0.000000] memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff] [ 0.000000] 1080000000 - 2080000000 page 2M [ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff] [ 0.000000] memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff] [ 0.000000] 2080000000 - 3080000000 page 2M [ 0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff] [ 0.000000] memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff] [ 0.000000] 3080000000 - 4080000000 page 2M [ 0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff] [ 0.000000] memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff] [ 0.000000] 4080000000 - 5080000000 page 2M [ 0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff] [ 0.000000] memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff] [ 0.000000] 5080000000 - 6080000000 page 2M [ 0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff] [ 0.000000] memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff] [ 0.000000] 6080000000 - 7080000000 page 2M [ 0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff] [ 0.000000] memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff] [ 0.000000] 7080000000 - 8080000000 page 2M [ 0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff] [ 0.000000] memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff] PGTABLE [ 0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff] [ 0.000000] NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff] [ 0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff] [ 0.000000] NODE_DATA [0x0000207ffbb000-0x0000207ffbffff] [ 0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff] [ 0.000000] NODE_DATA [0x0000307ffbb000-0x0000307ffbffff] [ 0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff] [ 0.000000] NODE_DATA [0x0000407ffbb000-0x0000407ffbffff] [ 0.000000] Initmem setup node 4 [0000004080000000-000000507fffffff] [ 0.000000] NODE_DATA [0x0000507ffbb000-0x0000507ffbffff] [ 0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff] [ 0.000000] NODE_DATA [0x0000607ffbb000-0x0000607ffbffff] [ 0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff] [ 0.000000] NODE_DATA [0x0000707ffbb000-0x0000707ffbffff] [ 0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff] [ 0.000000] NODE_DATA [0x0000807ffba000-0x0000807ffbefff] Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4D1933D1.9020609@kernel.org> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-28 08:48:17 +08:00
for_each_node_mask(i, node_possible_map)
memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
x86-64, numa: Put pgtable to local node memory Introduce init_memory_mapping_high(), and use it with 64bit. It will go with every memory segment above 4g to create page table to the memory range itself. before this patch all page tables was on one node. with this patch, one RED-PEN is killed debug out for 8 sockets system after patch [ 0.000000] initial memory mapped : 0 - 20000000 [ 0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff] [ 0.000000] 0000000000 - 007f600000 page 2M [ 0.000000] 007f600000 - 007f750000 page 4k [ 0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff] [ 0.000000] RAMDISK: 7bc84000 - 7f745000 .... [ 0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used [ 0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used [ 0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used [ 0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used [ 0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used [ 0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used [ 0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used [ 0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used [ 0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used [ 0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used [ 0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff] [ 0.000000] 0100000000 - 1080000000 page 2M [ 0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff] [ 0.000000] memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff] [ 0.000000] 1080000000 - 2080000000 page 2M [ 0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff] [ 0.000000] memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff] [ 0.000000] 2080000000 - 3080000000 page 2M [ 0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff] [ 0.000000] memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff] [ 0.000000] 3080000000 - 4080000000 page 2M [ 0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff] [ 0.000000] memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff] [ 0.000000] 4080000000 - 5080000000 page 2M [ 0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff] [ 0.000000] memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff] [ 0.000000] 5080000000 - 6080000000 page 2M [ 0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff] [ 0.000000] memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff] [ 0.000000] 6080000000 - 7080000000 page 2M [ 0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff] [ 0.000000] memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff] PGTABLE [ 0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff] [ 0.000000] 7080000000 - 8080000000 page 2M [ 0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff] [ 0.000000] memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff] PGTABLE [ 0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff] [ 0.000000] NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff] [ 0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff] [ 0.000000] NODE_DATA [0x0000207ffbb000-0x0000207ffbffff] [ 0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff] [ 0.000000] NODE_DATA [0x0000307ffbb000-0x0000307ffbffff] [ 0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff] [ 0.000000] NODE_DATA [0x0000407ffbb000-0x0000407ffbffff] [ 0.000000] Initmem setup node 4 [0000004080000000-000000507fffffff] [ 0.000000] NODE_DATA [0x0000507ffbb000-0x0000507ffbffff] [ 0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff] [ 0.000000] NODE_DATA [0x0000607ffbb000-0x0000607ffbffff] [ 0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff] [ 0.000000] NODE_DATA [0x0000707ffbb000-0x0000707ffbffff] [ 0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff] [ 0.000000] NODE_DATA [0x0000807ffba000-0x0000807ffbefff] Signed-off-by: Yinghai Lu <yinghai@kernel.org> LKML-Reference: <4D1933D1.9020609@kernel.org> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-28 08:48:17 +08:00
init_memory_mapping_high();
for_each_node_mask(i, node_possible_map)
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
setup_physnodes(addr, max_addr);
fake_physnodes(acpi, amd, num_nodes);
numa_init_array();
return 0;
}
[PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-03 01:27:09 +08:00
#endif /* CONFIG_NUMA_EMU */
static int dummy_numa_init(void)
{
printk(KERN_INFO "%s\n",
numa_off ? "NUMA turned off" : "No NUMA configuration found");
printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
0LU, max_pfn << PAGE_SHIFT);
node_set(0, cpu_nodes_parsed);
node_set(0, mem_nodes_parsed);
numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
numa_nodes[0].start = 0;
numa_nodes[0].end = (u64)max_pfn << PAGE_SHIFT;
return 0;
}
void __init initmem_init(void)
{
int (*numa_init[])(void) = { [2] = dummy_numa_init };
int i, j;
if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
numa_init[0] = x86_acpi_numa_init;
#endif
#ifdef CONFIG_AMD_NUMA
numa_init[1] = amd_numa_init;
#endif
}
for (i = 0; i < ARRAY_SIZE(numa_init); i++) {
if (!numa_init[i])
continue;
for (j = 0; j < MAX_LOCAL_APIC; j++)
set_apicid_to_node(j, NUMA_NO_NODE);
nodes_clear(cpu_nodes_parsed);
nodes_clear(mem_nodes_parsed);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
memset(numa_nodes, 0, sizeof(numa_nodes));
remove_all_active_ranges();
if (numa_init[i]() < 0)
continue;
if (numa_cleanup_meminfo(&numa_meminfo) < 0)
continue;
#ifdef CONFIG_NUMA_EMU
setup_physnodes(0, max_pfn << PAGE_SHIFT);
if (cmdline && !numa_emulation(0, max_pfn, i == 0, i == 1))
return;
setup_physnodes(0, max_pfn << PAGE_SHIFT);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
if (numa_register_memblks(&numa_meminfo) < 0)
continue;
for (j = 0; j < nr_cpu_ids; j++) {
int nid = early_cpu_to_node(j);
if (nid == NUMA_NO_NODE)
continue;
if (!node_online(nid))
numa_clear_node(j);
}
numa_init_array();
return;
}
BUG();
}
unsigned long __init numa_free_all_bootmem(void)
{
unsigned long pages = 0;
int i;
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
pages += free_all_memory_core_early(MAX_NUMNODES);
return pages;
}
int __cpuinit numa_cpu_node(int cpu)
{
int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
if (apicid != BAD_APICID)
return __apicid_to_node[apicid];
return NUMA_NO_NODE;
}
/*
* UGLINESS AHEAD: Currently, CONFIG_NUMA_EMU is 64bit only and makes use
* of 64bit specific data structures. The distinction is artificial and
* should be removed. numa_{add|remove}_cpu() are implemented in numa.c
* for both 32 and 64bit when CONFIG_NUMA_EMU is disabled but here when
* enabled.
*
* NUMA emulation is planned to be made generic and the following and other
* related code should be moved to numa.c.
*/
#ifdef CONFIG_NUMA_EMU
# ifndef CONFIG_DEBUG_PER_CPU_MAPS
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
void __cpuinit numa_add_cpu(int cpu)
{
unsigned long addr;
int physnid, nid;
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
nid = numa_cpu_node(cpu);
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
if (nid == NUMA_NO_NODE)
nid = early_cpu_to_node(cpu);
BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
/*
* Use the starting address of the emulated node to find which physical
* node it is allocated on.
*/
addr = node_start_pfn(nid) << PAGE_SHIFT;
for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
if (addr >= physnodes[physnid].start &&
addr < physnodes[physnid].end)
break;
/*
* Map the cpu to each emulated node that is allocated on the physical
* node of the cpu's apic id.
*/
for_each_online_node(nid) {
addr = node_start_pfn(nid) << PAGE_SHIFT;
if (addr >= physnodes[physnid].start &&
addr < physnodes[physnid].end)
cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}
}
void __cpuinit numa_remove_cpu(int cpu)
{
int i;
for_each_online_node(i)
cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
# else /* !CONFIG_DEBUG_PER_CPU_MAPS */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
int node = early_cpu_to_node(cpu);
struct cpumask *mask;
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
int i;
if (node == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
return;
}
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
for_each_online_node(i) {
unsigned long addr;
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
addr = node_start_pfn(i) << PAGE_SHIFT;
if (addr < physnodes[node].start ||
addr >= physnodes[node].end)
continue;
mask = debug_cpumask_set_cpu(cpu, enable);
if (!mask)
x86, numa: Fake node-to-cpumask for NUMA emulation It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents. This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online. Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks. This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data. To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes. Signed-off-by: David Rientjes <rientjes@google.com> LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2010-12-23 09:23:54 +08:00
return;
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
}
}
void __cpuinit numa_add_cpu(int cpu)
{
numa_set_cpumask(cpu, 1);
}
void __cpuinit numa_remove_cpu(int cpu)
{
numa_set_cpumask(cpu, 0);
}
# endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
#endif /* CONFIG_NUMA_EMU */