2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _I386_PGTABLE_H
|
|
|
|
#define _I386_PGTABLE_H
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The Linux memory management assumes a three-level page table setup. On
|
|
|
|
* the i386, we use that, but "fold" the mid level into the top-level page
|
|
|
|
* table, so that we physically have the same two-level page table as the
|
|
|
|
* i386 mmu expects.
|
|
|
|
*
|
|
|
|
* This file contains the functions and defines necessary to modify and use
|
|
|
|
* the i386 page table tree.
|
|
|
|
*/
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
#include <asm/processor.h>
|
|
|
|
#include <asm/fixmap.h>
|
|
|
|
#include <linux/threads.h>
|
2006-12-07 09:14:08 +08:00
|
|
|
#include <asm/paravirt.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-19 14:40:25 +08:00
|
|
|
#include <linux/bitops.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
|
2005-11-07 16:59:43 +08:00
|
|
|
struct mm_struct;
|
|
|
|
struct vm_area_struct;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern pgd_t swapper_pg_dir[1024];
|
2006-12-07 12:33:20 +08:00
|
|
|
extern struct kmem_cache *pmd_cache;
|
2007-05-13 02:15:24 +08:00
|
|
|
void check_pgt_cache(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-30 20:34:11 +08:00
|
|
|
/* Empty stub: the body performs no initialisation. */
static inline void pgtable_cache_init(void)
{
}
|
2005-04-17 06:20:36 +08:00
|
|
|
void paging_init(void);
|
|
|
|
|
2007-05-13 02:15:24 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* The Linux x86 paging architecture is 'compile-time dual-mode', it
|
|
|
|
* implements both the traditional 2-level x86 page tables and the
|
|
|
|
* newer 3-level PAE-mode page tables.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_X86_PAE
|
|
|
|
# include <asm/pgtable-3level-defs.h>
|
|
|
|
# define PMD_SIZE (1UL << PMD_SHIFT)
|
|
|
|
# define PMD_MASK (~(PMD_SIZE-1))
|
|
|
|
#else
|
|
|
|
# include <asm/pgtable-2level-defs.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
|
|
|
|
#define PGDIR_MASK (~(PGDIR_SIZE-1))
|
|
|
|
|
|
|
|
#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
|
|
|
|
#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
|
|
|
|
|
|
|
|
/* Just any arbitrary offset to the start of the vmalloc VM area: the
|
|
|
|
* current 8MB value just means that there will be an 8MB "hole" after the
|
|
|
|
* physical memory until the kernel virtual memory starts. That means that
|
|
|
|
* any out-of-bounds memory accesses will hopefully be caught.
|
|
|
|
* The vmalloc() routines leave a hole of 4kB between each vmalloced
|
|
|
|
* area for the same reason. ;)
|
|
|
|
*/
|
|
|
|
#define VMALLOC_OFFSET (8*1024*1024)
|
2007-07-16 14:38:19 +08:00
|
|
|
/*
 * Start of the vmalloc area: (high_memory + VMALLOC_OFFSET) rounded up to the
 * next multiple of VMALLOC_OFFSET.  The mask arithmetic assumes that
 * VMALLOC_OFFSET is a power of two.
 */
#define VMALLOC_START	(((unsigned long) high_memory + \
			2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1))
|
2008-02-05 14:28:33 +08:00
|
|
|
#ifdef CONFIG_X86_PAE
|
|
|
|
#define LAST_PKMAP 512
|
|
|
|
#else
|
|
|
|
#define LAST_PKMAP 1024
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE)
|
|
|
|
#else
|
|
|
|
# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Define this if things work differently on an i386 and an i486:
|
|
|
|
* it will (on an i486) warn about kernel memory accesses that are
|
2005-05-01 23:59:08 +08:00
|
|
|
* done without an 'access_ok(VERIFY_WRITE,..)'
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2005-05-01 23:59:08 +08:00
|
|
|
#undef TEST_ACCESS_OK
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* The boot page tables (all created as a single array) */
|
|
|
|
extern unsigned long pg0[];
|
|
|
|
|
|
|
|
#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
|
|
|
|
|
2005-10-30 09:16:27 +08:00
|
|
|
/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
|
|
|
|
#define pmd_none(x) (!(unsigned long)pmd_val(x))
|
2005-04-17 06:20:36 +08:00
|
|
|
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
|
x86: fix pmd_bad and pud_bad to support huge pages
I recently stumbled upon a problem in the support for huge pages. If a
program using huge pages does not explicitly unmap them, they remain
mapped (and therefore, are lost) after the program exits.
I observed that the free huge page count in /proc/meminfo decreased when
running my program, and it did not increase after the program exited.
After running the program a few times, no more huge pages could be
allocated.
The reason for this seems to be that the x86 pmd_bad and pud_bad
consider pmd/pud entries having the PSE bit set invalid. I think there
is nothing wrong with this bit being set, it just indicates that the
lowest level of translation has been reached. This bit has to be (and
is) checked after the basic validity of the entry has been checked, like
in this fragment from follow_page() in mm/memory.c:
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto no_page_table;
if (pmd_huge(*pmd)) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
Note that this code currently doesn't work as intended if the pmd refers
to a huge page, the pmd_huge() check can not be reached if the page is
huge.
Extending pmd_bad() (and, for future 1GB page support, pud_bad()) to
allow for the PSE bit being set fixes this. For similar reasons,
allowing the NX bit being set is necessary, too. I have seen huge pages
having the NX bit set in their pmd entry, which would cause the same
problem.
Signed-Off-By: Hans Rosenfeld <hans.rosenfeld@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-02-19 01:10:47 +08:00
|
|
|
/*
 * A pmd entry is "bad" when, after ignoring the address bits (PAGE_MASK) and
 * the _PAGE_USER, _PAGE_PSE and _PAGE_NX flags, the remaining bits are not
 * exactly _KERNPG_TABLE.
 */
#define pmd_bad(x)	((pmd_val(x) \
			  & ~(PAGE_MASK | _PAGE_USER | _PAGE_PSE | _PAGE_NX)) \
			 != _KERNPG_TABLE)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
|
|
|
|
#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
|
|
|
|
|
|
|
|
#ifdef CONFIG_X86_PAE
|
|
|
|
# include <asm/pgtable-3level.h>
|
|
|
|
#else
|
|
|
|
# include <asm/pgtable-2level.h>
|
|
|
|
#endif
|
|
|
|
|
2005-09-04 06:56:50 +08:00
|
|
|
/*
|
|
|
|
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
|
|
|
|
*
|
|
|
|
* dst - pointer to pgd range anywhere on a pgd page
|
|
|
|
* src - ""
|
|
|
|
* count - the number of pgds to copy.
|
|
|
|
*
|
|
|
|
* dst and src can be on the same page, but the range must not overlap,
|
|
|
|
* and must not cross a page boundary.
|
|
|
|
*/
|
|
|
|
/*
 * Copy `count` consecutive pgd entries from src to dst as raw bytes.
 * memcpy() is used, so the source and destination ranges must not overlap.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Macro to mark a page protection value as "uncacheable". On processors which do not support
|
|
|
|
* it, this is a no-op.
|
|
|
|
*/
|
|
|
|
/*
 * When boot_cpu_data.x86 > 3 the _PAGE_PCD and _PAGE_PWT bits are OR-ed into
 * the protection value; otherwise prot is returned unchanged.
 * Note: prot appears in both arms, so pass an expression without side effects.
 */
#define pgprot_noncached(prot)	((boot_cpu_data.x86 > 3)			\
	? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Conversion functions: convert a page and protection to a page entry,
|
|
|
|
* and a page entry and page directory to the page they refer to.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
|
|
|
|
*
|
|
|
|
* this macro returns the index of the entry in the pgd page which would
|
|
|
|
* control the given virtual address
|
|
|
|
*/
|
|
|
|
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
|
|
|
|
#define pgd_index_k(addr) pgd_index(addr)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pgd_offset() returns a (pgd_t *)
|
|
|
|
* pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
|
|
|
|
*/
|
|
|
|
#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* a shortcut which implies the use of the kernel's pgd, instead
|
|
|
|
* of a process's
|
|
|
|
*/
|
|
|
|
#define pgd_offset_k(address) pgd_offset(&init_mm, address)
|
|
|
|
|
2008-02-04 23:48:09 +08:00
|
|
|
/* Unconditionally returns 0 for any pud value passed in. */
static inline int pud_large(pud_t pud)
{
	return 0;
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
|
|
|
|
*
|
|
|
|
* this macro returns the index of the entry in the pmd page which would
|
|
|
|
* control the given virtual address
|
|
|
|
*/
|
|
|
|
/*
 * Drop the bits below PMD_SHIFT, then mask with PTRS_PER_PMD-1
 * (the mask assumes PTRS_PER_PMD is a power of two).
 */
#define pmd_index(address) \
	(((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
|
|
|
|
*
|
|
|
|
* this macro returns the index of the entry in the pte page which would
|
|
|
|
* control the given virtual address
|
|
|
|
*/
|
|
|
|
/*
 * Drop the in-page offset bits (PAGE_SHIFT), then mask with PTRS_PER_PTE - 1
 * (the mask assumes PTRS_PER_PTE is a power of two).
 */
#define pte_index(address) \
	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
|
|
|
|
/*
 * Pointer to the pte for `address` within the pte page referenced by the pmd
 * entry *dir: the page's virtual address from pmd_page_vaddr(), treated as a
 * pte_t array and indexed by pte_index(address).
 */
#define pte_offset_kernel(dir, address) \
	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-10-31 06:59:31 +08:00
|
|
|
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
|
|
|
|
|
2006-09-26 14:31:48 +08:00
|
|
|
/*
 * Mask the pmd value with PAGE_MASK, translate the result with __va() and
 * return it as an unsigned long.
 */
#define pmd_page_vaddr(pmd) \
	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Reach the pte for `address` in the pte page referenced by the pmd entry
 * *dir.
 *
 * With CONFIG_HIGHPTE the page is mapped through kmap_atomic_pte() using the
 * KM_PTE0 slot (KM_PTE1 for the _nested variant); pte_unmap() and
 * pte_unmap_nested() call kunmap_atomic() on the same slot.
 *
 * Without CONFIG_HIGHPTE the page is reached through page_address(), the
 * _nested variant is an alias of pte_offset_map(), and both unmap macros
 * expand to an empty statement.
 */
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
#define pte_offset_map_nested(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
#else
#define pte_offset_map(dir, address) \
	((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
#define pte_unmap(pte) do { } while (0)
#define pte_unmap_nested(pte) do { } while (0)
#endif
|
|
|
|
|
2006-10-01 14:29:35 +08:00
|
|
|
/* Clear a kernel PTE and flush it from the TLB */
|
|
|
|
/*
 * Clears the pte via pte_clear() against init_mm, then calls
 * __flush_tlb_one() for the same address.  vaddr is expanded twice, so pass
 * an expression without side effects.
 */
#define kpte_clear_flush(ptep, vaddr)			\
do {							\
	pte_clear(&init_mm, vaddr, ptep);		\
	__flush_tlb_one(vaddr);				\
} while (0)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* The i386 doesn't have any external MMU info: the kernel page
|
|
|
|
* tables contain all the necessary information.
|
|
|
|
*/
|
|
|
|
#define update_mmu_cache(vma,address,pte) do { } while (0)
|
[PATCH] i386: PARAVIRT: Hooks to set up initial pagetable
This patch introduces paravirt_ops hooks to control how the kernel's
initial pagetable is set up.
In the case of a native boot, the very early bootstrap code creates a
simple non-PAE pagetable to map the kernel and physical memory. When
the VM subsystem is initialized, it creates a proper pagetable which
respects the PAE mode, large pages, etc.
When booting under a hypervisor, there are many possibilities for what
paging environment the hypervisor establishes for the guest kernel, so
the construction of the kernel's pagetable depends on the hypervisor.
In the case of Xen, the hypervisor boots the kernel with a fully
constructed pagetable, which is already using PAE if necessary. Also,
Xen requires particular care when constructing pagetables to make sure
all pagetables are always mapped read-only.
In order to make this easier, kernel's initial pagetable construction
has been changed to only allocate and initialize a pagetable page if
there's no page already present in the pagetable. This allows the Xen
paravirt backend to make a copy of the hypervisor-provided pagetable,
allowing the kernel to establish any more mappings it needs while
keeping the existing ones.
A slightly subtle point which is worth highlighting here is that Xen
requires all kernel mappings to share the same pte_t pages between all
pagetables, so that updating a kernel page's mapping in one pagetable
is reflected in all other pagetables. This makes it possible to
allocate a page and attach it to a pagetable without having to
explicitly enumerate that page's mapping in all pagetables.
And:
+From: "Eric W. Biederman" <ebiederm@xmission.com>
If we don't set the leaf page table entries it is quite possible that
we will inherit an incorrect page table entry from the initial boot
page table setup in head.S. So we need to redo the effort here,
so we pick up PSE, PGE and the like.
Hypervisors like Xen require that their page tables be read-only,
which is slightly incompatible with our low identity mappings, however
I discussed this with Jeremy he has modified the Xen early set_pte
function to avoid problems in this area.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: William Irwin <bill.irwin@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
2007-05-03 01:27:13 +08:00
|
|
|
|
|
|
|
void native_pagetable_setup_start(pgd_t *base);
|
|
|
|
void native_pagetable_setup_done(pgd_t *base);
|
|
|
|
|
|
|
|
#ifndef CONFIG_PARAVIRT
|
|
|
|
/*
 * Compiled only when CONFIG_PARAVIRT is not set: passes `base` straight to
 * native_pagetable_setup_start().
 */
static inline void paravirt_pagetable_setup_start(pgd_t *base)
{
	native_pagetable_setup_start(base);
}
|
|
|
|
|
|
|
|
/*
 * Compiled only when CONFIG_PARAVIRT is not set: passes `base` straight to
 * native_pagetable_setup_done().
 */
static inline void paravirt_pagetable_setup_done(pgd_t *base)
{
	native_pagetable_setup_done(base);
}
|
|
|
|
#endif /* !CONFIG_PARAVIRT */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* !__ASSEMBLY__ */
|
|
|
|
|
2008-01-30 20:30:37 +08:00
|
|
|
/*
|
|
|
|
* kern_addr_valid() is (1) for FLATMEM and (0) for
|
|
|
|
* SPARSEMEM and DISCONTIGMEM
|
|
|
|
*/
|
2005-06-23 15:07:57 +08:00
|
|
|
#ifdef CONFIG_FLATMEM
|
2005-04-17 06:20:36 +08:00
|
|
|
#define kern_addr_valid(addr) (1)
|
2008-01-30 20:30:37 +08:00
|
|
|
#else
|
|
|
|
#define kern_addr_valid(kaddr) (0)
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Plain alias: forwards all five arguments unchanged to remap_pfn_range(). */
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
	remap_pfn_range(vma, vaddr, pfn, size, prot)
|
|
|
|
|
|
|
|
#endif /* _I386_PGTABLE_H */
|