linux/arch/arm64/mm/fault.c

708 lines
20 KiB
C
Raw Normal View History

/*
* Based on arch/arm/mm/fault.c
*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 1995-2004 Russell King
* Copyright (C) 2012 ARM Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/extable.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/preempt.h>
#include <asm/bug.h>
#include <asm/cpufeature.h>
#include <asm/exception.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
static const char *fault_name(unsigned int esr);
arm64: Kprobes with single stepping support Add support for basic kernel probes(kprobes) and jump probes (jprobes) for ARM64. Kprobes utilizes software breakpoint and single step debug exceptions supported on ARM v8. A software breakpoint is placed at the probe address to trap the kernel execution into the kprobe handler. ARM v8 supports enabling single stepping before the break exception return (ERET), with next PC in exception return address (ELR_EL1). The kprobe handler prepares an executable memory slot for out-of-line execution with a copy of the original instruction being probed, and enables single stepping. The PC is set to the out-of-line slot address before the ERET. With this scheme, the instruction is executed with the exact same register context except for the PC (and DAIF) registers. Debug mask (PSTATE.D) is enabled only when single stepping a recursive kprobe, e.g.: during kprobes reenter so that probed instruction can be single stepped within the kprobe handler -exception- context. The recursion depth of kprobe is always 2, i.e. upon probe re-entry, any further re-entry is prevented by not calling handlers and the case counted as a missed kprobe). Single stepping from the x-o-l slot has a drawback for PC-relative accesses like branching and symbolic literals access as the offset from the new PC (slot address) may not be ensured to fit in the immediate value of the opcode. Such instructions need simulation, so reject probing them. Instructions generating exceptions or cpu mode change are rejected for probing. Exclusive load/store instructions are rejected too. Additionally, the code is checked to see if it is inside an exclusive load/store sequence (code from Pratyush). System instructions are mostly enabled for stepping, except MSR/MRS accesses to "DAIF" flags in PSTATE, which are not safe for probing. This also changes arch/arm64/include/asm/ptrace.h to use include/asm-generic/ptrace.h. Thanks to Steve Capper and Pratyush Anand for several suggested Changes. Signed-off-by: Sandeepa Prabhu <sandeepa.s.prabhu@gmail.com> Signed-off-by: David A. Long <dave.long@linaro.org> Signed-off-by: Pratyush Anand <panand@redhat.com> Acked-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-07-09 00:35:48 +08:00
#ifdef CONFIG_KPROBES
static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
{
int ret = 0;
/* kprobe_running() needs smp_processor_id() */
if (!user_mode(regs)) {
preempt_disable();
if (kprobe_running() && kprobe_fault_handler(regs, esr))
ret = 1;
preempt_enable();
}
return ret;
}
#else
static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
{
return 0;
}
#endif
/*
* Dump out the page tables associated with 'addr' in mm 'mm'.
*/
void show_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
if (!mm)
mm = &init_mm;
pr_alert("pgd = %p\n", mm->pgd);
pgd = pgd_offset(mm, addr);
pr_alert("[%08lx] *pgd=%016llx", addr, pgd_val(*pgd));
do {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (pgd_none(*pgd) || pgd_bad(*pgd))
break;
pud = pud_offset(pgd, addr);
arm64: mm: Implement 4 levels of translation tables This patch implements 4 levels of translation tables since 3 levels of page tables with 4KB pages cannot support 40-bit physical address space described in [1] due to the following issue. It is a restriction that kernel logical memory map with 4KB + 3 levels (0xffffffc000000000-0xffffffffffffffff) cannot cover RAM region from 544GB to 1024GB in [1]. Specifically, ARM64 kernel fails to create mapping for this region in map_mem function since __phys_to_virt for this region reaches to address overflow. If SoC design follows the document, [1], over 32GB RAM would be placed from 544GB. Even 64GB system is supposed to use the region from 544GB to 576GB for only 32GB RAM. Naturally, it would reach to enable 4 levels of page tables to avoid hacking __virt_to_phys and __phys_to_virt. However, it is recommended 4 levels of page table should be only enabled if memory map is too sparse or there is about 512GB RAM. References ---------- [1]: Principles of ARM Memory Maps, White Paper, Issue C Signed-off-by: Jungseok Lee <jays.lee@samsung.com> Reviewed-by: Sungjinn Chung <sungjinn.chung@samsung.com> Acked-by: Kukjin Kim <kgene.kim@samsung.com> Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org> Reviewed-by: Steve Capper <steve.capper@linaro.org> [catalin.marinas@arm.com: MEMBLOCK_INITIAL_LIMIT removed, same as PUD_SIZE] [catalin.marinas@arm.com: early_ioremap_init() updated for 4 levels] [catalin.marinas@arm.com: 48-bit VA depends on BROKEN until KVM is fixed] Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Tested-by: Jungseok Lee <jungseoklee85@gmail.com>
2014-05-12 17:40:51 +08:00
printk(", *pud=%016llx", pud_val(*pud));
if (pud_none(*pud) || pud_bad(*pud))
break;
pmd = pmd_offset(pud, addr);
printk(", *pmd=%016llx", pmd_val(*pmd));
if (pmd_none(*pmd) || pmd_bad(*pmd))
break;
pte = pte_offset_map(pmd, addr);
printk(", *pte=%016llx", pte_val(*pte));
pte_unmap(pte);
} while(0);
printk("\n");
}
arm64: Implement ptep_set_access_flags() for hardware AF/DBM When hardware updates of the access and dirty states are enabled, the default ptep_set_access_flags() implementation based on calling set_pte_at() directly is potentially racy. This triggers the "racy dirty state clearing" warning in set_pte_at() because an existing writable PTE is overridden with a clean entry. There are two main scenarios for this situation: 1. The CPU getting an access fault does not support hardware updates of the access/dirty flags. However, a different agent in the system (e.g. SMMU) can do this, therefore overriding a writable entry with a clean one could potentially lose the automatically updated dirty status 2. A more complex situation is possible when all CPUs support hardware AF/DBM: a) Initial state: shareable + writable vma and pte_none(pte) b) Read fault taken by two threads of the same process on different CPUs c) CPU0 takes the mmap_sem and proceeds to handling the fault. It eventually reaches do_set_pte() which sets a writable + clean pte. CPU0 releases the mmap_sem d) CPU1 acquires the mmap_sem and proceeds to handle_pte_fault(). The pte entry it reads is present, writable and clean and it continues to pte_mkyoung() e) CPU1 calls ptep_set_access_flags() If between (d) and (e) the hardware (another CPU) updates the dirty state (clears PTE_RDONLY), CPU1 will override the PTR_RDONLY bit marking the entry clean again. This patch implements an arm64-specific ptep_set_access_flags() function to perform an atomic update of the PTE flags. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: Ming Lei <tom.leiming@gmail.com> Tested-by: Julien Grall <julien.grall@arm.com> Cc: Will Deacon <will.deacon@arm.com> Cc: <stable@vger.kernel.org> # 4.3+ [will: reworded comment] Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 23:01:22 +08:00
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* This function sets the access flags (dirty, accessed), as well as write
* permission, and only to a more permissive setting.
*
* It needs to cope with hardware update of the accessed/dirty state by other
* agents in the system and can safely skip the __sync_icache_dcache() call as,
* like set_pte_at(), the PTE is never changed from no-exec to exec here.
*
* Returns whether or not the PTE actually changed.
*/
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
pteval_t old_pteval;
unsigned int tmp;
if (pte_same(*ptep, entry))
return 0;
/* only preserve the access flags and write permission */
pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY;
/*
* PTE_RDONLY is cleared by default in the asm below, so set it in
* back if necessary (read-only or clean PTE).
*/
arm64: mm: always take dirty state from new pte in ptep_set_access_flags Commit 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM") ensured that pte flags are updated atomically in the face of potential concurrent, hardware-assisted updates. However, Alex reports that: | This patch breaks swapping for me. | In the broken case, you'll see either systemd cpu time spike (because | it's stuck in a page fault loop) or the system hang (because the | application owning the screen is stuck in a page fault loop). It turns out that this is because the 'dirty' argument to ptep_set_access_flags is always 0 for read faults, and so we can't use it to set PTE_RDONLY. The failing sequence is: 1. We put down a PTE_WRITE | PTE_DIRTY | PTE_AF pte 2. Memory pressure -> pte_mkold(pte) -> clear PTE_AF 3. A read faults due to the missing access flag 4. ptep_set_access_flags is called with dirty = 0, due to the read fault 5. pte is then made PTE_WRITE | PTE_DIRTY | PTE_AF | PTE_RDONLY (!) 6. A write faults, but pte_write is true so we get stuck The solution is to check the new page table entry (as would be done by the generic, non-atomic definition of ptep_set_access_flags that just calls set_pte_at) to establish the dirty state. Cc: <stable@vger.kernel.org> # 4.3+ Fixes: 66dbd6e61a52 ("arm64: Implement ptep_set_access_flags() for hardware AF/DBM") Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: Alexander Graf <agraf@suse.de> Tested-by: Alexander Graf <agraf@suse.de> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-06-08 00:55:15 +08:00
if (!pte_write(entry) || !pte_sw_dirty(entry))
arm64: Implement ptep_set_access_flags() for hardware AF/DBM When hardware updates of the access and dirty states are enabled, the default ptep_set_access_flags() implementation based on calling set_pte_at() directly is potentially racy. This triggers the "racy dirty state clearing" warning in set_pte_at() because an existing writable PTE is overridden with a clean entry. There are two main scenarios for this situation: 1. The CPU getting an access fault does not support hardware updates of the access/dirty flags. However, a different agent in the system (e.g. SMMU) can do this, therefore overriding a writable entry with a clean one could potentially lose the automatically updated dirty status 2. A more complex situation is possible when all CPUs support hardware AF/DBM: a) Initial state: shareable + writable vma and pte_none(pte) b) Read fault taken by two threads of the same process on different CPUs c) CPU0 takes the mmap_sem and proceeds to handling the fault. It eventually reaches do_set_pte() which sets a writable + clean pte. CPU0 releases the mmap_sem d) CPU1 acquires the mmap_sem and proceeds to handle_pte_fault(). The pte entry it reads is present, writable and clean and it continues to pte_mkyoung() e) CPU1 calls ptep_set_access_flags() If between (d) and (e) the hardware (another CPU) updates the dirty state (clears PTE_RDONLY), CPU1 will override the PTR_RDONLY bit marking the entry clean again. This patch implements an arm64-specific ptep_set_access_flags() function to perform an atomic update of the PTE flags. Fixes: 2f4b829c625e ("arm64: Add support for hardware updates of the access and dirty pte bits") Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> Reported-by: Ming Lei <tom.leiming@gmail.com> Tested-by: Julien Grall <julien.grall@arm.com> Cc: Will Deacon <will.deacon@arm.com> Cc: <stable@vger.kernel.org> # 4.3+ [will: reworded comment] Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 23:01:22 +08:00
pte_val(entry) |= PTE_RDONLY;
/*
* Setting the flags must be done atomically to avoid racing with the
* hardware update of the access/dirty state.
*/
asm volatile("// ptep_set_access_flags\n"
" prfm pstl1strm, %2\n"
"1: ldxr %0, %2\n"
" and %0, %0, %3 // clear PTE_RDONLY\n"
" orr %0, %0, %4 // set flags\n"
" stxr %w1, %0, %2\n"
" cbnz %w1, 1b\n"
: "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))
: "L" (~PTE_RDONLY), "r" (pte_val(entry)));
flush_tlb_fix_spurious_fault(vma, address);
return 1;
}
#endif
arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Laura Abbott <labbott@redhat.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-08-10 09:25:26 +08:00
static bool is_el1_instruction_abort(unsigned int esr)
{
return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}
/*
* The kernel tried to access some page that wasn't present.
*/
static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
unsigned int esr, struct pt_regs *regs)
{
/*
* Are we prepared to handle this kernel fault?
arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Laura Abbott <labbott@redhat.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-08-10 09:25:26 +08:00
* We are almost certainly not prepared to handle instruction faults.
*/
arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Laura Abbott <labbott@redhat.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-08-10 09:25:26 +08:00
if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
return;
/*
* No handler, we'll have to terminate things with extreme prejudice.
*/
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
(addr < PAGE_SIZE) ? "NULL pointer dereference" :
"paging request", addr);
show_pte(mm, addr);
die("Oops", regs, esr);
bust_spinlocks(0);
do_exit(SIGKILL);
}
/*
* Something tried to access memory that isn't in our memory map. User mode
* accesses just cause a SIGSEGV
*/
static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
unsigned int esr, unsigned int sig, int code,
struct pt_regs *regs)
{
struct siginfo si;
if (unhandled_signal(tsk, sig) && show_unhandled_signals_ratelimited()) {
pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n",
tsk->comm, task_pid_nr(tsk), fault_name(esr), sig,
addr, esr);
show_pte(tsk->mm, addr);
show_regs(regs);
}
tsk->thread.fault_address = addr;
tsk->thread.fault_code = esr;
si.si_signo = sig;
si.si_errno = 0;
si.si_code = code;
si.si_addr = (void __user *)addr;
force_sig_info(sig, &si, tsk);
}
static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->active_mm;
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
*/
if (user_mode(regs))
__do_user_fault(tsk, addr, esr, SIGSEGV, SEGV_MAPERR, regs);
else
__do_kernel_fault(mm, addr, esr, regs);
}
#define VM_FAULT_BADMAP 0x010000
#define VM_FAULT_BADACCESS 0x020000
static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags,
struct task_struct *tsk)
{
struct vm_area_struct *vma;
int fault;
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
if (unlikely(!vma))
goto out;
if (unlikely(vma->vm_start > addr))
goto check_stack;
/*
* Ok, we have a good vm_area for this memory access, so we can handle
* it.
*/
good_area:
/*
* Check that the permissions on the VMA allow for the fault which
* occurred.
*/
if (!(vma->vm_flags & vm_flags)) {
fault = VM_FAULT_BADACCESS;
goto out;
}
return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
check_stack:
if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
goto good_area;
out:
return fault;
}
static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs)
{
unsigned int ec = ESR_ELx_EC(esr);
unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR)
return false;
if (system_uses_ttbr0_pan())
return fsc_type == ESR_ELx_FSC_FAULT &&
(regs->pstate & PSR_PAN_BIT);
else
return fsc_type == ESR_ELx_FSC_PERM;
}
static bool is_el0_instruction_abort(unsigned int esr)
{
return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
int fault, sig, code;
unsigned long vm_flags = VM_READ | VM_WRITE;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
arm64: Kprobes with single stepping support Add support for basic kernel probes(kprobes) and jump probes (jprobes) for ARM64. Kprobes utilizes software breakpoint and single step debug exceptions supported on ARM v8. A software breakpoint is placed at the probe address to trap the kernel execution into the kprobe handler. ARM v8 supports enabling single stepping before the break exception return (ERET), with next PC in exception return address (ELR_EL1). The kprobe handler prepares an executable memory slot for out-of-line execution with a copy of the original instruction being probed, and enables single stepping. The PC is set to the out-of-line slot address before the ERET. With this scheme, the instruction is executed with the exact same register context except for the PC (and DAIF) registers. Debug mask (PSTATE.D) is enabled only when single stepping a recursive kprobe, e.g.: during kprobes reenter so that probed instruction can be single stepped within the kprobe handler -exception- context. The recursion depth of kprobe is always 2, i.e. upon probe re-entry, any further re-entry is prevented by not calling handlers and the case counted as a missed kprobe). Single stepping from the x-o-l slot has a drawback for PC-relative accesses like branching and symbolic literals access as the offset from the new PC (slot address) may not be ensured to fit in the immediate value of the opcode. Such instructions need simulation, so reject probing them. Instructions generating exceptions or cpu mode change are rejected for probing. Exclusive load/store instructions are rejected too. Additionally, the code is checked to see if it is inside an exclusive load/store sequence (code from Pratyush). System instructions are mostly enabled for stepping, except MSR/MRS accesses to "DAIF" flags in PSTATE, which are not safe for probing. This also changes arch/arm64/include/asm/ptrace.h to use include/asm-generic/ptrace.h. Thanks to Steve Capper and Pratyush Anand for several suggested Changes. Signed-off-by: Sandeepa Prabhu <sandeepa.s.prabhu@gmail.com> Signed-off-by: David A. Long <dave.long@linaro.org> Signed-off-by: Pratyush Anand <panand@redhat.com> Acked-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-07-09 00:35:48 +08:00
if (notify_page_fault(regs, esr))
return 0;
tsk = current;
mm = tsk->mm;
/*
* If we're in an interrupt or have no user context, we must not take
* the fault.
*/
mm/fault, arch: Use pagefault_disable() to check for disabled pagefaults in the handler Introduce faulthandler_disabled() and use it to check for irq context and disabled pagefaults (via pagefault_disable()) in the pagefault handlers. Please note that we keep the in_atomic() checks in place - to detect whether in irq context (in which case preemption is always properly disabled). In contrast, preempt_disable() should never be used to disable pagefaults. With !CONFIG_PREEMPT_COUNT, preempt_disable() doesn't modify the preempt counter, and therefore the result of in_atomic() differs. We validate that condition by using might_fault() checks when calling might_sleep(). Therefore, add a comment to faulthandler_disabled(), describing why this is needed. faulthandler_disabled() and pagefault_disable() are defined in linux/uaccess.h, so let's properly add that include to all relevant files. This patch is based on a patch from Thomas Gleixner. Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-7-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-11 23:52:11 +08:00
if (faulthandler_disabled() || !mm)
goto no_context;
if (user_mode(regs))
mm_flags |= FAULT_FLAG_USER;
if (is_el0_instruction_abort(esr)) {
vm_flags = VM_EXEC;
} else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) {
vm_flags = VM_WRITE;
mm_flags |= FAULT_FLAG_WRITE;
}
if (addr < USER_DS && is_permission_fault(esr, regs)) {
/* regs->orig_addr_limit may be 0 if we entered from EL0 */
if (regs->orig_addr_limit == KERNEL_DS)
die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
arm64: Handle el1 synchronous instruction aborts cleanly Executing from a non-executable area gives an ugly message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0e08 lkdtm: attempting bad execution at ffff000008880700 Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL) CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13 Hardware name: linux,dummy-virt (DT) task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 The 'IABT (current EL)' indicates the error but it's a bit cryptic without knowledge of the ARM ARM. There is also no indication of the specific address which triggered the fault. The increase in kernel page permissions makes hitting this case more likely as well. Handling the case in the vectors gives a much more familiar looking error message: lkdtm: Performing direct entry EXEC_RODATA lkdtm: attempting ok execution at ffff0000084c0840 lkdtm: attempting bad execution at ffff000008880680 Unable to handle kernel paging request at virtual address ffff000008880680 pgd = ffff8000089b2000 [ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000 Internal error: Oops: 8400000e [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24 Hardware name: linux,dummy-virt (DT) task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000 PC is at lkdtm_rodata_do_nothing+0x0/0x8 LR is at execute_location+0x74/0x88 Acked-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Laura Abbott <labbott@redhat.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-08-10 09:25:26 +08:00
if (is_el1_instruction_abort(esr))
die("Attempting to execute userspace memory", regs, esr);
if (!search_exception_tables(regs->pc))
die("Accessing user space memory outside uaccess.h routines", regs, esr);
}
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
* we can bug out early if this is from code which shouldn't.
*/
if (!down_read_trylock(&mm->mmap_sem)) {
if (!user_mode(regs) && !search_exception_tables(regs->pc))
goto no_context;
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in which
* case, we'll have missed the might_sleep() from down_read().
*/
might_sleep();
#ifdef CONFIG_DEBUG_VM
if (!user_mode(regs) && !search_exception_tables(regs->pc))
goto no_context;
#endif
}
fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
/*
* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_sem because it
* would already be released in __lock_page_or_retry in mm/filemap.c.
*/
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
return 0;
/*
* Major/minor page fault accounting is only done on the initial
* attempt. If we go through a retry, it is extremely likely that the
* page will be found in page cache at that point.
*/
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
addr);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
addr);
}
if (fault & VM_FAULT_RETRY) {
/*
* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
* starvation.
*/
mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
mm_flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
up_read(&mm->mmap_sem);
/*
* Handle the "normal" case first - VM_FAULT_MAJOR
*/
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
VM_FAULT_BADACCESS))))
return 0;
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
*/
if (!user_mode(regs))
goto no_context;
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return to
* userspace (which will retry the fault, or kill us if we got
* oom-killed).
*/
pagefault_out_of_memory();
return 0;
}
if (fault & VM_FAULT_SIGBUS) {
/*
* We had some memory, but were unable to successfully fix up
* this page fault.
*/
sig = SIGBUS;
code = BUS_ADRERR;
} else {
/*
* Something tried to access memory that isn't in our memory
* map.
*/
sig = SIGSEGV;
code = fault == VM_FAULT_BADACCESS ?
SEGV_ACCERR : SEGV_MAPERR;
}
__do_user_fault(tsk, addr, esr, sig, code, regs);
return 0;
no_context:
__do_kernel_fault(mm, addr, esr, regs);
return 0;
}
/*
* First Level Translation Fault Handler
*
* We enter here because the first level page table doesn't contain a valid
* entry for the address.
*
* If the address is in kernel space (>= TASK_SIZE), then we are probably
* faulting in the vmalloc() area.
*
* If the init_task's first level page tables contains the relevant entry, we
* copy the it to this task. If not, we send the process a signal, fixup the
* exception, or oops the kernel.
*
* NOTE! We MUST NOT take any locks for this case. We may be in an interrupt
* or a critical region, and should only copy the information from the master
* page table, nothing more.
*/
static int __kprobes do_translation_fault(unsigned long addr,
unsigned int esr,
struct pt_regs *regs)
{
if (addr < TASK_SIZE)
return do_page_fault(addr, esr, regs);
do_bad_area(addr, esr, regs);
return 0;
}
static int do_alignment_fault(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
do_bad_area(addr, esr, regs);
return 0;
}
/*
* This abort handler always returns "fault".
*/
static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
return 1;
}
static const struct fault_info {
int (*fn)(unsigned long addr, unsigned int esr, struct pt_regs *regs);
int sig;
int code;
const char *name;
} fault_info[] = {
{ do_bad, SIGBUS, 0, "ttbr address size fault" },
{ do_bad, SIGBUS, 0, "level 1 address size fault" },
{ do_bad, SIGBUS, 0, "level 2 address size fault" },
{ do_bad, SIGBUS, 0, "level 3 address size fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
{ do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
{ do_bad, SIGBUS, 0, "unknown 8" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
{ do_bad, SIGBUS, 0, "unknown 12" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
{ do_bad, SIGBUS, 0, "synchronous external abort" },
{ do_bad, SIGBUS, 0, "unknown 17" },
{ do_bad, SIGBUS, 0, "unknown 18" },
{ do_bad, SIGBUS, 0, "unknown 19" },
{ do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous parity error" },
{ do_bad, SIGBUS, 0, "unknown 25" },
{ do_bad, SIGBUS, 0, "unknown 26" },
{ do_bad, SIGBUS, 0, "unknown 27" },
{ do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
{ do_bad, SIGBUS, 0, "synchronous parity error (translation table walk)" },
{ do_bad, SIGBUS, 0, "unknown 32" },
{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
{ do_bad, SIGBUS, 0, "unknown 34" },
{ do_bad, SIGBUS, 0, "unknown 35" },
{ do_bad, SIGBUS, 0, "unknown 36" },
{ do_bad, SIGBUS, 0, "unknown 37" },
{ do_bad, SIGBUS, 0, "unknown 38" },
{ do_bad, SIGBUS, 0, "unknown 39" },
{ do_bad, SIGBUS, 0, "unknown 40" },
{ do_bad, SIGBUS, 0, "unknown 41" },
{ do_bad, SIGBUS, 0, "unknown 42" },
{ do_bad, SIGBUS, 0, "unknown 43" },
{ do_bad, SIGBUS, 0, "unknown 44" },
{ do_bad, SIGBUS, 0, "unknown 45" },
{ do_bad, SIGBUS, 0, "unknown 46" },
{ do_bad, SIGBUS, 0, "unknown 47" },
{ do_bad, SIGBUS, 0, "TLB conflict abort" },
{ do_bad, SIGBUS, 0, "unknown 49" },
{ do_bad, SIGBUS, 0, "unknown 50" },
{ do_bad, SIGBUS, 0, "unknown 51" },
{ do_bad, SIGBUS, 0, "implementation fault (lockdown abort)" },
{ do_bad, SIGBUS, 0, "implementation fault (unsupported exclusive)" },
{ do_bad, SIGBUS, 0, "unknown 54" },
{ do_bad, SIGBUS, 0, "unknown 55" },
{ do_bad, SIGBUS, 0, "unknown 56" },
{ do_bad, SIGBUS, 0, "unknown 57" },
{ do_bad, SIGBUS, 0, "unknown 58" },
{ do_bad, SIGBUS, 0, "unknown 59" },
{ do_bad, SIGBUS, 0, "unknown 60" },
{ do_bad, SIGBUS, 0, "section domain fault" },
{ do_bad, SIGBUS, 0, "page domain fault" },
{ do_bad, SIGBUS, 0, "unknown 63" },
};
static const char *fault_name(unsigned int esr)
{
const struct fault_info *inf = fault_info + (esr & 63);
return inf->name;
}
/*
* Dispatch a data abort to the relevant handler.
*/
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
const struct fault_info *inf = fault_info + (esr & 63);
struct siginfo info;
if (!inf->fn(addr, esr, regs))
return;
pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n",
inf->name, esr, addr);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
info.si_addr = (void __user *)addr;
arm64_notify_die("", regs, &info, esr);
}
/*
* Handle stack alignment exceptions.
*/
asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
unsigned int esr,
struct pt_regs *regs)
{
struct siginfo info;
struct task_struct *tsk = current;
if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS))
pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n",
tsk->comm, task_pid_nr(tsk),
esr_get_class_string(esr), (void *)regs->pc,
(void *)regs->sp);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_ADRALN;
info.si_addr = (void __user *)addr;
arm64_notify_die("Oops - SP/PC alignment exception", regs, &info, esr);
}
int __init early_brk64(unsigned long addr, unsigned int esr,
struct pt_regs *regs);
/*
* __refdata because early_brk64 is __init, but the reference to it is
* clobbered at arch_initcall time.
* See traps.c and debug-monitors.c:debug_traps_init().
*/
static struct fault_info __refdata debug_fault_info[] = {
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
{ do_bad, SIGBUS, 0, "unknown 3" },
{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
{ do_bad, SIGTRAP, 0, "aarch32 vector catch" },
{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
{ do_bad, SIGBUS, 0, "unknown 7" },
};
void __init hook_debug_fault_code(int nr,
int (*fn)(unsigned long, unsigned int, struct pt_regs *),
int sig, int code, const char *name)
{
BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
debug_fault_info[nr].fn = fn;
debug_fault_info[nr].sig = sig;
debug_fault_info[nr].code = code;
debug_fault_info[nr].name = name;
}
asmlinkage int __exception do_debug_exception(unsigned long addr,
unsigned int esr,
struct pt_regs *regs)
{
const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
struct siginfo info;
arm64: mm: Add trace_irqflags annotations to do_debug_exception() With CONFIG_PROVE_LOCKING, CONFIG_DEBUG_LOCKDEP and CONFIG_TRACE_IRQFLAGS enabled, lockdep will compare current->hardirqs_enabled with the flags from local_irq_save(). When a debug exception occurs, interrupts are disabled in entry.S, but lockdep isn't told, resulting in: DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) ------------[ cut here ]------------ WARNING: at ../kernel/locking/lockdep.c:3523 Modules linked in: CPU: 3 PID: 1752 Comm: perf Not tainted 4.5.0-rc4+ #2204 Hardware name: ARM Juno development board (r1) (DT) task: ffffffc974868000 ti: ffffffc975f40000 task.ti: ffffffc975f40000 PC is at check_flags.part.35+0x17c/0x184 LR is at check_flags.part.35+0x17c/0x184 pc : [<ffffff80080fc93c>] lr : [<ffffff80080fc93c>] pstate: 600003c5 [...] ---[ end trace 74631f9305ef5020 ]--- Call trace: [<ffffff80080fc93c>] check_flags.part.35+0x17c/0x184 [<ffffff80080ffe30>] lock_acquire+0xa8/0xc4 [<ffffff8008093038>] breakpoint_handler+0x118/0x288 [<ffffff8008082434>] do_debug_exception+0x3c/0xa8 [<ffffff80080854b4>] el1_dbg+0x18/0x6c [<ffffff80081e82f4>] do_filp_open+0x64/0xdc [<ffffff80081d6e60>] do_sys_open+0x140/0x204 [<ffffff80081d6f58>] SyS_openat+0x10/0x18 [<ffffff8008085d30>] el0_svc_naked+0x24/0x28 possible reason: unannotated irqs-off. irq event stamp: 65857 hardirqs last enabled at (65857): [<ffffff80081fb1c0>] lookup_mnt+0xf4/0x1b4 hardirqs last disabled at (65856): [<ffffff80081fb188>] lookup_mnt+0xbc/0x1b4 softirqs last enabled at (65790): [<ffffff80080bdca4>] __do_softirq+0x1f8/0x290 softirqs last disabled at (65757): [<ffffff80080be038>] irq_exit+0x9c/0xd0 This patch adds the annotations to do_debug_exception(), while trying not to call trace_hardirqs_off() if el1_dbg() interrupted a task that already had irqs disabled. Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 20:40:00 +08:00
int rv;
arm64: mm: Add trace_irqflags annotations to do_debug_exception() With CONFIG_PROVE_LOCKING, CONFIG_DEBUG_LOCKDEP and CONFIG_TRACE_IRQFLAGS enabled, lockdep will compare current->hardirqs_enabled with the flags from local_irq_save(). When a debug exception occurs, interrupts are disabled in entry.S, but lockdep isn't told, resulting in: DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) ------------[ cut here ]------------ WARNING: at ../kernel/locking/lockdep.c:3523 Modules linked in: CPU: 3 PID: 1752 Comm: perf Not tainted 4.5.0-rc4+ #2204 Hardware name: ARM Juno development board (r1) (DT) task: ffffffc974868000 ti: ffffffc975f40000 task.ti: ffffffc975f40000 PC is at check_flags.part.35+0x17c/0x184 LR is at check_flags.part.35+0x17c/0x184 pc : [<ffffff80080fc93c>] lr : [<ffffff80080fc93c>] pstate: 600003c5 [...] ---[ end trace 74631f9305ef5020 ]--- Call trace: [<ffffff80080fc93c>] check_flags.part.35+0x17c/0x184 [<ffffff80080ffe30>] lock_acquire+0xa8/0xc4 [<ffffff8008093038>] breakpoint_handler+0x118/0x288 [<ffffff8008082434>] do_debug_exception+0x3c/0xa8 [<ffffff80080854b4>] el1_dbg+0x18/0x6c [<ffffff80081e82f4>] do_filp_open+0x64/0xdc [<ffffff80081d6e60>] do_sys_open+0x140/0x204 [<ffffff80081d6f58>] SyS_openat+0x10/0x18 [<ffffff8008085d30>] el0_svc_naked+0x24/0x28 possible reason: unannotated irqs-off. irq event stamp: 65857 hardirqs last enabled at (65857): [<ffffff80081fb1c0>] lookup_mnt+0xf4/0x1b4 hardirqs last disabled at (65856): [<ffffff80081fb188>] lookup_mnt+0xbc/0x1b4 softirqs last enabled at (65790): [<ffffff80080bdca4>] __do_softirq+0x1f8/0x290 softirqs last disabled at (65757): [<ffffff80080be038>] irq_exit+0x9c/0xd0 This patch adds the annotations to do_debug_exception(), while trying not to call trace_hardirqs_off() if el1_dbg() interrupted a task that already had irqs disabled. Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 20:40:00 +08:00
/*
* Tell lockdep we disabled irqs in entry.S. Do nothing if they were
* already disabled to preserve the last enabled/disabled addresses.
*/
if (interrupts_enabled(regs))
trace_hardirqs_off();
arm64: mm: Add trace_irqflags annotations to do_debug_exception() With CONFIG_PROVE_LOCKING, CONFIG_DEBUG_LOCKDEP and CONFIG_TRACE_IRQFLAGS enabled, lockdep will compare current->hardirqs_enabled with the flags from local_irq_save(). When a debug exception occurs, interrupts are disabled in entry.S, but lockdep isn't told, resulting in: DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) ------------[ cut here ]------------ WARNING: at ../kernel/locking/lockdep.c:3523 Modules linked in: CPU: 3 PID: 1752 Comm: perf Not tainted 4.5.0-rc4+ #2204 Hardware name: ARM Juno development board (r1) (DT) task: ffffffc974868000 ti: ffffffc975f40000 task.ti: ffffffc975f40000 PC is at check_flags.part.35+0x17c/0x184 LR is at check_flags.part.35+0x17c/0x184 pc : [<ffffff80080fc93c>] lr : [<ffffff80080fc93c>] pstate: 600003c5 [...] ---[ end trace 74631f9305ef5020 ]--- Call trace: [<ffffff80080fc93c>] check_flags.part.35+0x17c/0x184 [<ffffff80080ffe30>] lock_acquire+0xa8/0xc4 [<ffffff8008093038>] breakpoint_handler+0x118/0x288 [<ffffff8008082434>] do_debug_exception+0x3c/0xa8 [<ffffff80080854b4>] el1_dbg+0x18/0x6c [<ffffff80081e82f4>] do_filp_open+0x64/0xdc [<ffffff80081d6e60>] do_sys_open+0x140/0x204 [<ffffff80081d6f58>] SyS_openat+0x10/0x18 [<ffffff8008085d30>] el0_svc_naked+0x24/0x28 possible reason: unannotated irqs-off. irq event stamp: 65857 hardirqs last enabled at (65857): [<ffffff80081fb1c0>] lookup_mnt+0xf4/0x1b4 hardirqs last disabled at (65856): [<ffffff80081fb188>] lookup_mnt+0xbc/0x1b4 softirqs last enabled at (65790): [<ffffff80080bdca4>] __do_softirq+0x1f8/0x290 softirqs last disabled at (65757): [<ffffff80080be038>] irq_exit+0x9c/0xd0 This patch adds the annotations to do_debug_exception(), while trying not to call trace_hardirqs_off() if el1_dbg() interrupted a task that already had irqs disabled. Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 20:40:00 +08:00
if (!inf->fn(addr, esr, regs)) {
rv = 1;
} else {
pr_alert("Unhandled debug exception: %s (0x%08x) at 0x%016lx\n",
inf->name, esr, addr);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
info.si_addr = (void __user *)addr;
arm64_notify_die("", regs, &info, 0);
rv = 0;
}
arm64: mm: Add trace_irqflags annotations to do_debug_exception() With CONFIG_PROVE_LOCKING, CONFIG_DEBUG_LOCKDEP and CONFIG_TRACE_IRQFLAGS enabled, lockdep will compare current->hardirqs_enabled with the flags from local_irq_save(). When a debug exception occurs, interrupts are disabled in entry.S, but lockdep isn't told, resulting in: DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) ------------[ cut here ]------------ WARNING: at ../kernel/locking/lockdep.c:3523 Modules linked in: CPU: 3 PID: 1752 Comm: perf Not tainted 4.5.0-rc4+ #2204 Hardware name: ARM Juno development board (r1) (DT) task: ffffffc974868000 ti: ffffffc975f40000 task.ti: ffffffc975f40000 PC is at check_flags.part.35+0x17c/0x184 LR is at check_flags.part.35+0x17c/0x184 pc : [<ffffff80080fc93c>] lr : [<ffffff80080fc93c>] pstate: 600003c5 [...] ---[ end trace 74631f9305ef5020 ]--- Call trace: [<ffffff80080fc93c>] check_flags.part.35+0x17c/0x184 [<ffffff80080ffe30>] lock_acquire+0xa8/0xc4 [<ffffff8008093038>] breakpoint_handler+0x118/0x288 [<ffffff8008082434>] do_debug_exception+0x3c/0xa8 [<ffffff80080854b4>] el1_dbg+0x18/0x6c [<ffffff80081e82f4>] do_filp_open+0x64/0xdc [<ffffff80081d6e60>] do_sys_open+0x140/0x204 [<ffffff80081d6f58>] SyS_openat+0x10/0x18 [<ffffff8008085d30>] el0_svc_naked+0x24/0x28 possible reason: unannotated irqs-off. irq event stamp: 65857 hardirqs last enabled at (65857): [<ffffff80081fb1c0>] lookup_mnt+0xf4/0x1b4 hardirqs last disabled at (65856): [<ffffff80081fb188>] lookup_mnt+0xbc/0x1b4 softirqs last enabled at (65790): [<ffffff80080bdca4>] __do_softirq+0x1f8/0x290 softirqs last disabled at (65757): [<ffffff80080be038>] irq_exit+0x9c/0xd0 This patch adds the annotations to do_debug_exception(), while trying not to call trace_hardirqs_off() if el1_dbg() interrupted a task that already had irqs disabled. Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 20:40:00 +08:00
if (interrupts_enabled(regs))
trace_hardirqs_on();
arm64: mm: Add trace_irqflags annotations to do_debug_exception() With CONFIG_PROVE_LOCKING, CONFIG_DEBUG_LOCKDEP and CONFIG_TRACE_IRQFLAGS enabled, lockdep will compare current->hardirqs_enabled with the flags from local_irq_save(). When a debug exception occurs, interrupts are disabled in entry.S, but lockdep isn't told, resulting in: DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled) ------------[ cut here ]------------ WARNING: at ../kernel/locking/lockdep.c:3523 Modules linked in: CPU: 3 PID: 1752 Comm: perf Not tainted 4.5.0-rc4+ #2204 Hardware name: ARM Juno development board (r1) (DT) task: ffffffc974868000 ti: ffffffc975f40000 task.ti: ffffffc975f40000 PC is at check_flags.part.35+0x17c/0x184 LR is at check_flags.part.35+0x17c/0x184 pc : [<ffffff80080fc93c>] lr : [<ffffff80080fc93c>] pstate: 600003c5 [...] ---[ end trace 74631f9305ef5020 ]--- Call trace: [<ffffff80080fc93c>] check_flags.part.35+0x17c/0x184 [<ffffff80080ffe30>] lock_acquire+0xa8/0xc4 [<ffffff8008093038>] breakpoint_handler+0x118/0x288 [<ffffff8008082434>] do_debug_exception+0x3c/0xa8 [<ffffff80080854b4>] el1_dbg+0x18/0x6c [<ffffff80081e82f4>] do_filp_open+0x64/0xdc [<ffffff80081d6e60>] do_sys_open+0x140/0x204 [<ffffff80081d6f58>] SyS_openat+0x10/0x18 [<ffffff8008085d30>] el0_svc_naked+0x24/0x28 possible reason: unannotated irqs-off. irq event stamp: 65857 hardirqs last enabled at (65857): [<ffffff80081fb1c0>] lookup_mnt+0xf4/0x1b4 hardirqs last disabled at (65856): [<ffffff80081fb188>] lookup_mnt+0xbc/0x1b4 softirqs last enabled at (65790): [<ffffff80080bdca4>] __do_softirq+0x1f8/0x290 softirqs last disabled at (65757): [<ffffff80080be038>] irq_exit+0x9c/0xd0 This patch adds the annotations to do_debug_exception(), while trying not to call trace_hardirqs_off() if el1_dbg() interrupted a task that already had irqs disabled. Signed-off-by: James Morse <james.morse@arm.com> Signed-off-by: Will Deacon <will.deacon@arm.com>
2016-04-13 20:40:00 +08:00
return rv;
}
arm64: Kprobes with single stepping support Add support for basic kernel probes(kprobes) and jump probes (jprobes) for ARM64. Kprobes utilizes software breakpoint and single step debug exceptions supported on ARM v8. A software breakpoint is placed at the probe address to trap the kernel execution into the kprobe handler. ARM v8 supports enabling single stepping before the break exception return (ERET), with next PC in exception return address (ELR_EL1). The kprobe handler prepares an executable memory slot for out-of-line execution with a copy of the original instruction being probed, and enables single stepping. The PC is set to the out-of-line slot address before the ERET. With this scheme, the instruction is executed with the exact same register context except for the PC (and DAIF) registers. Debug mask (PSTATE.D) is enabled only when single stepping a recursive kprobe, e.g.: during kprobes reenter so that probed instruction can be single stepped within the kprobe handler -exception- context. The recursion depth of kprobe is always 2, i.e. upon probe re-entry, any further re-entry is prevented by not calling handlers and the case counted as a missed kprobe). Single stepping from the x-o-l slot has a drawback for PC-relative accesses like branching and symbolic literals access as the offset from the new PC (slot address) may not be ensured to fit in the immediate value of the opcode. Such instructions need simulation, so reject probing them. Instructions generating exceptions or cpu mode change are rejected for probing. Exclusive load/store instructions are rejected too. Additionally, the code is checked to see if it is inside an exclusive load/store sequence (code from Pratyush). System instructions are mostly enabled for stepping, except MSR/MRS accesses to "DAIF" flags in PSTATE, which are not safe for probing. This also changes arch/arm64/include/asm/ptrace.h to use include/asm-generic/ptrace.h. Thanks to Steve Capper and Pratyush Anand for several suggested Changes. Signed-off-by: Sandeepa Prabhu <sandeepa.s.prabhu@gmail.com> Signed-off-by: David A. Long <dave.long@linaro.org> Signed-off-by: Pratyush Anand <panand@redhat.com> Acked-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-07-09 00:35:48 +08:00
NOKPROBE_SYMBOL(do_debug_exception);
#ifdef CONFIG_ARM64_PAN
int cpu_enable_pan(void *__unused)
{
/*
* We modify PSTATE. This won't work from irq context as the PSTATE
* is discarded once we return from the exception.
*/
WARN_ON_ONCE(in_interrupt());
config_sctlr_el1(SCTLR_EL1_SPAN, 0);
asm(SET_PSTATE_PAN(1));
return 0;
}
#endif /* CONFIG_ARM64_PAN */
#ifdef CONFIG_ARM64_UAO
/*
* Kernel threads have fs=KERNEL_DS by default, and don't need to call
* set_fs(), devtmpfs in particular relies on this behaviour.
* We need to enable the feature at runtime (instead of adding it to
* PSR_MODE_EL1h) as the feature may not be implemented by the cpu.
*/
int cpu_enable_uao(void *__unused)
{
asm(SET_PSTATE_UAO(1));
return 0;
}
#endif /* CONFIG_ARM64_UAO */