Merge branch 'bpf-stackmap-nmi'
Song Liu says: ==================== Changes v2 -> v3: Improve syntax based on suggestion by Tobin C. Harding. Changes v1 -> v2: 1. Rename some variables to (hopefully) reduce confusion; 2. Check irq_work status with IRQ_WORK_BUSY (instead of work->sem); 3. In Kconfig, let BPF_SYSCALL select IRQ_WORK; 4. Add static to DEFINE_PER_CPU(); 5. Remove pr_info() in stack_map_init(). ==================== Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
This commit is contained in:
commit
1772be37fc
|
@ -1391,6 +1391,7 @@ config BPF_SYSCALL
|
|||
bool "Enable bpf() system call"
|
||||
select ANON_INODES
|
||||
select BPF
|
||||
select IRQ_WORK
|
||||
default n
|
||||
help
|
||||
Enable the bpf() system call that allows to manipulate eBPF
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <linux/perf_event.h>
|
||||
#include <linux/elf.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/irq_work.h>
|
||||
#include "percpu_freelist.h"
|
||||
|
||||
#define STACK_CREATE_FLAG_MASK \
|
||||
|
@ -32,6 +33,23 @@ struct bpf_stack_map {
|
|||
struct stack_map_bucket *buckets[];
|
||||
};
|
||||
|
||||
/* irq_work to run up_read() for build_id lookup in nmi context */
|
||||
struct stack_map_irq_work {
|
||||
struct irq_work irq_work;
|
||||
struct rw_semaphore *sem;
|
||||
};
|
||||
|
||||
static void do_up_read(struct irq_work *entry)
|
||||
{
|
||||
struct stack_map_irq_work *work;
|
||||
|
||||
work = container_of(entry, struct stack_map_irq_work, irq_work);
|
||||
up_read(work->sem);
|
||||
work->sem = NULL;
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
|
||||
|
||||
static inline bool stack_map_use_build_id(struct bpf_map *map)
|
||||
{
|
||||
return (map->map_flags & BPF_F_STACK_BUILD_ID);
|
||||
|
@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
|
|||
{
|
||||
int i;
|
||||
struct vm_area_struct *vma;
|
||||
bool in_nmi_ctx = in_nmi();
|
||||
bool irq_work_busy = false;
|
||||
struct stack_map_irq_work *work;
|
||||
|
||||
if (in_nmi_ctx) {
|
||||
work = this_cpu_ptr(&up_read_work);
|
||||
if (work->irq_work.flags & IRQ_WORK_BUSY)
|
||||
/* cannot queue more up_read, fallback */
|
||||
irq_work_busy = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* We cannot do up_read() in nmi context, so build_id lookup is
|
||||
* only supported for non-nmi events. If at some point, it is
|
||||
* possible to run find_vma() without taking the semaphore, we
|
||||
* would like to allow build_id lookup in nmi context.
|
||||
* We cannot do up_read() in nmi context. To do build_id lookup
|
||||
* in nmi context, we need to run up_read() in irq_work. We use
|
||||
* a percpu variable to do the irq_work. If the irq_work is
|
||||
* already used by another lookup, we fall back to report ips.
|
||||
*
|
||||
* Same fallback is used for kernel stack (!user) on a stackmap
|
||||
* with build_id.
|
||||
*/
|
||||
if (!user || !current || !current->mm || in_nmi() ||
|
||||
if (!user || !current || !current->mm || irq_work_busy ||
|
||||
down_read_trylock(¤t->mm->mmap_sem) == 0) {
|
||||
/* cannot access current->mm, fall back to ips */
|
||||
for (i = 0; i < trace_nr; i++) {
|
||||
|
@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
|
|||
- vma->vm_start;
|
||||
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
|
||||
}
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
|
||||
if (!in_nmi_ctx) {
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
} else {
|
||||
work->sem = ¤t->mm->mmap_sem;
|
||||
irq_work_queue(&work->irq_work);
|
||||
}
|
||||
}
|
||||
|
||||
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
|
||||
|
@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = {
|
|||
.map_update_elem = stack_map_update_elem,
|
||||
.map_delete_elem = stack_map_delete_elem,
|
||||
};
|
||||
|
||||
static int __init stack_map_init(void)
|
||||
{
|
||||
int cpu;
|
||||
struct stack_map_irq_work *work;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
work = per_cpu_ptr(&up_read_work, cpu);
|
||||
init_irq_work(&work->irq_work, do_up_read);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(stack_map_init);
|
||||
|
|
|
@ -1272,6 +1272,139 @@ static void test_stacktrace_build_id(void)
|
|||
return;
|
||||
}
|
||||
|
||||
static void test_stacktrace_build_id_nmi(void)
|
||||
{
|
||||
int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
|
||||
const char *file = "./test_stacktrace_build_id.o";
|
||||
int err, pmu_fd, prog_fd;
|
||||
struct perf_event_attr attr = {
|
||||
.sample_freq = 5000,
|
||||
.freq = 1,
|
||||
.type = PERF_TYPE_HARDWARE,
|
||||
.config = PERF_COUNT_HW_CPU_CYCLES,
|
||||
};
|
||||
__u32 key, previous_key, val, duration = 0;
|
||||
struct bpf_object *obj;
|
||||
char buf[256];
|
||||
int i, j;
|
||||
struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
|
||||
int build_id_matches = 0;
|
||||
|
||||
err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
|
||||
if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
|
||||
return;
|
||||
|
||||
pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
|
||||
0 /* cpu 0 */, -1 /* group id */,
|
||||
0 /* flags */);
|
||||
if (CHECK(pmu_fd < 0, "perf_event_open",
|
||||
"err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
|
||||
pmu_fd, errno))
|
||||
goto close_prog;
|
||||
|
||||
err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
|
||||
if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
|
||||
err, errno))
|
||||
goto close_pmu;
|
||||
|
||||
err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
|
||||
if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
|
||||
err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
/* find map fds */
|
||||
control_map_fd = bpf_find_map(__func__, obj, "control_map");
|
||||
if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
|
||||
"err %d errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
|
||||
if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
|
||||
"err %d errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
|
||||
if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
|
||||
err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
|
||||
if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
|
||||
"err %d errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
|
||||
== 0);
|
||||
assert(system("taskset 0x1 ./urandom_read 100000") == 0);
|
||||
/* disable stack trace collection */
|
||||
key = 0;
|
||||
val = 1;
|
||||
bpf_map_update_elem(control_map_fd, &key, &val, 0);
|
||||
|
||||
/* for every element in stackid_hmap, we can find a corresponding one
|
||||
* in stackmap, and vise versa.
|
||||
*/
|
||||
err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
|
||||
if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
|
||||
"err %d errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
|
||||
if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
|
||||
"err %d errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
err = extract_build_id(buf, 256);
|
||||
|
||||
if (CHECK(err, "get build_id with readelf",
|
||||
"err %d errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
|
||||
if (CHECK(err, "get_next_key from stackmap",
|
||||
"err %d, errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
|
||||
do {
|
||||
char build_id[64];
|
||||
|
||||
err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
|
||||
if (CHECK(err, "lookup_elem from stackmap",
|
||||
"err %d, errno %d\n", err, errno))
|
||||
goto disable_pmu;
|
||||
for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
|
||||
if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
|
||||
id_offs[i].offset != 0) {
|
||||
for (j = 0; j < 20; ++j)
|
||||
sprintf(build_id + 2 * j, "%02x",
|
||||
id_offs[i].build_id[j] & 0xff);
|
||||
if (strstr(buf, build_id) != NULL)
|
||||
build_id_matches = 1;
|
||||
}
|
||||
previous_key = key;
|
||||
} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
|
||||
|
||||
if (CHECK(build_id_matches < 1, "build id match",
|
||||
"Didn't find expected build ID from the map\n"))
|
||||
goto disable_pmu;
|
||||
|
||||
/*
|
||||
* We intentionally skip compare_stack_ips(). This is because we
|
||||
* only support one in_nmi() ips-to-build_id translation per cpu
|
||||
* at any time, thus stack_amap here will always fallback to
|
||||
* BPF_STACK_BUILD_ID_IP;
|
||||
*/
|
||||
|
||||
disable_pmu:
|
||||
ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
|
||||
|
||||
close_pmu:
|
||||
close(pmu_fd);
|
||||
|
||||
close_prog:
|
||||
bpf_object__close(obj);
|
||||
}
|
||||
|
||||
#define MAX_CNT_RAWTP 10ull
|
||||
#define MAX_STACK_RAWTP 100
|
||||
struct get_stack_trace_t {
|
||||
|
@ -1425,6 +1558,7 @@ int main(void)
|
|||
test_tp_attach_query();
|
||||
test_stacktrace_map();
|
||||
test_stacktrace_build_id();
|
||||
test_stacktrace_build_id_nmi();
|
||||
test_stacktrace_map_raw_tp();
|
||||
test_get_stack_raw_tp();
|
||||
|
||||
|
|
|
@ -6,15 +6,21 @@
|
|||
#include <stdlib.h>
|
||||
|
||||
#define BUF_SIZE 256
|
||||
int main(void)
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int fd = open("/dev/urandom", O_RDONLY);
|
||||
int i;
|
||||
char buf[BUF_SIZE];
|
||||
int count = 4;
|
||||
|
||||
if (fd < 0)
|
||||
return 1;
|
||||
for (i = 0; i < 4; ++i)
|
||||
|
||||
if (argc == 2)
|
||||
count = atoi(argv[1]);
|
||||
|
||||
for (i = 0; i < count; ++i)
|
||||
read(fd, buf, BUF_SIZE);
|
||||
|
||||
close(fd);
|
||||
|
|
Loading…
Reference in New Issue