2019-11-15 02:57:04 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
/* Copyright (c) 2019 Facebook */
|
|
|
|
#include <linux/hash.h>
|
|
|
|
#include <linux/bpf.h>
|
|
|
|
#include <linux/filter.h>
|
2019-12-09 08:01:13 +08:00
|
|
|
#include <linux/ftrace.h>
|
2020-01-24 00:15:07 +08:00
|
|
|
#include <linux/rbtree_latch.h>
|
2019-11-15 02:57:04 +08:00
|
|
|
|
2020-01-21 08:53:46 +08:00
|
|
|
/* dummy _ops. The verifier will operate on target program's ops. */
|
|
|
|
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
|
|
|
|
};
|
|
|
|
/* Placeholder prog ops for extension programs; intentionally left empty. */
const struct bpf_prog_ops bpf_extension_prog_ops = {
};
|
|
|
|
|
2019-11-15 02:57:04 +08:00
|
|
|
/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
|
|
|
|
#define TRAMPOLINE_HASH_BITS 10
|
|
|
|
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
|
|
|
|
|
|
|
|
static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
|
2020-01-24 00:15:07 +08:00
|
|
|
static struct latch_tree_root image_tree __cacheline_aligned;
|
2019-11-15 02:57:04 +08:00
|
|
|
|
2020-01-24 00:15:07 +08:00
|
|
|
/* serializes access to trampoline_table and image_tree */
|
2019-11-15 02:57:04 +08:00
|
|
|
static DEFINE_MUTEX(trampoline_mutex);
|
|
|
|
|
2020-01-24 00:15:07 +08:00
|
|
|
static void *bpf_jit_alloc_exec_page(void)
|
2019-12-14 01:51:07 +08:00
|
|
|
{
|
|
|
|
void *image;
|
|
|
|
|
|
|
|
image = bpf_jit_alloc_exec(PAGE_SIZE);
|
|
|
|
if (!image)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
set_vm_flush_reset_perms(image);
|
|
|
|
/* Keep image as writeable. The alternative is to keep flipping ro/rw
|
|
|
|
* everytime new program is attached or detached.
|
|
|
|
*/
|
|
|
|
set_memory_x((long)image, 1);
|
|
|
|
return image;
|
|
|
|
}
|
|
|
|
|
2020-01-24 00:15:07 +08:00
|
|
|
/*
 * Ordering callback for image_tree: two nodes are compared by the address of
 * the struct bpf_image that embeds them.
 */
static __always_inline bool image_tree_less(struct latch_tree_node *a,
					    struct latch_tree_node *b)
{
	return container_of(a, struct bpf_image, tnode) <
	       container_of(b, struct bpf_image, tnode);
}
|
|
|
|
|
|
|
|
static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n)
|
|
|
|
{
|
|
|
|
void *image = container_of(n, struct bpf_image, tnode);
|
|
|
|
|
|
|
|
if (addr < image)
|
|
|
|
return -1;
|
|
|
|
if (addr >= image + PAGE_SIZE)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct latch_tree_ops image_tree_ops = {
|
|
|
|
.less = image_tree_less,
|
|
|
|
.comp = image_tree_comp,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Allocate an executable page, register it in image_tree and hand back its
 * data area.
 *
 * @lock: when true, trampoline_mutex is taken around the tree insertion.
 *        bpf_trampoline_lookup() passes false because it already holds the
 *        mutex.
 *
 * Returns a pointer to the image's data, or NULL if the page allocation
 * failed.
 */
static void *__bpf_image_alloc(bool lock)
{
	struct bpf_image *img = bpf_jit_alloc_exec_page();

	if (!img)
		return NULL;

	if (!lock) {
		latch_tree_insert(&img->tnode, &image_tree, &image_tree_ops);
		return img->data;
	}

	mutex_lock(&trampoline_mutex);
	latch_tree_insert(&img->tnode, &image_tree, &image_tree_ops);
	mutex_unlock(&trampoline_mutex);

	return img->data;
}
|
|
|
|
|
|
|
|
void *bpf_image_alloc(void)
|
|
|
|
{
|
|
|
|
return __bpf_image_alloc(true);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool is_bpf_image_address(unsigned long addr)
|
|
|
|
{
|
|
|
|
bool ret;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-11-15 02:57:04 +08:00
|
|
|
/*
 * Find the trampoline registered under @key, or create one if none exists.
 *
 * An existing trampoline gets its refcount bumped. A new one is allocated
 * zeroed, given a fresh image, inserted into trampoline_table and starts
 * with a refcount of 1. The whole operation runs under trampoline_mutex.
 *
 * Returns the trampoline, or NULL when either allocation fails.
 */
struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	struct hlist_head *bucket;
	struct bpf_trampoline *tr;
	void *image;
	int kind;

	mutex_lock(&trampoline_mutex);
	bucket = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];

	hlist_for_each_entry(tr, bucket, hlist) {
		if (tr->key != key)
			continue;
		refcount_inc(&tr->refcnt);
		goto unlock;
	}

	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
	if (!tr)
		goto unlock;

	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
	/* trampoline_mutex is already held, hence the unlocked variant. */
	image = __bpf_image_alloc(false);
	if (!image) {
		kfree(tr);
		tr = NULL;
		goto unlock;
	}

	tr->key = key;
	tr->image = image;
	refcount_set(&tr->refcnt, 1);
	mutex_init(&tr->mutex);
	for (kind = 0; kind < BPF_TRAMP_MAX; kind++)
		INIT_HLIST_HEAD(&tr->progs_hlist[kind]);
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, bucket);
unlock:
	mutex_unlock(&trampoline_mutex);
	return tr;
}
|
|
|
|
|
2019-12-09 08:01:13 +08:00
|
|
|
static int is_ftrace_location(void *ip)
|
|
|
|
{
|
|
|
|
long addr;
|
|
|
|
|
|
|
|
addr = ftrace_location((long)ip);
|
|
|
|
if (!addr)
|
|
|
|
return 0;
|
|
|
|
if (WARN_ON_ONCE(addr != (long)ip))
|
|
|
|
return -EFAULT;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
|
|
|
|
{
|
|
|
|
void *ip = tr->func.addr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (tr->func.ftrace_managed)
|
|
|
|
ret = unregister_ftrace_direct((long)ip, (long)old_addr);
|
|
|
|
else
|
|
|
|
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
|
|
|
|
{
|
|
|
|
void *ip = tr->func.addr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (tr->func.ftrace_managed)
|
|
|
|
ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
|
|
|
|
else
|
|
|
|
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* first time registering */
|
|
|
|
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
|
|
|
|
{
|
|
|
|
void *ip = tr->func.addr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = is_ftrace_location(ip);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
tr->func.ftrace_managed = ret;
|
|
|
|
|
|
|
|
if (tr->func.ftrace_managed)
|
|
|
|
ret = register_ftrace_direct((long)ip, (long)new_addr);
|
|
|
|
else
|
|
|
|
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-11-15 02:57:04 +08:00
|
|
|
/* A "call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit" sequence
 * takes ~50 bytes on x86. The limit below is picked so that the generated
 * code fits into BPF_IMAGE_SIZE / 2.
 */
#define BPF_MAX_TRAMP_PROGS 40
|
|
|
|
|
|
|
|
static int bpf_trampoline_update(struct bpf_trampoline *tr)
|
|
|
|
{
|
2020-01-24 00:15:07 +08:00
|
|
|
void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2;
|
|
|
|
void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2;
|
2019-11-15 02:57:04 +08:00
|
|
|
struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
|
|
|
|
int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
|
|
|
|
int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
|
|
|
|
struct bpf_prog **progs, **fentry, **fexit;
|
|
|
|
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
|
|
|
|
struct bpf_prog_aux *aux;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (fentry_cnt + fexit_cnt == 0) {
|
2019-12-09 08:01:13 +08:00
|
|
|
err = unregister_fentry(tr, old_image);
|
2019-11-15 02:57:04 +08:00
|
|
|
tr->selector = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* populate fentry progs */
|
|
|
|
fentry = progs = progs_to_run;
|
|
|
|
hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist)
|
|
|
|
*progs++ = aux->prog;
|
|
|
|
|
|
|
|
/* populate fexit progs */
|
|
|
|
fexit = progs;
|
|
|
|
hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist)
|
|
|
|
*progs++ = aux->prog;
|
|
|
|
|
|
|
|
if (fexit_cnt)
|
|
|
|
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
|
|
|
|
|
2020-01-21 11:22:31 +08:00
|
|
|
/* Though the second half of trampoline page is unused a task could be
|
|
|
|
* preempted in the middle of the first half of trampoline and two
|
|
|
|
* updates to trampoline would change the code from underneath the
|
|
|
|
* preempted task. Hence wait for tasks to voluntarily schedule or go
|
|
|
|
* to userspace.
|
|
|
|
*/
|
|
|
|
synchronize_rcu_tasks();
|
|
|
|
|
2020-01-24 00:15:07 +08:00
|
|
|
err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2,
|
bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS
The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value
is a kernel struct with its func ptr implemented in bpf prog.
This new map is the interface to register/unregister/introspect
a bpf implemented kernel struct.
The kernel struct is actually embedded inside another new struct
(or called the "value" struct in the code). For example,
"struct tcp_congestion_ops" is embbeded in:
struct bpf_struct_ops_tcp_congestion_ops {
refcount_t refcnt;
enum bpf_struct_ops_state state;
struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */
}
The map value is "struct bpf_struct_ops_tcp_congestion_ops".
The "bpftool map dump" will then be able to show the
state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g.
number of tcp_sock in the tcp_congestion_ops case). This "value" struct
is created automatically by a macro. Having a separate "value" struct
will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding
"void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some
initialization works before registering the struct_ops to the kernel
subsystem). The libbpf will take care of finding and populating the
"struct bpf_struct_ops_XYZ" from "struct XYZ".
Register a struct_ops to a kernel subsystem:
1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s)
2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id
set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the
running kernel.
Instead of reusing the attr->btf_value_type_id,
btf_vmlinux_value_type_id s added such that attr->btf_fd can still be
used as the "user" btf which could store other useful sysadmin/debug
info that may be introduced in the furture,
e.g. creation-date/compiler-details/map-creator...etc.
3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described
in the running kernel btf. Populate the value of this object.
The function ptr should be populated with the prog fds.
4. Call BPF_MAP_UPDATE with the object created in (3) as
the map value. The key is always "0".
During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's
args as an array of u64 is generated. BPF_MAP_UPDATE also allows
the specific struct_ops to do some final checks in "st_ops->init_member()"
(e.g. ensure all mandatory func ptrs are implemented).
If everything looks good, it will register this kernel struct
to the kernel subsystem. The map will not allow further update
from this point.
Unregister a struct_ops from the kernel subsystem:
BPF_MAP_DELETE with key "0".
Introspect a struct_ops:
BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will
have the prog _id_ populated as the func ptr.
The map value state (enum bpf_struct_ops_state) will transit from:
INIT (map created) =>
INUSE (map updated, i.e. reg) =>
TOBEFREE (map value deleted, i.e. unreg)
The kernel subsystem needs to call bpf_struct_ops_get() and
bpf_struct_ops_put() to manage the "refcnt" in the
"struct bpf_struct_ops_XYZ". This patch uses a separate refcnt
for the purose of tracking the subsystem usage. Another approach
is to reuse the map->refcnt and then "show" (i.e. during map_lookup)
the subsystem's usage by doing map->refcnt - map->usercnt to filter out
the map-fd/pinned-map usage. However, that will also tie down the
future semantics of map->refcnt and map->usercnt.
The very first subsystem's refcnt (during reg()) holds one
count to map->refcnt. When the very last subsystem's refcnt
is gone, it will also release the map->refcnt. All bpf_prog will be
freed when the map->refcnt reaches 0 (i.e. during map_free()).
Here is how the bpftool map command will look like:
[root@arch-fb-vm1 bpf]# bpftool map show
6: struct_ops name dctcp flags 0x0
key 4B value 256B max_entries 1 memlock 4096B
btf_id 6
[root@arch-fb-vm1 bpf]# bpftool map dump id 6
[{
"value": {
"refcnt": {
"refs": {
"counter": 1
}
},
"state": 1,
"data": {
"list": {
"next": 0,
"prev": 0
},
"key": 0,
"flags": 2,
"init": 24,
"release": 0,
"ssthresh": 25,
"cong_avoid": 30,
"set_state": 27,
"cwnd_event": 28,
"in_ack_event": 26,
"undo_cwnd": 29,
"pkts_acked": 0,
"min_tso_segs": 0,
"sndbuf_expand": 0,
"cong_control": 0,
"get_info": 0,
"name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0
],
"owner": 0
}
}
}
]
Misc Notes:
* bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup.
It does an inplace update on "*value" instead returning a pointer
to syscall.c. Otherwise, it needs a separate copy of "zero" value
for the BPF_STRUCT_OPS_STATE_INIT to avoid races.
* The bpf_struct_ops_map_delete_elem() is also called without
preempt_disable() from map_delete_elem(). It is because
the "->unreg()" may requires sleepable context, e.g.
the "tcp_unregister_congestion_control()".
* "const" is added to some of the existing "struct btf_func_model *"
function arg to avoid a compiler warning caused by this patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com
2020-01-09 08:35:05 +08:00
|
|
|
&tr->func.model, flags,
|
2019-11-15 02:57:04 +08:00
|
|
|
fentry, fentry_cnt,
|
|
|
|
fexit, fexit_cnt,
|
|
|
|
tr->func.addr);
|
bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS
The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value
is a kernel struct with its func ptr implemented in bpf prog.
This new map is the interface to register/unregister/introspect
a bpf implemented kernel struct.
The kernel struct is actually embedded inside another new struct
(or called the "value" struct in the code). For example,
"struct tcp_congestion_ops" is embbeded in:
struct bpf_struct_ops_tcp_congestion_ops {
refcount_t refcnt;
enum bpf_struct_ops_state state;
struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */
}
The map value is "struct bpf_struct_ops_tcp_congestion_ops".
The "bpftool map dump" will then be able to show the
state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g.
number of tcp_sock in the tcp_congestion_ops case). This "value" struct
is created automatically by a macro. Having a separate "value" struct
will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding
"void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some
initialization works before registering the struct_ops to the kernel
subsystem). The libbpf will take care of finding and populating the
"struct bpf_struct_ops_XYZ" from "struct XYZ".
Register a struct_ops to a kernel subsystem:
1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s)
2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id
set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the
running kernel.
Instead of reusing the attr->btf_value_type_id,
btf_vmlinux_value_type_id s added such that attr->btf_fd can still be
used as the "user" btf which could store other useful sysadmin/debug
info that may be introduced in the furture,
e.g. creation-date/compiler-details/map-creator...etc.
3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described
in the running kernel btf. Populate the value of this object.
The function ptr should be populated with the prog fds.
4. Call BPF_MAP_UPDATE with the object created in (3) as
the map value. The key is always "0".
During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's
args as an array of u64 is generated. BPF_MAP_UPDATE also allows
the specific struct_ops to do some final checks in "st_ops->init_member()"
(e.g. ensure all mandatory func ptrs are implemented).
If everything looks good, it will register this kernel struct
to the kernel subsystem. The map will not allow further update
from this point.
Unregister a struct_ops from the kernel subsystem:
BPF_MAP_DELETE with key "0".
Introspect a struct_ops:
BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will
have the prog _id_ populated as the func ptr.
The map value state (enum bpf_struct_ops_state) will transit from:
INIT (map created) =>
INUSE (map updated, i.e. reg) =>
TOBEFREE (map value deleted, i.e. unreg)
The kernel subsystem needs to call bpf_struct_ops_get() and
bpf_struct_ops_put() to manage the "refcnt" in the
"struct bpf_struct_ops_XYZ". This patch uses a separate refcnt
for the purose of tracking the subsystem usage. Another approach
is to reuse the map->refcnt and then "show" (i.e. during map_lookup)
the subsystem's usage by doing map->refcnt - map->usercnt to filter out
the map-fd/pinned-map usage. However, that will also tie down the
future semantics of map->refcnt and map->usercnt.
The very first subsystem's refcnt (during reg()) holds one
count to map->refcnt. When the very last subsystem's refcnt
is gone, it will also release the map->refcnt. All bpf_prog will be
freed when the map->refcnt reaches 0 (i.e. during map_free()).
Here is how the bpftool map command will look like:
[root@arch-fb-vm1 bpf]# bpftool map show
6: struct_ops name dctcp flags 0x0
key 4B value 256B max_entries 1 memlock 4096B
btf_id 6
[root@arch-fb-vm1 bpf]# bpftool map dump id 6
[{
"value": {
"refcnt": {
"refs": {
"counter": 1
}
},
"state": 1,
"data": {
"list": {
"next": 0,
"prev": 0
},
"key": 0,
"flags": 2,
"init": 24,
"release": 0,
"ssthresh": 25,
"cong_avoid": 30,
"set_state": 27,
"cwnd_event": 28,
"in_ack_event": 26,
"undo_cwnd": 29,
"pkts_acked": 0,
"min_tso_segs": 0,
"sndbuf_expand": 0,
"cong_control": 0,
"get_info": 0,
"name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0
],
"owner": 0
}
}
}
]
Misc Notes:
* bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup.
It does an inplace update on "*value" instead returning a pointer
to syscall.c. Otherwise, it needs a separate copy of "zero" value
for the BPF_STRUCT_OPS_STATE_INIT to avoid races.
* The bpf_struct_ops_map_delete_elem() is also called without
preempt_disable() from map_delete_elem(). It is because
the "->unreg()" may requires sleepable context, e.g.
the "tcp_unregister_congestion_control()".
* "const" is added to some of the existing "struct btf_func_model *"
function arg to avoid a compiler warning caused by this patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com
2020-01-09 08:35:05 +08:00
|
|
|
if (err < 0)
|
2019-11-15 02:57:04 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (tr->selector)
|
|
|
|
/* progs already running at this address */
|
2019-12-09 08:01:13 +08:00
|
|
|
err = modify_fentry(tr, old_image, new_image);
|
2019-11-15 02:57:04 +08:00
|
|
|
else
|
|
|
|
/* first time registering */
|
2019-12-09 08:01:13 +08:00
|
|
|
err = register_fentry(tr, new_image);
|
2019-11-15 02:57:04 +08:00
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
tr->selector++;
|
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
|
|
|
|
{
|
|
|
|
switch (t) {
|
|
|
|
case BPF_TRACE_FENTRY:
|
|
|
|
return BPF_TRAMP_FENTRY;
|
2020-01-21 08:53:46 +08:00
|
|
|
case BPF_TRACE_FEXIT:
|
2019-11-15 02:57:04 +08:00
|
|
|
return BPF_TRAMP_FEXIT;
|
2020-01-21 08:53:46 +08:00
|
|
|
default:
|
|
|
|
return BPF_TRAMP_REPLACE;
|
2019-11-15 02:57:04 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int bpf_trampoline_link_prog(struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
enum bpf_tramp_prog_type kind;
|
|
|
|
struct bpf_trampoline *tr;
|
|
|
|
int err = 0;
|
2020-01-21 08:53:46 +08:00
|
|
|
int cnt;
|
2019-11-15 02:57:04 +08:00
|
|
|
|
|
|
|
tr = prog->aux->trampoline;
|
|
|
|
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
|
|
|
|
mutex_lock(&tr->mutex);
|
2020-01-21 08:53:46 +08:00
|
|
|
if (tr->extension_prog) {
|
|
|
|
/* cannot attach fentry/fexit if extension prog is attached.
|
|
|
|
* cannot overwrite extension prog either.
|
|
|
|
*/
|
|
|
|
err = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
|
|
|
|
if (kind == BPF_TRAMP_REPLACE) {
|
|
|
|
/* Cannot attach extension if fentry/fexit are in use. */
|
|
|
|
if (cnt) {
|
|
|
|
err = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
tr->extension_prog = prog;
|
|
|
|
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
|
|
|
|
prog->bpf_func);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (cnt >= BPF_MAX_TRAMP_PROGS) {
|
2019-11-15 02:57:04 +08:00
|
|
|
err = -E2BIG;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
|
|
|
|
/* prog already linked */
|
|
|
|
err = -EBUSY;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
|
|
|
|
tr->progs_cnt[kind]++;
|
|
|
|
err = bpf_trampoline_update(prog->aux->trampoline);
|
|
|
|
if (err) {
|
|
|
|
hlist_del(&prog->aux->tramp_hlist);
|
|
|
|
tr->progs_cnt[kind]--;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
mutex_unlock(&tr->mutex);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* bpf_trampoline_unlink_prog() should never fail. */
|
|
|
|
int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
enum bpf_tramp_prog_type kind;
|
|
|
|
struct bpf_trampoline *tr;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
tr = prog->aux->trampoline;
|
|
|
|
kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
|
|
|
|
mutex_lock(&tr->mutex);
|
2020-01-21 08:53:46 +08:00
|
|
|
if (kind == BPF_TRAMP_REPLACE) {
|
|
|
|
WARN_ON_ONCE(!tr->extension_prog);
|
|
|
|
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
|
|
|
|
tr->extension_prog->bpf_func, NULL);
|
|
|
|
tr->extension_prog = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
2019-11-15 02:57:04 +08:00
|
|
|
hlist_del(&prog->aux->tramp_hlist);
|
|
|
|
tr->progs_cnt[kind]--;
|
|
|
|
err = bpf_trampoline_update(prog->aux->trampoline);
|
2020-01-21 08:53:46 +08:00
|
|
|
out:
|
2019-11-15 02:57:04 +08:00
|
|
|
mutex_unlock(&tr->mutex);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
void bpf_trampoline_put(struct bpf_trampoline *tr)
|
|
|
|
{
|
2020-01-24 00:15:07 +08:00
|
|
|
struct bpf_image *image;
|
|
|
|
|
2019-11-15 02:57:04 +08:00
|
|
|
if (!tr)
|
|
|
|
return;
|
|
|
|
mutex_lock(&trampoline_mutex);
|
|
|
|
if (!refcount_dec_and_test(&tr->refcnt))
|
|
|
|
goto out;
|
|
|
|
WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
|
|
|
|
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
|
|
|
|
goto out;
|
|
|
|
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
|
|
|
|
goto out;
|
2020-01-24 00:15:07 +08:00
|
|
|
image = container_of(tr->image, struct bpf_image, data);
|
|
|
|
latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops);
|
2020-01-21 11:22:31 +08:00
|
|
|
/* wait for tasks to get out of trampoline before freeing it */
|
|
|
|
synchronize_rcu_tasks();
|
2020-01-24 00:15:07 +08:00
|
|
|
bpf_jit_free_exec(image);
|
2019-11-15 02:57:04 +08:00
|
|
|
hlist_del(&tr->hlist);
|
|
|
|
kfree(tr);
|
|
|
|
out:
|
|
|
|
mutex_unlock(&trampoline_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
|
|
|
|
* are needed for trampoline. The macro is split into
|
|
|
|
* call _bpf_prog_enter
|
|
|
|
* call prog->bpf_func
|
|
|
|
* call __bpf_prog_exit
|
|
|
|
*/
|
|
|
|
u64 notrace __bpf_prog_enter(void)
|
|
|
|
{
|
|
|
|
u64 start = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
preempt_disable();
|
|
|
|
if (static_branch_unlikely(&bpf_stats_enabled_key))
|
|
|
|
start = sched_clock();
|
|
|
|
return start;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Counterpart of __bpf_prog_enter().
 *
 * @prog:  program that just ran; its per-cpu stats are updated here
 * @start: value returned by __bpf_prog_enter()
 *
 * When stats are enabled and @start is non-zero, the run counter is bumped
 * and the elapsed sched_clock() time is added. Preemption is then
 * re-enabled and the RCU read lock released, in that order.
 */
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
{
	/* static_key could be enabled in __bpf_prog_enter
	 * and disabled in __bpf_prog_exit.
	 * And vice versa.
	 * Hence check that 'start' is not zero.
	 */
	if (static_branch_unlikely(&bpf_stats_enabled_key) && start) {
		struct bpf_prog_stats *stats = this_cpu_ptr(prog->aux->stats);

		u64_stats_update_begin(&stats->syncp);
		stats->cnt++;
		stats->nsecs += sched_clock() - start;
		u64_stats_update_end(&stats->syncp);
	}

	preempt_enable();
	rcu_read_unlock();
}
|
|
|
|
|
|
|
|
int __weak
|
bpf: Introduce BPF_MAP_TYPE_STRUCT_OPS
The patch introduces BPF_MAP_TYPE_STRUCT_OPS. The map value
is a kernel struct with its func ptr implemented in bpf prog.
This new map is the interface to register/unregister/introspect
a bpf implemented kernel struct.
The kernel struct is actually embedded inside another new struct
(or called the "value" struct in the code). For example,
"struct tcp_congestion_ops" is embbeded in:
struct bpf_struct_ops_tcp_congestion_ops {
refcount_t refcnt;
enum bpf_struct_ops_state state;
struct tcp_congestion_ops data; /* <-- kernel subsystem struct here */
}
The map value is "struct bpf_struct_ops_tcp_congestion_ops".
The "bpftool map dump" will then be able to show the
state ("inuse"/"tobefree") and the number of subsystem's refcnt (e.g.
number of tcp_sock in the tcp_congestion_ops case). This "value" struct
is created automatically by a macro. Having a separate "value" struct
will also make extending "struct bpf_struct_ops_XYZ" easier (e.g. adding
"void (*init)(void)" to "struct bpf_struct_ops_XYZ" to do some
initialization works before registering the struct_ops to the kernel
subsystem). The libbpf will take care of finding and populating the
"struct bpf_struct_ops_XYZ" from "struct XYZ".
Register a struct_ops to a kernel subsystem:
1. Load all needed BPF_PROG_TYPE_STRUCT_OPS prog(s)
2. Create a BPF_MAP_TYPE_STRUCT_OPS with attr->btf_vmlinux_value_type_id
set to the btf id "struct bpf_struct_ops_tcp_congestion_ops" of the
running kernel.
Instead of reusing the attr->btf_value_type_id,
btf_vmlinux_value_type_id s added such that attr->btf_fd can still be
used as the "user" btf which could store other useful sysadmin/debug
info that may be introduced in the furture,
e.g. creation-date/compiler-details/map-creator...etc.
3. Create a "struct bpf_struct_ops_tcp_congestion_ops" object as described
in the running kernel btf. Populate the value of this object.
The function ptr should be populated with the prog fds.
4. Call BPF_MAP_UPDATE with the object created in (3) as
the map value. The key is always "0".
During BPF_MAP_UPDATE, the code that saves the kernel-func-ptr's
args as an array of u64 is generated. BPF_MAP_UPDATE also allows
the specific struct_ops to do some final checks in "st_ops->init_member()"
(e.g. ensure all mandatory func ptrs are implemented).
If everything looks good, it will register this kernel struct
to the kernel subsystem. The map will not allow further update
from this point.
Unregister a struct_ops from the kernel subsystem:
BPF_MAP_DELETE with key "0".
Introspect a struct_ops:
BPF_MAP_LOOKUP_ELEM with key "0". The map value returned will
have the prog _id_ populated as the func ptr.
The map value state (enum bpf_struct_ops_state) will transit from:
INIT (map created) =>
INUSE (map updated, i.e. reg) =>
TOBEFREE (map value deleted, i.e. unreg)
The kernel subsystem needs to call bpf_struct_ops_get() and
bpf_struct_ops_put() to manage the "refcnt" in the
"struct bpf_struct_ops_XYZ". This patch uses a separate refcnt
for the purose of tracking the subsystem usage. Another approach
is to reuse the map->refcnt and then "show" (i.e. during map_lookup)
the subsystem's usage by doing map->refcnt - map->usercnt to filter out
the map-fd/pinned-map usage. However, that will also tie down the
future semantics of map->refcnt and map->usercnt.
The very first subsystem's refcnt (during reg()) holds one
count to map->refcnt. When the very last subsystem's refcnt
is gone, it will also release the map->refcnt. All bpf_prog will be
freed when the map->refcnt reaches 0 (i.e. during map_free()).
Here is how the bpftool map command will look like:
[root@arch-fb-vm1 bpf]# bpftool map show
6: struct_ops name dctcp flags 0x0
key 4B value 256B max_entries 1 memlock 4096B
btf_id 6
[root@arch-fb-vm1 bpf]# bpftool map dump id 6
[{
"value": {
"refcnt": {
"refs": {
"counter": 1
}
},
"state": 1,
"data": {
"list": {
"next": 0,
"prev": 0
},
"key": 0,
"flags": 2,
"init": 24,
"release": 0,
"ssthresh": 25,
"cong_avoid": 30,
"set_state": 27,
"cwnd_event": 28,
"in_ack_event": 26,
"undo_cwnd": 29,
"pkts_acked": 0,
"min_tso_segs": 0,
"sndbuf_expand": 0,
"cong_control": 0,
"get_info": 0,
"name": [98,112,102,95,100,99,116,99,112,0,0,0,0,0,0,0
],
"owner": 0
}
}
}
]
Misc Notes:
* bpf_struct_ops_map_sys_lookup_elem() is added for syscall lookup.
It does an inplace update on "*value" instead returning a pointer
to syscall.c. Otherwise, it needs a separate copy of "zero" value
for the BPF_STRUCT_OPS_STATE_INIT to avoid races.
* The bpf_struct_ops_map_delete_elem() is also called without
preempt_disable() from map_delete_elem(). It is because
the "->unreg()" may requires sleepable context, e.g.
the "tcp_unregister_congestion_control()".
* "const" is added to some of the existing "struct btf_func_model *"
function arg to avoid a compiler warning caused by this patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200109003505.3855919-1-kafai@fb.com
2020-01-09 08:35:05 +08:00
|
|
|
arch_prepare_bpf_trampoline(void *image, void *image_end,
|
|
|
|
const struct btf_func_model *m, u32 flags,
|
2019-11-15 02:57:04 +08:00
|
|
|
struct bpf_prog **fentry_progs, int fentry_cnt,
|
|
|
|
struct bpf_prog **fexit_progs, int fexit_cnt,
|
|
|
|
void *orig_call)
|
|
|
|
{
|
|
|
|
return -ENOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Initialise every bucket of trampoline_table. Registered as a late
 * initcall; always returns 0.
 */
static int __init init_trampolines(void)
{
	struct hlist_head *bucket = trampoline_table;
	struct hlist_head *end = trampoline_table + TRAMPOLINE_TABLE_SIZE;

	for (; bucket < end; bucket++)
		INIT_HLIST_HEAD(bucket);

	return 0;
}
late_initcall(init_trampolines);
|