Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2021-07-15

The following pull-request contains BPF updates for your *net-next* tree.

We've added 45 non-merge commits during the last 15 day(s) which contain
a total of 52 files changed, 3122 insertions(+), 384 deletions(-).

The main changes are:

1) Introduce bpf timers, from Alexei.

2) Add sockmap support for unix datagram socket, from Cong.

3) Fix potential memleak and UAF in the verifier, from He.

4) Add bpf_get_func_ip helper, from Jiri.

5) Improvements to generic XDP mode, from Kumar.

6) Support for passing xdp_md to XDP programs in bpf_prog_run, from Zvi.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 2021-07-15 22:40:10 -07:00
commit 82a1ffe57e
52 changed files with 3127 additions and 389 deletions

View File

@ -10387,6 +10387,7 @@ F: net/core/skmsg.c
F: net/core/sock_map.c
F: net/ipv4/tcp_bpf.c
F: net/ipv4/udp_bpf.c
F: net/unix/unix_bpf.c
LANDLOCK SECURITY MODULE
M: Mickaël Salaün <mic@digikod.net>

View File

@ -1954,6 +1954,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (flags & BPF_TRAMP_F_CALL_ORIG)
stack_size += 8; /* room for return value of orig_call */
if (flags & BPF_TRAMP_F_IP_ARG)
stack_size += 8; /* room for IP address argument */
if (flags & BPF_TRAMP_F_SKIP_FRAME)
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
@ -1967,6 +1970,22 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
if (flags & BPF_TRAMP_F_IP_ARG) {
/* Store IP address of the traced function:
* mov rax, QWORD PTR [rbp + 8]
* sub rax, X86_PATCH_SIZE
* mov QWORD PTR [rbp - stack_size], rax
*/
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size);
/* Continue with stack_size for regs storage, stack will
* be correctly restored with 'leave' instruction.
*/
stack_size -= 8;
}
save_regs(m, &prog, nr_args, stack_size);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
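The BPF_TRAMP_F_IP_ARG path above stashes the traced function's address on the trampoline stack so the new bpf_get_func_ip() helper can return it. A minimal sketch of a tracing program that consumes it, assuming a selftests-style libbpf setup that already knows the helper; the attach target and variable names are illustrative:

/* Sketch only, not part of this series' selftests. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

__u64 traced_ip;	/* read back by user space */

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(record_ip, int a)
{
	/* With BPF_TRAMP_F_IP_ARG set for this trampoline, the helper
	 * returns the address of bpf_fentry_test1 itself.
	 */
	traced_ip = bpf_get_func_ip(ctx);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";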

View File

@ -4,6 +4,7 @@
#include <asm/types.h>
#include <linux/bits.h>
#include <linux/typecheck.h>
#include <uapi/linux/kernel.h>
@ -253,6 +254,55 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
__clear_bit(nr, addr);
}
/**
* __ptr_set_bit - Set bit in a pointer's value
* @nr: the bit to set
* @addr: the address of the pointer variable
*
* Example:
* void *p = foo();
* __ptr_set_bit(bit, &p);
*/
#define __ptr_set_bit(nr, addr) \
({ \
typecheck_pointer(*(addr)); \
__set_bit(nr, (unsigned long *)(addr)); \
})
/**
* __ptr_clear_bit - Clear bit in a pointer's value
* @nr: the bit to clear
* @addr: the address of the pointer variable
*
* Example:
* void *p = foo();
* __ptr_clear_bit(bit, &p);
*/
#define __ptr_clear_bit(nr, addr) \
({ \
typecheck_pointer(*(addr)); \
__clear_bit(nr, (unsigned long *)(addr)); \
})
/**
* __ptr_test_bit - Test bit in a pointer's value
* @nr: the bit to test
* @addr: the address of the pointer variable
*
* Example:
* void *p = foo();
* if (__ptr_test_bit(bit, &p)) {
* ...
* } else {
* ...
* }
*/
#define __ptr_test_bit(nr, addr) \
({ \
typecheck_pointer(*(addr)); \
test_bit(nr, (unsigned long *)(addr)); \
})
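These three macros are used later in this series to tag sk_buff pointers before mixing them with xdp_frame pointers in the cpumap ptr_ring. A standalone sketch of that tagging pattern, assuming the stored objects are at least 2-byte aligned so bit 0 is free; handle_xdp_frame() is a hypothetical handler:

#include <linux/bitops.h>
#include <linux/skbuff.h>

void handle_xdp_frame(void *frame);	/* hypothetical handler for untagged entries */

static void ring_store(void **slot, struct sk_buff *skb)
{
	__ptr_set_bit(0, &skb);		/* mark "this entry is an skb" */
	*slot = skb;
}

static void ring_consume(void *entry)
{
	if (__ptr_test_bit(0, &entry)) {
		struct sk_buff *skb = entry;

		__ptr_clear_bit(0, &skb);	/* recover the untagged pointer */
		consume_skb(skb);
	} else {
		handle_xdp_frame(entry);
	}
}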
#ifdef __KERNEL__
#ifndef set_mask_bits

View File

@ -168,6 +168,7 @@ struct bpf_map {
u32 max_entries;
u32 map_flags;
int spin_lock_off; /* >=0 valid offset, <0 error */
int timer_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
@ -197,30 +198,53 @@ static inline bool map_value_has_spin_lock(const struct bpf_map *map)
return map->spin_lock_off >= 0;
}
static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
static inline bool map_value_has_timer(const struct bpf_map *map)
{
if (likely(!map_value_has_spin_lock(map)))
return;
*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
(struct bpf_spin_lock){};
return map->timer_off >= 0;
}
/* copy everything but bpf_spin_lock */
static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
{
if (unlikely(map_value_has_spin_lock(map)))
*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
(struct bpf_spin_lock){};
if (unlikely(map_value_has_timer(map)))
*(struct bpf_timer *)(dst + map->timer_off) =
(struct bpf_timer){};
}
/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
if (unlikely(map_value_has_spin_lock(map))) {
u32 off = map->spin_lock_off;
u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0;
memcpy(dst, src, off);
memcpy(dst + off + sizeof(struct bpf_spin_lock),
src + off + sizeof(struct bpf_spin_lock),
map->value_size - off - sizeof(struct bpf_spin_lock));
if (unlikely(map_value_has_spin_lock(map))) {
s_off = map->spin_lock_off;
s_sz = sizeof(struct bpf_spin_lock);
} else if (unlikely(map_value_has_timer(map))) {
t_off = map->timer_off;
t_sz = sizeof(struct bpf_timer);
}
if (unlikely(s_sz || t_sz)) {
if (s_off < t_off || !s_sz) {
swap(s_off, t_off);
swap(s_sz, t_sz);
}
memcpy(dst, src, t_off);
memcpy(dst + t_off + t_sz,
src + t_off + t_sz,
s_off - t_off - t_sz);
memcpy(dst + s_off + s_sz,
src + s_off + s_sz,
map->value_size - s_off - s_sz);
} else {
memcpy(dst, src, map->value_size);
}
}
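To make the offset arithmetic above concrete, consider a hypothetical map value with a timer in the middle; only the ranges around the special field are copied, while the bpf_timer bytes in dst keep whatever check_and_init_map_value() put there:

struct example_val {
	__u64 counter;		/* bytes  0.. 7: copied             */
	struct bpf_timer t;	/* bytes  8..23: skipped (16 bytes) */
	__u64 payload;		/* bytes 24..31: copied             */
};

/* For this layout copy_map_value() degenerates to:
 *	memcpy(dst,      src,      8);	// 'counter'
 *	memcpy(dst + 24, src + 24, 8);	// 'payload'
 */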
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
bool lock_src);
void bpf_timer_cancel_and_free(void *timer);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
@ -314,6 +338,7 @@ enum bpf_arg_type {
ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */
ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
__BPF_ARG_TYPE_MAX,
};
@ -554,6 +579,11 @@ struct btf_func_model {
*/
#define BPF_TRAMP_F_SKIP_FRAME BIT(2)
/* Store IP address of the caller on the trampoline stack,
* so it's available for trampoline's programs.
*/
#define BPF_TRAMP_F_IP_ARG BIT(3)
/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
* bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2
*/
@ -1509,12 +1539,12 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
struct bpf_prog *xdp_prog, struct bpf_map *map,
bool exclude_ingress);
bool dev_map_can_have_prog(struct bpf_map *map);
void __cpu_map_flush(void);
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx);
bool cpu_map_prog_allowed(struct bpf_map *map);
int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
struct sk_buff *skb);
/* Return map's numa specified by userspace */
static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
@ -1711,6 +1741,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
return 0;
}
static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
struct sk_buff *skb)
{
return -EOPNOTSUPP;
}
static inline bool cpu_map_prog_allowed(struct bpf_map *map)
{
return false;
@ -1852,6 +1888,12 @@ void bpf_map_offload_map_free(struct bpf_map *map);
int bpf_prog_test_run_syscall(struct bpf_prog *prog,
const union bpf_attr *kattr,
union bpf_attr __user *uattr);
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
void sock_map_unhash(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
#else
static inline int bpf_prog_offload_init(struct bpf_prog *prog,
union bpf_attr *attr)
@ -1884,24 +1926,6 @@ static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog,
{
return -ENOTSUPP;
}
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
void sock_map_unhash(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
void bpf_sk_reuseport_detach(struct sock *sk);
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
void *value);
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags);
#else
static inline void bpf_sk_reuseport_detach(struct sock *sk)
{
}
#ifdef CONFIG_BPF_SYSCALL
static inline int sock_map_get_from_fd(const union bpf_attr *attr,
@ -1921,7 +1945,21 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void
{
return -EOPNOTSUPP;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
void bpf_sk_reuseport_detach(struct sock *sk);
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
void *value);
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags);
#else
static inline void bpf_sk_reuseport_detach(struct sock *sk)
{
}
#ifdef CONFIG_BPF_SYSCALL
static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
void *key, void *value)
{

View File

@ -53,7 +53,14 @@ struct bpf_reg_state {
/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;
struct {
struct bpf_map *map_ptr;
/* To distinguish map lookups from outer map
* the map_uid is non-zero for registers
* pointing to inner maps.
*/
u32 map_uid;
};
/* for PTR_TO_BTF_ID */
struct {
@ -201,12 +208,19 @@ struct bpf_func_state {
* zero == main subprog
*/
u32 subprogno;
/* Every bpf_timer_start will increment async_entry_cnt.
* It's used to distinguish:
* void foo(void) { for(;;); }
* void foo(void) { bpf_timer_set_callback(,foo); }
*/
u32 async_entry_cnt;
bool in_callback_fn;
bool in_async_callback_fn;
/* The following fields should be last. See copy_func_state() */
int acquired_refs;
struct bpf_reference_state *refs;
int allocated_stack;
bool in_callback_fn;
struct bpf_stack_state *stack;
};
@ -392,6 +406,7 @@ struct bpf_subprog_info {
bool has_tail_call;
bool tail_call_reachable;
bool has_ld_abs;
bool is_async_cb;
};
/* single container for all structs

View File

@ -99,6 +99,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
const struct btf_member *m,
u32 expected_offset, u32 expected_size);
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
int btf_find_timer(const struct btf *btf, const struct btf_type *t);
bool btf_type_is_void(const struct btf_type *t);
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,

View File

@ -559,7 +559,8 @@ struct bpf_prog {
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1; /* Do we call get_func_ip() */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */

View File

@ -3984,6 +3984,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog);
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);

View File

@ -863,8 +863,8 @@ struct sk_buff {
__u8 tc_skip_classify:1;
__u8 tc_at_ingress:1;
#endif
#ifdef CONFIG_NET_REDIRECT
__u8 redirected:1;
#ifdef CONFIG_NET_REDIRECT
__u8 from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
@ -4664,17 +4664,13 @@ static inline __wsum lco_csum(struct sk_buff *skb)
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_REDIRECT
return skb->redirected;
#else
return false;
#endif
}
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
#ifdef CONFIG_NET_REDIRECT
skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
skb->from_ingress = from_ingress;
if (skb->from_ingress)
skb->tstamp = 0;
@ -4683,9 +4679,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
static inline void skb_reset_redirect(struct sk_buff *skb)
{
#ifdef CONFIG_NET_REDIRECT
skb->redirected = 0;
#endif
}
static inline bool skb_csum_is_sctp(struct sk_buff *skb)

View File

@ -22,4 +22,13 @@
(void)__tmp; \
})
/*
* Check at compile time that something is a pointer type.
*/
#define typecheck_pointer(x) \
({ typeof(x) __dummy; \
(void)sizeof(*__dummy); \
1; \
})
#endif /* TYPECHECK_H_INCLUDED */
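What the new typecheck_pointer() buys: the sizeof(*x) expansion only compiles when the argument can be dereferenced, so passing a plain integer fails at build time. An illustrative compile-time demo (not real kernel code):

#include <linux/typecheck.h>

static void typecheck_demo(void)
{
	void *p = NULL;
	unsigned long cookie = 0;

	typecheck_pointer(p);	/* compiles: p is a pointer */
	/* typecheck_pointer(cookie); would break the build, because
	 * sizeof(*cookie) is not a valid expression for an integer.
	 */
	(void)p;
	(void)cookie;
}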

View File

@ -82,6 +82,8 @@ static inline struct unix_sock *unix_sk(const struct sock *sk)
long unix_inq_len(struct sock *sk);
long unix_outq_len(struct sock *sk);
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
int flags);
#ifdef CONFIG_SYSCTL
int unix_sysctl_register(struct net *net);
void unix_sysctl_unregister(struct net *net);
@ -89,4 +91,14 @@ void unix_sysctl_unregister(struct net *net);
static inline int unix_sysctl_register(struct net *net) { return 0; }
static inline void unix_sysctl_unregister(struct net *net) {}
#endif
#ifdef CONFIG_BPF_SYSCALL
extern struct proto unix_proto;
int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
void __init unix_bpf_build_proto(void);
#else
static inline void __init unix_bpf_build_proto(void)
{}
#endif
#endif
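With unix_bpf_update_proto() declared here and implemented in net/unix/unix_bpf.c, AF_UNIX datagram sockets can be inserted into a sockmap from user space. A minimal user-space sketch, assuming map_fd refers to an already-created BPF_MAP_TYPE_SOCKMAP; error paths are trimmed and the function name is made up:

#include <sys/socket.h>
#include <bpf/bpf.h>

int add_unix_dgram_pair(int map_fd)
{
	int sv[2], key = 0;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
		return -1;
	/* The kernel now accepts connected unix datagram sockets as
	 * sockmap values, enabling sk_msg redirection between them.
	 */
	return bpf_map_update_elem(map_fd, &key, &sv[0], BPF_ANY);
}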

View File

@ -276,6 +276,11 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp)
return unlikely(xdp->data_meta > xdp->data);
}
static inline bool xdp_metalen_invalid(unsigned long metalen)
{
return (metalen & (sizeof(__u32) - 1)) || (metalen > 32);
}
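xdp_metalen_invalid() centralizes the metadata sanity check (a multiple of 4 bytes, at most 32) so bpf_xdp_adjust_meta() and the generic XDP path touched by this series can share the same limits. A small BPF-side sketch of a size that passes the check; the program and section names are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_reserve_meta(struct xdp_md *ctx)
{
	/* 8 bytes of metadata: a multiple of 4 and <= 32, so
	 * xdp_metalen_invalid() returns false for this length.
	 */
	if (bpf_xdp_adjust_meta(ctx, -(int)(2 * sizeof(__u32))))
		return XDP_ABORTED;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";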
struct xdp_attachment_info {
struct bpf_prog *prog;
u32 flags;

View File

@ -324,9 +324,6 @@ union bpf_iter_link_info {
* **BPF_PROG_TYPE_SK_LOOKUP**
* *data_in* and *data_out* must be NULL.
*
* **BPF_PROG_TYPE_XDP**
* *ctx_in* and *ctx_out* must be NULL.
*
* **BPF_PROG_TYPE_RAW_TRACEPOINT**,
* **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE**
*
@ -3249,7 +3246,7 @@ union bpf_attr {
* long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
* Description
* Select a **SO_REUSEPORT** socket from a
* **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
* **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
* It checks the selected socket is matching the incoming
* request in the socket buffer.
* Return
@ -4780,6 +4777,76 @@ union bpf_attr {
* Execute close syscall for given FD.
* Return
* A syscall result.
*
* long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
* Description
* Initialize the timer.
* First 4 bits of *flags* specify clockid.
* Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
* All other bits of *flags* are reserved.
* The verifier will reject the program if *timer* is not from
* the same *map*.
* Return
* 0 on success.
* **-EBUSY** if *timer* is already initialized.
* **-EINVAL** if invalid *flags* are passed.
* **-EPERM** if *timer* is in a map that doesn't have any user references.
* The user space should either hold a file descriptor to a map with timers
* or pin such map in bpffs. When map is unpinned or file descriptor is
* closed all timers in the map will be cancelled and freed.
*
* long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
* Description
* Configure the timer to call *callback_fn* static function.
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
* **-EPERM** if *timer* is in a map that doesn't have any user references.
* The user space should either hold a file descriptor to a map with timers
* or pin such map in bpffs. When map is unpinned or file descriptor is
* closed all timers in the map will be cancelled and freed.
*
* long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
* Description
* Set timer expiration N nanoseconds from the current time. The
* configured callback will be invoked in soft irq context on some cpu
* and will not repeat unless another bpf_timer_start() is made.
* In such case the next invocation can migrate to a different cpu.
* Since struct bpf_timer is a field inside map element the map
* owns the timer. The bpf_timer_set_callback() will increment refcnt
* of BPF program to make sure that callback_fn code stays valid.
* When user space reference to a map reaches zero all timers
* in a map are cancelled and corresponding program's refcnts are
* decremented. This is done to make sure that Ctrl-C of a user
* process doesn't leave any timers running. If map is pinned in
* bpffs the callback_fn can re-arm itself indefinitely.
* bpf_map_update/delete_elem() helpers and user space sys_bpf commands
* cancel and free the timer in the given map element.
* The map can contain timers that invoke callback_fn-s from different
* programs. The same callback_fn can serve different timers from
* different maps if key/value layout matches across maps.
* Every bpf_timer_set_callback() can have different callback_fn.
*
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
* or invalid *flags* are passed.
*
* long bpf_timer_cancel(struct bpf_timer *timer)
* Description
* Cancel the timer and wait for callback_fn to finish if it was running.
* Return
* 0 if the timer was not active.
* 1 if the timer was active.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
* **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
* own timer which would have led to a deadlock otherwise.
*
* u64 bpf_get_func_ip(void *ctx)
* Description
* Get address of the traced function (for tracing and kprobe programs).
* Return
* Address of the traced function.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -4951,6 +5018,11 @@ union bpf_attr {
FN(sys_bpf), \
FN(btf_find_by_name_kind), \
FN(sys_close), \
FN(timer_init), \
FN(timer_set_callback), \
FN(timer_start), \
FN(timer_cancel), \
FN(get_func_ip), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@ -6077,6 +6149,11 @@ struct bpf_spin_lock {
__u32 val;
};
struct bpf_timer {
__u64 :64;
__u64 :64;
} __attribute__((aligned(8)));
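Putting the four helper descriptions above together, here is a minimal sketch of the intended BPF-program usage with a timer embedded in an array map value. The map layout, attach point and the 100 ms period are illustrative, and error handling is trimmed:

#include <linux/bpf.h>
#include <time.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct elem {
	struct bpf_timer t;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} timer_map SEC(".maps");

/* Callback shape checked by the verifier: (map, key, value), must return 0. */
static int timer_cb(void *map, int *key, struct elem *val)
{
	/* re-arm so the timer keeps firing every 100 ms */
	bpf_timer_start(&val->t, 100 * 1000 * 1000, 0);
	return 0;
}

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(arm_timer)
{
	struct elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&timer_map, &key);
	if (!val)
		return 0;
	bpf_timer_init(&val->t, &timer_map, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&val->t, timer_cb);
	bpf_timer_start(&val->t, 100 * 1000 * 1000 /* 100 ms */, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";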
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.

View File

@ -29,7 +29,7 @@ config BPF_SYSCALL
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if INET
select NET_SOCK_MSG if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs

View File

@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
{
if (unlikely(map_value_has_timer(&arr->map)))
bpf_timer_cancel_and_free(val + arr->map.timer_off);
}
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
check_and_free_timer_in_array(array, val);
}
return 0;
}
@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
static void array_map_free_timers(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
if (likely(!map_value_has_timer(map)))
return;
for (i = 0; i < array->map.max_entries; i++)
bpf_timer_cancel_and_free(array->value + array->elem_size * i +
map->timer_off);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,

View File

@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
/* find 'struct bpf_spin_lock' in map value.
* return >= 0 offset if found
* and < 0 in case of error
*/
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
const char *name, int sz, int align)
{
const struct btf_member *member;
u32 i, off = -ENOENT;
if (!__btf_type_is_struct(t))
return -EINVAL;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
if (!__btf_type_is_struct(member_type))
continue;
if (member_type->size != sizeof(struct bpf_spin_lock))
if (member_type->size != sz)
continue;
if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
"bpf_spin_lock"))
if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
if (off != -ENOENT)
/* only one 'struct bpf_spin_lock' is allowed */
/* only one such field is allowed */
return -E2BIG;
off = btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
if (off % __alignof__(struct bpf_spin_lock))
/* valid struct bpf_spin_lock will be 4 byte aligned */
if (off % align)
return -EINVAL;
}
return off;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
const char *name, int sz, int align)
{
const struct btf_var_secinfo *vsi;
u32 i, off = -ENOENT;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
if (!__btf_type_is_struct(var_type))
continue;
if (var_type->size != sz)
continue;
if (vsi->size != sz)
continue;
if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
continue;
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
off = vsi->offset;
if (off % align)
return -EINVAL;
}
return off;
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
const char *name, int sz, int align)
{
if (__btf_type_is_struct(t))
return btf_find_struct_field(btf, t, name, sz, align);
else if (btf_type_is_datasec(t))
return btf_find_datasec_var(btf, t, name, sz, align);
return -EINVAL;
}
/* find 'struct bpf_spin_lock' in map value.
* return >= 0 offset if found
* and < 0 in case of error
*/
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
{
return btf_find_field(btf, t, "bpf_spin_lock",
sizeof(struct bpf_spin_lock),
__alignof__(struct bpf_spin_lock));
}
int btf_find_timer(const struct btf *btf, const struct btf_type *t)
{
return btf_find_field(btf, t, "bpf_timer",
sizeof(struct bpf_timer),
__alignof__(struct bpf_timer));
}
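For reference, the kind of map value layouts btf_find_field() accepts and rejects when searching for the timer; both structs are hypothetical:

/* Accepted: exactly one bpf_timer, 8-byte aligned; timer_off becomes 8. */
struct good_value {
	__u64 counter;
	struct bpf_timer t;
};

/* Rejected with -E2BIG: more than one bpf_timer in the value. */
struct two_timers {
	struct bpf_timer t1;
	struct bpf_timer t2;
};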
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)

View File

@ -16,6 +16,7 @@
* netstack, and assigning dedicated CPUs for this stage. This
* basically allows for 10G wirespeed pre-filtering via bpf.
*/
#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
@ -168,6 +169,46 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
struct list_head *listp,
struct xdp_cpumap_stats *stats)
{
struct sk_buff *skb, *tmp;
struct xdp_buff xdp;
u32 act;
int err;
list_for_each_entry_safe(skb, tmp, listp, list) {
act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
switch (act) {
case XDP_PASS:
break;
case XDP_REDIRECT:
skb_list_del_init(skb);
err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
rcpu->prog);
if (unlikely(err)) {
kfree_skb(skb);
stats->drop++;
} else {
stats->redirect++;
}
return;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, rcpu->prog, act);
fallthrough;
case XDP_DROP:
skb_list_del_init(skb);
kfree_skb(skb);
stats->drop++;
return;
}
}
}
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
void **frames, int n,
struct xdp_cpumap_stats *stats)
@ -176,11 +217,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff xdp;
int i, nframes = 0;
if (!rcpu->prog)
return n;
rcu_read_lock_bh();
xdp_set_return_frame_no_direct();
xdp.rxq = &rxq;
@ -227,17 +263,37 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
}
if (stats->redirect)
xdp_do_flush_map();
xdp_clear_return_frame_no_direct();
return nframes;
}
#define CPUMAP_BATCH 8
static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
int xdp_n, struct xdp_cpumap_stats *stats,
struct list_head *list)
{
int nframes;
if (!rcpu->prog)
return xdp_n;
rcu_read_lock_bh();
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
if (stats->redirect)
xdp_do_flush();
if (unlikely(!list_empty(list)))
cpu_map_bpf_prog_run_skb(rcpu, list, stats);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
}
#define CPUMAP_BATCH 8
static int cpu_map_kthread_run(void *data)
{
@ -254,9 +310,9 @@ static int cpu_map_kthread_run(void *data)
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
int i, n, m, nframes, xdp_n;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
int i, n, m, nframes;
LIST_HEAD(list);
/* Release CPU reschedule checks */
@ -280,9 +336,20 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
for (i = 0; i < n; i++) {
for (i = 0, xdp_n = 0; i < n; i++) {
void *f = frames[i];
struct page *page = virt_to_page(f);
struct page *page;
if (unlikely(__ptr_test_bit(0, &f))) {
struct sk_buff *skb = f;
__ptr_clear_bit(0, &skb);
list_add_tail(&skb->list, &list);
continue;
}
frames[xdp_n++] = f;
page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
* build_skb_around via page_is_pfmemalloc(), and when
@ -292,7 +359,7 @@ static int cpu_map_kthread_run(void *data)
}
/* Support running another XDP prog on this CPU */
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
if (nframes) {
m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
if (unlikely(m == 0)) {
@ -330,12 +397,6 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
bool cpu_map_prog_allowed(struct bpf_map *map)
{
return map->map_type == BPF_MAP_TYPE_CPUMAP &&
map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
}
static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
{
struct bpf_prog *prog;
@ -701,6 +762,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
struct sk_buff *skb)
{
int ret;
__skb_pull(skb, skb->mac_len);
skb_set_redirected(skb, false);
__ptr_set_bit(0, &skb);
ret = ptr_ring_produce(rcpu->queue, skb);
if (ret < 0)
goto trace;
wake_up_process(rcpu->kthread);
trace:
trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
return ret;
}
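cpu_map_generic_redirect() is where a CPUMAP redirect from an skb-mode (generic) XDP program now lands; the tagged skb pointer is later consumed by the kthread loop above. The BPF program side is the same as for native XDP; a hedged sketch with illustrative names:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(max_entries, 8);
	__type(key, __u32);
	__type(value, struct bpf_cpumap_val);
} cpu_map SEC(".maps");

SEC("xdp")
int redirect_to_cpu0(struct xdp_md *ctx)
{
	/* Steer packets to the CPU configured in slot 0, falling back to
	 * XDP_PASS if the slot is empty. With this series the same program
	 * also works when attached in XDP generic (skb) mode.
	 */
	return bpf_redirect_map(&cpu_map, 0, XDP_PASS);
}

char _license[] SEC("license") = "GPL";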
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);

View File

@ -322,16 +322,6 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
bool dev_map_can_have_prog(struct bpf_map *map)
{
if ((map->map_type == BPF_MAP_TYPE_DEVMAP ||
map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) &&
map->value_size != offsetofend(struct bpf_devmap_val, ifindex))
return true;
return false;
}
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
struct xdp_frame **frames, int n,
struct net_device *dev)
@ -499,6 +489,37 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
return 0;
}
static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
struct xdp_txq_info txq = { .dev = dst->dev };
struct xdp_buff xdp;
u32 act;
if (!dst->xdp_prog)
return XDP_PASS;
__skb_pull(skb, skb->mac_len);
xdp.txq = &txq;
act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
switch (act) {
case XDP_PASS:
__skb_push(skb, skb->mac_len);
break;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dst->dev, dst->xdp_prog, act);
fallthrough;
case XDP_DROP:
kfree_skb(skb);
break;
}
return act;
}
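dev_map_bpf_prog_run_skb() runs the per-entry program that user space attached through struct bpf_devmap_val, now also for generic-XDP redirects. A hedged user-space sketch of populating such an entry; the fds are assumed to exist already and the function name is made up:

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* devmap_fd: BPF_MAP_TYPE_DEVMAP created with
 *            value_size == sizeof(struct bpf_devmap_val)
 * prog_fd:   XDP program loaded with expected_attach_type BPF_XDP_DEVMAP
 */
int add_devmap_entry(int devmap_fd, __u32 key, int ifindex, int prog_fd)
{
	struct bpf_devmap_val val = {
		.ifindex = ifindex,
		.bpf_prog.fd = prog_fd,
	};

	return bpf_map_update_elem(devmap_fd, &key, &val, BPF_ANY);
}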
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx)
{
@ -615,6 +636,14 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
/* Redirect has already succeeded semantically at this point, so we just
* return 0 even if packet is dropped. Helper below takes care of
* freeing skb.
*/
if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
return 0;
skb->dev = dst->dev;
generic_xdp_tx(skb, xdp_prog);

View File

@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
static bool htab_has_extra_elems(struct bpf_htab *htab)
{
return !htab_is_percpu(htab) && !htab_is_lru(htab);
}
static void htab_free_prealloced_timers(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
if (likely(!map_value_has_timer(&htab->map)))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
for (i = 0; i < num_entries; i++) {
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
bpf_timer_cancel_and_free(elem->key +
round_up(htab->map.key_size, 8) +
htab->map.timer_off);
cond_resched();
}
}
static void htab_free_elems(struct bpf_htab *htab)
{
int i;
@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
u32 key_size = htab->map.key_size;
l = container_of(node, struct htab_elem, lru_node);
memcpy(l->key, key, htab->map.key_size);
memcpy(l->key, key, key_size);
check_and_init_map_value(&htab->map,
l->key + round_up(key_size, 8));
return l;
}
@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
if (!htab_is_percpu(htab) && !htab_is_lru(htab))
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
@ -695,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
{
if (unlikely(map_value_has_timer(&htab->map)))
bpf_timer_cancel_and_free(elem->key +
round_up(htab->map.key_size, 8) +
htab->map.timer_off);
}
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@ -719,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
check_and_free_timer(htab, l);
break;
}
@ -790,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
check_and_free_timer(htab, l);
kfree(l);
}
@ -817,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
check_and_free_timer(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@ -920,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
check_and_init_map_lock(&htab->map,
l_new->key + round_up(key_size, 8));
check_and_init_map_value(&htab->map,
l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@ -1062,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
else
check_and_free_timer(htab, l_old);
}
ret = 0;
err:
@ -1069,6 +1112,12 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
return ret;
}
static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
{
check_and_free_timer(htab, elem);
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
@ -1102,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
copy_map_value(&htab->map,
l_new->key + round_up(map->key_size, 8), value);
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
@ -1128,9 +1178,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
htab_unlock_bucket(htab, b, hash, flags);
if (ret)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
htab_lru_push_free(htab, l_new);
else if (l_old)
bpf_lru_push_free(&htab->lru, &l_old->lru_node);
htab_lru_push_free(htab, l_old);
return ret;
}
@ -1339,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
htab_unlock_bucket(htab, b, hash, flags);
if (l)
bpf_lru_push_free(&htab->lru, &l->lru_node);
htab_lru_push_free(htab, l);
return ret;
}
@ -1359,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
static void htab_free_malloced_timers(struct bpf_htab *htab)
{
int i;
rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node)
check_and_free_timer(htab, l);
cond_resched_rcu();
}
rcu_read_unlock();
}
static void htab_map_free_timers(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
if (likely(!map_value_has_timer(&htab->map)))
return;
if (!htab_is_prealloc(htab))
htab_free_malloced_timers(htab);
else
htab_free_prealloced_timers(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@ -1456,7 +1535,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
else
copy_map_value(map, value, l->key +
roundup_key_size);
check_and_init_map_lock(map, value);
check_and_init_map_value(map, value);
}
hlist_nulls_del_rcu(&l->hash_node);
@ -1467,7 +1546,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
htab_unlock_bucket(htab, b, hash, bflags);
if (is_lru_map && l)
bpf_lru_push_free(&htab->lru, &l->lru_node);
htab_lru_push_free(htab, l);
return ret;
}
@ -1645,7 +1724,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
true);
else
copy_map_value(map, dst_val, value);
check_and_init_map_lock(map, dst_val);
check_and_init_map_value(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
@ -1672,7 +1751,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
bpf_lru_push_free(&htab->lru, &l->lru_node);
htab_lru_push_free(htab, l);
}
next_batch:
@ -2034,6 +2113,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@ -2055,6 +2135,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,

View File

@ -289,13 +289,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
static DEFINE_PER_CPU(unsigned long, irqsave_flags);
notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
unsigned long flags;
local_irq_save(flags);
__bpf_spin_lock(lock);
__this_cpu_write(irqsave_flags, flags);
}
notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
{
__bpf_spin_lock_irqsave(lock);
return 0;
}
@ -306,13 +311,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
};
notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
unsigned long flags;
flags = __this_cpu_read(irqsave_flags);
__bpf_spin_unlock(lock);
local_irq_restore(flags);
}
notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
{
__bpf_spin_unlock_irqrestore(lock);
return 0;
}
@ -333,9 +343,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
else
lock = dst + map->spin_lock_off;
preempt_disable();
____bpf_spin_lock(lock);
__bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
____bpf_spin_unlock(lock);
__bpf_spin_unlock_irqrestore(lock);
preempt_enable();
}
@ -989,6 +999,320 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
/* BPF map elements can contain 'struct bpf_timer'.
* Such map owns all of its BPF timers.
* 'struct bpf_timer' is allocated as part of map element allocation
* and it's zero initialized.
* That space is used to keep 'struct bpf_timer_kern'.
* bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
* remembers 'struct bpf_map *' pointer it's part of.
* bpf_timer_set_callback() increments prog refcnt and assigns bpf callback_fn.
* bpf_timer_start() arms the timer.
* If user space reference to a map goes to zero at this point
* ops->map_release_uref callback is responsible for cancelling the timers,
* freeing their memory, and decrementing prog's refcnts.
* bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
* Inner maps can contain bpf timers as well. ops->map_release_uref is
* freeing the timers when inner map is replaced or deleted by user space.
*/
struct bpf_hrtimer {
struct hrtimer timer;
struct bpf_map *map;
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
};
/* the actual struct hidden inside uapi struct bpf_timer */
struct bpf_timer_kern {
struct bpf_hrtimer *timer;
/* bpf_spin_lock is used here instead of spinlock_t to make
* sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
*/
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
struct bpf_map *map = t->map;
void *value = t->value;
void *callback_fn;
void *key;
u32 idx;
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
/* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
* cannot be preempted by another bpf_timer_cb() on the same cpu.
* Remember the timer this callback is servicing to prevent
* deadlock if callback_fn() calls bpf_timer_cancel() or
* bpf_map_delete_elem() on the same timer.
*/
this_cpu_write(hrtimer_running, t);
if (map->map_type == BPF_MAP_TYPE_ARRAY) {
struct bpf_array *array = container_of(map, struct bpf_array, map);
/* compute the key */
idx = ((char *)value - array->value) / array->elem_size;
key = &idx;
} else { /* hash or lru */
key = value - round_up(map->key_size, 8);
}
BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
(u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
this_cpu_write(hrtimer_running, NULL);
out:
return HRTIMER_NORESTART;
}
BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
u64, flags)
{
clockid_t clockid = flags & (MAX_CLOCKS - 1);
struct bpf_hrtimer *t;
int ret = 0;
BUILD_BUG_ON(MAX_CLOCKS != 16);
BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
if (in_nmi())
return -EOPNOTSUPP;
if (flags >= MAX_CLOCKS ||
/* similar to timerfd except _ALARM variants are not supported */
(clockid != CLOCK_MONOTONIC &&
clockid != CLOCK_REALTIME &&
clockid != CLOCK_BOOTTIME))
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (t) {
ret = -EBUSY;
goto out;
}
if (!atomic64_read(&map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs.
*/
ret = -EPERM;
goto out;
}
/* allocate hrtimer via map_kmalloc to use memcg accounting */
t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
if (!t) {
ret = -ENOMEM;
goto out;
}
t->value = (void *)timer - map->timer_off;
t->map = map;
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
timer->timer = t;
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
}
static const struct bpf_func_proto bpf_timer_init_proto = {
.func = bpf_timer_init,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_TIMER,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
struct bpf_prog_aux *, aux)
{
struct bpf_prog *prev, *prog = aux->prog;
struct bpf_hrtimer *t;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t) {
ret = -EINVAL;
goto out;
}
if (!atomic64_read(&t->map->usercnt)) {
/* maps with timers must be either held by user space
* or pinned in bpffs. Otherwise timer might still be
* running even when bpf prog is detached and user space
* is gone, since map_release_uref won't ever be called.
*/
ret = -EPERM;
goto out;
}
prev = t->prog;
if (prev != prog) {
/* Bump prog refcnt once. Every bpf_timer_set_callback()
* can pick different callback_fn-s within the same prog.
*/
prog = bpf_prog_inc_not_zero(prog);
if (IS_ERR(prog)) {
ret = PTR_ERR(prog);
goto out;
}
if (prev)
/* Drop prev prog refcnt when swapping with new prog */
bpf_prog_put(prev);
t->prog = prog;
}
rcu_assign_pointer(t->callback_fn, callback_fn);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
}
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_TIMER,
.arg2_type = ARG_PTR_TO_FUNC,
};
BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
if (flags)
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t || !t->prog) {
ret = -EINVAL;
goto out;
}
hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
}
static const struct bpf_func_proto bpf_timer_start_proto = {
.func = bpf_timer_start,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_TIMER,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
static void drop_prog_refcnt(struct bpf_hrtimer *t)
{
struct bpf_prog *prog = t->prog;
if (prog) {
bpf_prog_put(prog);
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
}
}
BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
{
struct bpf_hrtimer *t;
int ret = 0;
if (in_nmi())
return -EOPNOTSUPP;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
if (!t) {
ret = -EINVAL;
goto out;
}
if (this_cpu_read(hrtimer_running) == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
* since it waits for callback_fn to finish
*/
ret = -EDEADLK;
goto out;
}
drop_prog_refcnt(t);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
/* Cancel the timer and wait for associated callback to finish
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
return ret;
}
static const struct bpf_func_proto bpf_timer_cancel_proto = {
.func = bpf_timer_cancel,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_TIMER,
};
/* This function is called by map_delete/update_elem for individual element and
* by ops->map_release_uref when the user space reference to a map reaches zero.
*/
void bpf_timer_cancel_and_free(void *val)
{
struct bpf_timer_kern *timer = val;
struct bpf_hrtimer *t;
/* Performance optimization: read timer->timer without lock first. */
if (!READ_ONCE(timer->timer))
return;
__bpf_spin_lock_irqsave(&timer->lock);
/* re-read it under lock */
t = timer->timer;
if (!t)
goto out;
drop_prog_refcnt(t);
/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
* this timer, since it won't be initialized.
*/
timer->timer = NULL;
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
if (!t)
return;
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call kfree(t)
* right after for both preallocated and non-preallocated maps.
* The timer->timer = NULL was already done and no code path can
* see address 't' anymore.
*
* Check that bpf_map_delete/update_elem() wasn't called from timer
* callback_fn. In such case don't call hrtimer_cancel() (since it will
* deadlock) and don't call hrtimer_try_to_cancel() (since it will just
* return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
* since timer->timer = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
kfree(t);
}
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@ -1055,6 +1379,14 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
case BPF_FUNC_timer_init:
return &bpf_timer_init_proto;
case BPF_FUNC_timer_set_callback:
return &bpf_timer_set_callback_proto;
case BPF_FUNC_timer_start:
return &bpf_timer_start_proto;
case BPF_FUNC_timer_cancel:
return &bpf_timer_cancel_proto;
default:
break;
}
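Because all timers in a map are cancelled and their prog references dropped once the map's user-space refcount hits zero (see the comment block above and bpf_timer_cancel_and_free()), timers that should outlive the loading process need the map pinned. A hedged libbpf sketch; the object file, map name and pin path are made up:

#include <bpf/libbpf.h>

int pin_timer_map(void)
{
	struct bpf_object *obj;
	struct bpf_map *map;

	obj = bpf_object__open_file("timer_prog.o", NULL);
	if (libbpf_get_error(obj))
		return -1;
	if (bpf_object__load(obj))
		return -1;
	map = bpf_object__find_map_by_name(obj, "timer_map");
	if (!map)
		return -1;
	/* The bpffs pin keeps the map's usercnt above zero, so armed timers
	 * keep firing after this process exits; unpinning cancels them.
	 */
	return bpf_map__pin(map, "/sys/fs/bpf/timer_map");
}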

View File

@ -173,7 +173,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
check_and_init_map_lock(map, new->data);
check_and_init_map_value(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@ -509,7 +509,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
map->numa_node);
if (!storage->buf)
goto enomem;
check_and_init_map_lock(map, storage->buf->data);
check_and_init_map_value(map, storage->buf->data);
} else {
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)

View File

@ -3,6 +3,7 @@
*/
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include "map_in_map.h"
@ -50,6 +51,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
inner_map_meta->timer_off = inner_map->timer_off;
if (inner_map->btf) {
btf_get(inner_map->btf);
inner_map_meta->btf = inner_map->btf;
}
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
@ -65,6 +71,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
void bpf_map_meta_free(struct bpf_map *map_meta)
{
btf_put(map_meta->btf);
kfree(map_meta);
}
@ -75,6 +82,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags;
}
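Both pieces of inner-map state handled here (timer_off copied into the meta map, plus the per-lookup map_uid from the bpf_verifier.h hunk earlier) matter for BTF-defined map-in-map programs. A hedged sketch of such a layout in libbpf syntax, with illustrative names:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct inner_map {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, int);
} inner_a SEC(".maps"), inner_b SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 2);
	__type(key, int);
	__array(values, struct inner_map);
} outer SEC(".maps") = {
	.values = { &inner_a, &inner_b },
};

SEC("xdp")
int touch_both(struct xdp_md *ctx)
{
	int zero = 0, one = 1, *v;
	void *m1, *m2;

	m1 = bpf_map_lookup_elem(&outer, &zero);
	m2 = bpf_map_lookup_elem(&outer, &one);
	if (!m1 || !m2)
		return XDP_PASS;
	/* Each lookup result now carries its own map_uid, so the verifier
	 * does not conflate state learned about m1 with m2.
	 */
	v = bpf_map_lookup_elem(m1, &zero);
	if (v)
		*v += 1;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";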

View File

@ -260,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
/* mask lock, since value wasn't zero inited */
check_and_init_map_lock(map, value);
/* mask lock and timer, since value wasn't zero inited */
check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
@ -623,7 +623,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
if (!map->ops->map_mmap || map_value_has_spin_lock(map))
if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
map_value_has_timer(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@ -793,6 +794,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
}
map->timer_off = btf_find_timer(btf, value_type);
if (map_value_has_timer(map)) {
if (map->map_flags & BPF_F_RDONLY_PROG)
return -EACCES;
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY)
return -EOPNOTSUPP;
}
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@ -844,6 +855,7 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@ -1591,7 +1603,8 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
map_value_has_timer(map)) {
fdput(f);
return -ENOTSUPP;
}
@ -1699,6 +1712,8 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
unsigned long flags;
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
@ -1708,7 +1723,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
return;
if (do_idr_lock)
spin_lock_bh(&prog_idr_lock);
spin_lock_irqsave(&prog_idr_lock, flags);
else
__acquire(&prog_idr_lock);
@ -1716,7 +1731,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
prog->aux->id = 0;
if (do_idr_lock)
spin_unlock_bh(&prog_idr_lock);
spin_unlock_irqrestore(&prog_idr_lock, flags);
else
__release(&prog_idr_lock);
}
@ -1752,14 +1767,32 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
}
}
static void bpf_prog_put_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
struct bpf_prog *prog;
aux = container_of(work, struct bpf_prog_aux, work);
prog = aux->prog;
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
__bpf_prog_put_noref(prog, true);
}
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic64_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
__bpf_prog_put_noref(prog, true);
if (in_irq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
} else {
bpf_prog_put_deferred(&aux->work);
}
}
}

View File

@ -172,7 +172,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
}
static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
const struct bpf_prog_aux *aux;
struct bpf_tramp_progs *tprogs;
@ -189,8 +189,10 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
*total += tr->progs_cnt[kind];
progs = tprogs[kind].progs;
hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
*ip_arg |= aux->prog->call_get_func_ip;
*progs++ = aux->prog;
}
}
return tprogs;
}
@ -333,9 +335,10 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
struct bpf_tramp_image *im;
struct bpf_tramp_progs *tprogs;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
bool ip_arg = false;
int err, total;
tprogs = bpf_trampoline_get_progs(tr, &total);
tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg);
if (IS_ERR(tprogs))
return PTR_ERR(tprogs);
@ -357,6 +360,9 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
if (ip_arg)
flags |= BPF_TRAMP_F_IP_ARG;
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
&tr->func.model, flags, tprogs,
tr->func.addr);

View File

@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
@ -734,6 +735,10 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->refs[i].id)
verbose(env, ",%d", state->refs[i].id);
}
if (state->in_callback_fn)
verbose(env, " cb");
if (state->in_async_callback_fn)
verbose(env, " async_cb");
verbose(env, "\n");
}
@ -1135,6 +1140,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = map->inner_map_meta;
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@ -1522,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env,
init_reg_state(env, state);
}
/* Similar to push_stack(), but for async callbacks */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
int subprog)
{
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
elem->log_pos = env->log.len_used;
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env,
"The sequence of %d jumps is too complex for async cb.\n",
env->stack_size);
goto err;
}
/* Unlike push_stack() do not copy_verifier_state().
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
*/
elem->st.branches = 1;
frame = kzalloc(sizeof(*frame), GFP_KERNEL);
if (!frame)
goto err;
init_func_state(env, frame,
BPF_MAIN_FUNC /* callsite */,
0 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
elem->st.frame[0] = frame;
return &elem->st;
err:
free_verifier_state(env->cur_state, true);
env->cur_state = NULL;
/* pop all elements and return */
while (!pop_stack(env, NULL, NULL, false));
return NULL;
}
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@ -3241,6 +3298,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
}
if (map_value_has_timer(map)) {
u32 t = map->timer_off;
if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
t < reg->umax_value + off + size) {
verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
return -EACCES;
}
}
return err;
}
@ -3643,6 +3709,8 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
int next_insn;
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@ -3650,13 +3718,22 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
ret_prog[frame] = idx;
/* find the callee */
i = i + insn[i].imm + 1;
idx = find_subprog(env, i);
next_insn = i + insn[i].imm + 1;
idx = find_subprog(env, next_insn);
if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
i);
next_insn);
return -EFAULT;
}
if (subprog[idx].is_async_cb) {
if (subprog[idx].has_tail_call) {
verbose(env, "verifier bug. subprog has tail_call and async cb\n");
return -EFAULT;
}
/* async callbacks don't increase bpf prog stack size */
continue;
}
i = next_insn;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@ -4656,6 +4733,54 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return 0;
}
static int process_timer_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
bool is_const = tnum_is_const(reg->var_off);
struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
if (!is_const) {
verbose(env,
"R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
regno);
return -EINVAL;
}
if (!map->btf) {
verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
map->name);
return -EINVAL;
}
if (!map_value_has_timer(map)) {
if (map->timer_off == -E2BIG)
verbose(env,
"map '%s' has more than one 'struct bpf_timer'\n",
map->name);
else if (map->timer_off == -ENOENT)
verbose(env,
"map '%s' doesn't have 'struct bpf_timer'\n",
map->name);
else
verbose(env,
"map '%s' is not a struct type or bpf_timer is mangled\n",
map->name);
return -EINVAL;
}
if (map->timer_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
val + reg->off, map->timer_off);
return -EINVAL;
}
if (meta->map_ptr) {
verbose(env, "verifier bug. Two map pointers in a timer helper\n");
return -EFAULT;
}
meta->map_uid = reg->map_uid;
meta->map_ptr = map;
return 0;
}
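For reference, a minimal sketch (not part of the patch) of a map value layout that satisfies these checks — a single, BTF-described struct bpf_timer at a constant offset — assuming libbpf-style map definitions; the names map_elem and timer_map are illustrative only:

struct map_elem {
	int counter;
	struct bpf_timer timer;		/* map->timer_off resolves to this field */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, int);
	__type(value, struct map_elem);
} timer_map SEC(".maps");

/* Direct loads/stores to val->timer are rejected by check_map_access() above;
 * the field may only be touched through the bpf_timer_*() helpers.
 */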
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@ -4788,6 +4913,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@ -4819,6 +4945,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@ -4948,7 +5075,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
if (meta->map_ptr) {
/* Use map_uid (which is unique id of inner map) to reject:
* inner_map1 = bpf_map_lookup_elem(outer_map, key1)
* inner_map2 = bpf_map_lookup_elem(outer_map, key2)
* if (inner_map1 && inner_map2) {
* timer = bpf_map_lookup_elem(inner_map1);
* if (timer)
* // mismatch would have been allowed
* bpf_timer_init(timer, inner_map2);
* }
*
* Comparing map_ptr is enough to distinguish normal and outer maps.
*/
if (meta->map_ptr != reg->map_ptr ||
meta->map_uid != reg->map_uid) {
verbose(env,
"timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
meta->map_uid, reg->map_uid);
return -EINVAL;
}
}
meta->map_ptr = reg->map_ptr;
meta->map_uid = reg->map_uid;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
@ -5000,6 +5149,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
verbose(env, "verifier internal error\n");
return -EFAULT;
}
} else if (arg_type == ARG_PTR_TO_TIMER) {
if (process_timer_func(env, regno, meta))
return -EACCES;
} else if (arg_type == ARG_PTR_TO_FUNC) {
meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
@ -5615,6 +5767,31 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
insn->imm == BPF_FUNC_timer_set_callback) {
struct bpf_verifier_state *async_cb;
/* there is no real recursion here. timer callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
*insn_idx, subprog);
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
/* Convert bpf_timer_set_callback() args into timer callback args */
err = set_callee_state_cb(env, caller, callee, *insn_idx);
if (err)
return err;
clear_caller_saved_regs(env, caller->regs);
mark_reg_unknown(env, caller->regs, BPF_REG_0);
caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* continue with next insn after call */
return 0;
}
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@ -5742,6 +5919,35 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
static int set_timer_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
int insn_idx)
{
struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
/* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
* callback_fn(struct bpf_map *map, void *key, void *value);
*/
callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
callee->regs[BPF_REG_1].map_ptr = map_ptr;
callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
callee->regs[BPF_REG_2].map_ptr = map_ptr;
callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
__mark_reg_known_zero(&callee->regs[BPF_REG_3]);
callee->regs[BPF_REG_3].map_ptr = map_ptr;
/* unused */
__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
callee->in_async_callback_fn = true;
return 0;
}
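The register setup above corresponds to a callback of the following shape — a hedged sketch reusing the illustrative map_elem type from the earlier note; per the async-callback check in check_return_code(), the callback must return 0:

static int timer_cb(void *map, int *key, struct map_elem *val)
{
	/* R1 = CONST_PTR_TO_MAP, R2 = PTR_TO_MAP_KEY, R3 = PTR_TO_MAP_VALUE */
	return 0;	/* async callbacks must return 0 */
}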
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@ -5955,6 +6161,29 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
return err;
}
static int check_get_func_ip(struct bpf_verifier_env *env)
{
enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
int func_id = BPF_FUNC_get_func_ip;
if (type == BPF_PROG_TYPE_TRACING) {
if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
eatype != BPF_MODIFY_RETURN) {
verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
func_id_name(func_id), func_id);
return -ENOTSUPP;
}
return 0;
} else if (type == BPF_PROG_TYPE_KPROBE) {
return 0;
}
verbose(env, "func %s#%d not supported for program type %d\n",
func_id_name(func_id), func_id, type);
return -ENOTSUPP;
}
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
@ -6069,6 +6298,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
if (func_id == BPF_FUNC_timer_set_callback) {
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_timer_callback_state);
if (err < 0)
return -EINVAL;
}
if (func_id == BPF_FUNC_snprintf) {
err = check_bpf_snprintf_call(env, regs);
if (err < 0)
@ -6104,6 +6340,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
if (map_value_has_spin_lock(meta.map_ptr))
@ -6225,6 +6462,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
env->prog->call_get_stack = true;
if (func_id == BPF_FUNC_get_func_ip) {
if (check_get_func_ip(env))
return -ENOTSUPP;
env->prog->call_get_func_ip = true;
}
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
@ -9099,7 +9342,8 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
const bool is_subprog = env->cur_state->frame[0]->subprogno;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog &&
@ -9124,6 +9368,22 @@ static int check_return_code(struct bpf_verifier_env *env)
}
reg = cur_regs(env) + BPF_REG_0;
if (frame->in_async_callback_fn) {
/* enforce return zero from async callbacks like timer */
if (reg->type != SCALAR_VALUE) {
verbose(env, "In async callback the register R0 is not a known value (%s)\n",
reg_type_str[reg->type]);
return -EINVAL;
}
if (!tnum_in(tnum_const(0), reg->var_off)) {
verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
return -EINVAL;
}
return 0;
}
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
@ -9338,8 +9598,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
init_explored_state(env, t + 1);
if (visit_callee) {
init_explored_state(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
env, false);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
/* It's ok to allow recursion from CFG point of
* view. __check_func_call() will do the actual
* check.
*/
bpf_pseudo_func(insns + t));
}
return ret;
}
@ -9367,6 +9631,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
if (insns[t].imm == BPF_FUNC_timer_set_callback)
/* Mark this call insn to trigger is_state_visited() check
* before call itself is processed by __check_func_call().
* Otherwise new async state will be pushed for further
* exploration.
*/
init_explored_state(env, t);
return visit_func_call_insn(t, insn_cnt, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
@ -10374,9 +10645,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
if (sl->state.branches) {
if (states_maybe_looping(&sl->state, cur) &&
states_equal(env, &sl->state, cur)) {
struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
if (frame->in_async_callback_fn &&
frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
/* Different async_entry_cnt means that the verifier is
* processing another entry into async callback.
* Seeing the same state is not an indication of infinite
* loop or infinite recursion.
* But finding the same state doesn't mean that it's safe
* to stop processing the current state. The previous state
* hasn't yet reached bpf_exit, since state.branches > 0.
* Checking in_async_callback_fn alone is not enough either,
* since the verifier still needs to catch infinite loops
* inside async callbacks.
*/
} else if (states_maybe_looping(&sl->state, cur) &&
states_equal(env, &sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@ -11425,10 +11712,11 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
static int adjust_insn_aux_data(struct bpf_verifier_env *env,
struct bpf_prog *new_prog, u32 off, u32 cnt)
static void adjust_insn_aux_data(struct bpf_verifier_env *env,
struct bpf_insn_aux_data *new_data,
struct bpf_prog *new_prog, u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
struct bpf_insn_aux_data *old_data = env->insn_aux_data;
struct bpf_insn *insn = new_prog->insnsi;
u32 old_seen = old_data[off].seen;
u32 prog_len;
@ -11441,12 +11729,9 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
if (cnt == 1)
return 0;
return;
prog_len = new_prog->len;
new_data = vzalloc(array_size(prog_len,
sizeof(struct bpf_insn_aux_data)));
if (!new_data)
return -ENOMEM;
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
@ -11457,7 +11742,6 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env,
}
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
}
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
@ -11492,6 +11776,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
struct bpf_insn_aux_data *new_data = NULL;
if (len > 1) {
new_data = vzalloc(array_size(env->prog->len + len - 1,
sizeof(struct bpf_insn_aux_data)));
if (!new_data)
return NULL;
}
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
if (IS_ERR(new_prog)) {
@ -11499,10 +11791,10 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
verbose(env,
"insn %d cannot be patched due to 16-bit range\n",
env->insn_aux_data[off].orig_idx);
vfree(new_data);
return NULL;
}
if (adjust_insn_aux_data(env, new_prog, off, len))
return NULL;
adjust_insn_aux_data(env, new_data, new_prog, off, len);
adjust_subprog_starts(env, off, len);
adjust_poke_descs(new_prog, off, len);
return new_prog;
@ -12351,6 +12643,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
bool expect_blinding = bpf_jit_blinding_enabled(prog);
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
@ -12568,6 +12861,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}
if (insn->imm == BPF_FUNC_timer_set_callback) {
/* The verifier will process callback_fn as many times as necessary
* with different maps and the register states prepared by
* set_timer_callback_state will be accurate.
*
* The following use case is valid:
* map1 is shared by prog1, prog2, prog3.
* prog1 calls bpf_timer_init for some map1 elements
* prog2 calls bpf_timer_set_callback for some map1 elements.
* Those that were not bpf_timer_init-ed will return -EINVAL.
* prog3 calls bpf_timer_start for some map1 elements.
* Those that were not both bpf_timer_init-ed and
* bpf_timer_set_callback-ed will return -EINVAL.
*/
struct bpf_insn ld_addrs[2] = {
BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
};
insn_buf[0] = ld_addrs[0];
insn_buf[1] = ld_addrs[1];
insn_buf[2] = *insn;
cnt = 3;
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
goto patch_call_imm;
}
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@ -12684,6 +13010,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}
/* Implement bpf_get_func_ip inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ip) {
/* Load IP address from ctx - 8 */
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
if (!new_prog)
return -ENOMEM;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
continue;
}
patch_call_imm:
fn = env->ops->get_func_proto(insn->imm, env->prog);
/* all functions that have prototype and verifier allowed


@ -948,6 +948,33 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
{
/* This helper call is inlined by verifier. */
return ((u64 *)ctx)[-1];
}
static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.func = bpf_get_func_ip_tracing,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
struct kprobe *kp = kprobe_running();
return kp ? (u64) kp->addr : 0;
}
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
.func = bpf_get_func_ip_kprobe,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@ -1058,8 +1085,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_for_each_map_elem_proto;
case BPF_FUNC_snprintf:
return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
default:
return NULL;
return bpf_base_func_proto(func_id);
}
}
@ -1077,6 +1106,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_kprobe;
default:
return bpf_tracing_func_proto(func_id, prog);
}
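With the helper wired into both proto tables, a tracing or kprobe program can read the address it is attached to. A hedged usage sketch, assuming libbpf's bpf_helpers.h/bpf_tracing.h macros; kernel_clone is only an example attach point:

SEC("kprobe/kernel_clone")
int BPF_KPROBE(kprobe_ip)
{
	__u64 ip = bpf_get_func_ip(ctx);	/* kp->addr for kprobes */

	bpf_printk("attached at %llx", ip);
	return 0;
}

SEC("fentry/kernel_clone")
int BPF_PROG(fentry_ip)
{
	__u64 ip = bpf_get_func_ip(ctx);	/* inlined to a ctx[-8] load for tracing progs */

	bpf_printk("traced function at %llx", ip);
	return 0;
}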


@ -15,6 +15,7 @@
#include <linux/error-injection.h>
#include <linux/smp.h>
#include <linux/sock_diag.h>
#include <net/xdp.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bpf_test_run.h>
@ -687,6 +688,64 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
return ret;
}
static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp)
{
unsigned int ingress_ifindex, rx_queue_index;
struct netdev_rx_queue *rxqueue;
struct net_device *device;
if (!xdp_md)
return 0;
if (xdp_md->egress_ifindex != 0)
return -EINVAL;
ingress_ifindex = xdp_md->ingress_ifindex;
rx_queue_index = xdp_md->rx_queue_index;
if (!ingress_ifindex && rx_queue_index)
return -EINVAL;
if (ingress_ifindex) {
device = dev_get_by_index(current->nsproxy->net_ns,
ingress_ifindex);
if (!device)
return -ENODEV;
if (rx_queue_index >= device->real_num_rx_queues)
goto free_dev;
rxqueue = __netif_get_rx_queue(device, rx_queue_index);
if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq))
goto free_dev;
xdp->rxq = &rxqueue->xdp_rxq;
/* The device is now tracked in the xdp->rxq for later
* dev_put()
*/
}
xdp->data = xdp->data_meta + xdp_md->data;
return 0;
free_dev:
dev_put(device);
return -EINVAL;
}
static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md)
{
if (!xdp_md)
return;
xdp_md->data = xdp->data - xdp->data_meta;
xdp_md->data_end = xdp->data_end - xdp->data_meta;
if (xdp_md->ingress_ifindex)
dev_put(xdp->rxq->dev);
}
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
@ -697,35 +756,69 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
struct netdev_rx_queue *rxqueue;
struct xdp_buff xdp = {};
u32 retval, duration;
struct xdp_md *ctx;
u32 max_data_sz;
void *data;
int ret;
int ret = -EINVAL;
if (kattr->test.ctx_in || kattr->test.ctx_out)
return -EINVAL;
ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md));
if (IS_ERR(ctx))
return PTR_ERR(ctx);
if (ctx) {
/* There can't be user provided data before the meta data */
if (ctx->data_meta || ctx->data_end != size ||
ctx->data > ctx->data_end ||
unlikely(xdp_metalen_invalid(ctx->data)))
goto free_ctx;
/* Meta data is allocated from the headroom */
headroom -= ctx->data;
}
/* XDP have extra tailroom as (most) drivers use full page */
max_data_sz = 4096 - headroom - tailroom;
data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
if (IS_ERR(data))
return PTR_ERR(data);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
goto free_ctx;
}
rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
xdp_init_buff(&xdp, headroom + max_data_sz + tailroom,
&rxqueue->xdp_rxq);
xdp_prepare_buff(&xdp, data, headroom, size, true);
ret = xdp_convert_md_to_buff(ctx, &xdp);
if (ret)
goto free_data;
bpf_prog_change_xdp(NULL, prog);
ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
/* We convert the xdp_buff back to an xdp_md before checking the return
* code so the reference count of any held netdevice will be decremented
* even if the test run failed.
*/
xdp_convert_buff_to_md(&xdp, ctx);
if (ret)
goto out;
if (xdp.data != data + headroom || xdp.data_end != xdp.data + size)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
if (xdp.data_meta != data + headroom ||
xdp.data_end != xdp.data_meta + size)
size = xdp.data_end - xdp.data_meta;
ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval,
duration);
if (!ret)
ret = bpf_ctx_finish(kattr, uattr, ctx,
sizeof(struct xdp_md));
out:
bpf_prog_change_xdp(prog, NULL);
free_data:
kfree(data);
free_ctx:
kfree(ctx);
return ret;
}
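On the user-space side, the new ctx path can be exercised through libbpf's existing ctx_in/ctx_out plumbing. A hedged sketch (run_xdp_with_ctx and its parameters are illustrative); note that data_end must equal the packet size, data_meta must be zero, and the chosen ifindex/rx queue must have a registered xdp_rxq_info:

#include <bpf/bpf.h>
#include <linux/bpf.h>

static int run_xdp_with_ctx(int prog_fd, int ifindex, void *pkt, __u32 pkt_len)
{
	struct xdp_md ctx_in = {
		.data_end = pkt_len,		/* no metadata: data == data_meta == 0 */
		.ingress_ifindex = ifindex,
		.rx_queue_index = 0,
	};
	struct xdp_md ctx_out = {};
	struct bpf_prog_test_run_attr tattr = {
		.prog_fd = prog_fd,
		.data_in = pkt,
		.data_size_in = pkt_len,
		.ctx_in = &ctx_in,
		.ctx_size_in = sizeof(ctx_in),
		.ctx_out = &ctx_out,
		.ctx_size_out = sizeof(ctx_out),
	};

	return bpf_prog_test_run_xattr(&tattr);
}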


@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
endif
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o


@ -4744,45 +4744,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
return rxqueue;
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
u32 metalen, act = XDP_DROP;
bool orig_bcast, orig_host;
u32 mac_len, frame_sz;
__be16 orig_eth_type;
struct ethhdr *eth;
u32 metalen, act;
int off;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
/* In case we have to go down the path and also linearize,
* then lets do the pskb_expand_head() work just once here.
*/
if (pskb_expand_head(skb,
hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
goto do_drop;
if (skb_linearize(skb))
goto do_drop;
}
/* The XDP program wants to see the packet starting at the MAC
* header.
*/
@ -4837,6 +4810,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
skb->protocol = eth_type_trans(skb, skb->dev);
}
/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
* before calling us again on redirect path. We do not call do_redirect
* as we leave that up to the caller.
*
* Caller is responsible for managing lifetime of skb (i.e. calling
* kfree_skb in response to actions it cannot handle/XDP_DROP).
*/
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
@ -4847,6 +4827,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (metalen)
skb_metadata_set(skb, metalen);
break;
}
return act;
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
u32 act = XDP_DROP;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
/* In case we have to go down the path and also linearize,
* then lets do the pskb_expand_head() work just once here.
*/
if (pskb_expand_head(skb,
hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
goto do_drop;
if (skb_linearize(skb))
goto do_drop;
}
act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
case XDP_PASS:
break;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
@ -5312,7 +5335,6 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
ret = NET_RX_DROP;
goto out;
}
skb_reset_mac_len(skb);
}
if (eth_type_vlan(skb->protocol)) {
@ -5638,25 +5660,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
struct bpf_prog *new = xdp->prog;
int ret = 0;
if (new) {
u32 i;
mutex_lock(&new->aux->used_maps_mutex);
/* generic XDP does not work with DEVMAPs that can
* have a bpf_prog installed on an entry
*/
for (i = 0; i < new->aux->used_map_cnt; i++) {
if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
cpu_map_prog_allowed(new->aux->used_maps[i])) {
mutex_unlock(&new->aux->used_maps_mutex);
return -EINVAL;
}
}
mutex_unlock(&new->aux->used_maps_mutex);
}
switch (xdp->command) {
case XDP_SETUP_PROG:
rcu_assign_pointer(dev->xdp_prog, new);


@ -77,6 +77,7 @@
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);
@ -3880,8 +3881,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
if (unlikely((metalen & (sizeof(__u32) - 1)) ||
(metalen > 32)))
if (unlikely(xdp_metalen_invalid(metalen)))
return -EACCES;
xdp->data_meta = meta;
@ -4040,8 +4040,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
goto err;
consume_skb(skb);
break;
case BPF_MAP_TYPE_CPUMAP:
err = cpu_map_generic_redirect(fwd, skb);
if (unlikely(err))
goto err;
break;
default:
/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
goto err;
}


@ -211,8 +211,6 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
return psock;
}
static bool sock_map_redirect_allowed(const struct sock *sk);
static int sock_map_link(struct bpf_map *map, struct sock *sk)
{
struct sk_psock_progs *progs = sock_map_progs(map);
@ -223,13 +221,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
struct sk_psock *psock;
int ret;
/* Only sockets we can redirect into/from in BPF need to hold
* refs to parser/verdict progs and have their sk_data_ready
* and sk_write_space callbacks overridden.
*/
if (!sock_map_redirect_allowed(sk))
goto no_progs;
stream_verdict = READ_ONCE(progs->stream_verdict);
if (stream_verdict) {
stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
@ -264,7 +255,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
}
}
no_progs:
psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
@ -527,12 +517,6 @@ static bool sk_is_tcp(const struct sock *sk)
sk->sk_protocol == IPPROTO_TCP;
}
static bool sk_is_udp(const struct sock *sk)
{
return sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP;
}
static bool sock_map_redirect_allowed(const struct sock *sk)
{
if (sk_is_tcp(sk))
@ -550,10 +534,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
{
if (sk_is_tcp(sk))
return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
else if (sk_is_udp(sk))
return sk_hashed(sk);
return false;
return true;
}
static int sock_hash_update_common(struct bpf_map *map, void *key,
@ -1536,6 +1517,7 @@ void sock_map_close(struct sock *sk, long timeout)
release_sock(sk);
saved_close(sk, timeout);
}
EXPORT_SYMBOL_GPL(sock_map_close);
static int sock_map_iter_attach_target(struct bpf_prog *prog,
union bpf_iter_link_info *linfo,


@ -112,7 +112,6 @@ static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
{
*prot = *base;
prot->unhash = sock_map_unhash;
prot->close = sock_map_close;
prot->recvmsg = udp_bpf_recvmsg;
}


@ -7,6 +7,7 @@ obj-$(CONFIG_UNIX) += unix.o
unix-y := af_unix.o garbage.o
unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o
unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o


@ -494,6 +494,7 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
sk_error_report(other);
}
}
sk->sk_state = other->sk_state = TCP_CLOSE;
}
static void unix_sock_destructor(struct sock *sk)
@ -669,6 +670,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@ -746,6 +749,7 @@ static const struct proto_ops unix_dgram_ops = {
.listen = sock_no_listen,
.shutdown = unix_shutdown,
.sendmsg = unix_dgram_sendmsg,
.read_sock = unix_read_sock,
.recvmsg = unix_dgram_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
@ -777,10 +781,21 @@ static const struct proto_ops unix_seqpacket_ops = {
.show_fdinfo = unix_show_fdinfo,
};
static struct proto unix_proto = {
static void unix_close(struct sock *sk, long timeout)
{
/* Nothing to do here, unix socket does not need a ->close().
* This is merely for sockmap.
*/
}
struct proto unix_proto = {
.name = "UNIX",
.owner = THIS_MODULE,
.obj_size = sizeof(struct unix_sock),
.close = unix_close,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = unix_bpf_update_proto,
#endif
};
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
@ -864,6 +879,7 @@ static int unix_release(struct socket *sock)
if (!sk)
return 0;
sk->sk_prot->close(sk, 0);
unix_release_sock(sk, 0);
sock->sk = NULL;
@ -1199,6 +1215,9 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
unix_peer(sk) = other;
unix_state_double_unlock(sk, other);
}
if (unix_peer(sk))
sk->sk_state = other->sk_state = TCP_ESTABLISHED;
return 0;
out_unlock:
@ -1431,12 +1450,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
init_peercred(ska);
init_peercred(skb);
if (ska->sk_type != SOCK_DGRAM) {
ska->sk_state = TCP_ESTABLISHED;
skb->sk_state = TCP_ESTABLISHED;
socka->state = SS_CONNECTED;
sockb->state = SS_CONNECTED;
}
ska->sk_state = TCP_ESTABLISHED;
skb->sk_state = TCP_ESTABLISHED;
socka->state = SS_CONNECTED;
sockb->state = SS_CONNECTED;
return 0;
}
@ -2081,11 +2098,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
}
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
int flags)
{
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct socket *sock = sk->sk_socket;
struct unix_sock *u = unix_sk(sk);
struct sk_buff *skb, *last;
long timeo;
@ -2188,6 +2205,53 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
return err;
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
struct sock *sk = sock->sk;
#ifdef CONFIG_BPF_SYSCALL
if (sk->sk_prot != &unix_proto)
return sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
flags & ~MSG_DONTWAIT, NULL);
#endif
return __unix_dgram_recvmsg(sk, msg, size, flags);
}
static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor)
{
int copied = 0;
while (1) {
struct unix_sock *u = unix_sk(sk);
struct sk_buff *skb;
int used, err;
mutex_lock(&u->iolock);
skb = skb_recv_datagram(sk, 0, 1, &err);
mutex_unlock(&u->iolock);
if (!skb)
return err;
used = recv_actor(desc, skb, 0, skb->len);
if (used <= 0) {
if (!copied)
copied = used;
kfree_skb(skb);
break;
} else if (used <= skb->len) {
copied += used;
}
kfree_skb(skb);
if (!desc->count)
break;
}
return copied;
}
/*
* Sleep until more data has arrived. But check for races..
*/
@ -2925,6 +2989,7 @@ static int __init af_unix_init(void)
sock_register(&unix_family_ops);
register_pernet_subsys(&unix_net_ops);
unix_bpf_build_proto();
out:
return rc;
}

net/unix/unix_bpf.c (new file, 122 lines)

@ -0,0 +1,122 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */
#include <linux/skmsg.h>
#include <linux/bpf.h>
#include <net/sock.h>
#include <net/af_unix.h>
#define unix_sk_has_data(__sk, __psock) \
({ !skb_queue_empty(&__sk->sk_receive_queue) || \
!skb_queue_empty(&__psock->ingress_skb) || \
!list_empty(&__psock->ingress_msg); \
})
static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock,
long timeo)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct unix_sock *u = unix_sk(sk);
int ret = 0;
if (sk->sk_shutdown & RCV_SHUTDOWN)
return 1;
if (!timeo)
return ret;
add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
if (!unix_sk_has_data(sk, psock)) {
mutex_unlock(&u->iolock);
wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
mutex_lock(&u->iolock);
ret = unix_sk_has_data(sk, psock);
}
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
remove_wait_queue(sk_sleep(sk), &wait);
return ret;
}
static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags,
int *addr_len)
{
struct unix_sock *u = unix_sk(sk);
struct sk_psock *psock;
int copied, ret;
psock = sk_psock_get(sk);
if (unlikely(!psock))
return __unix_dgram_recvmsg(sk, msg, len, flags);
mutex_lock(&u->iolock);
if (!skb_queue_empty(&sk->sk_receive_queue) &&
sk_psock_queue_empty(psock)) {
ret = __unix_dgram_recvmsg(sk, msg, len, flags);
goto out;
}
msg_bytes_ready:
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
if (!copied) {
long timeo;
int data;
timeo = sock_rcvtimeo(sk, nonblock);
data = unix_msg_wait_data(sk, psock, timeo);
if (data) {
if (!sk_psock_queue_empty(psock))
goto msg_bytes_ready;
ret = __unix_dgram_recvmsg(sk, msg, len, flags);
goto out;
}
copied = -EAGAIN;
}
ret = copied;
out:
mutex_unlock(&u->iolock);
sk_psock_put(sk, psock);
return ret;
}
static struct proto *unix_prot_saved __read_mostly;
static DEFINE_SPINLOCK(unix_prot_lock);
static struct proto unix_bpf_prot;
static void unix_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
{
*prot = *base;
prot->close = sock_map_close;
prot->recvmsg = unix_dgram_bpf_recvmsg;
}
static void unix_bpf_check_needs_rebuild(struct proto *ops)
{
if (unlikely(ops != smp_load_acquire(&unix_prot_saved))) {
spin_lock_bh(&unix_prot_lock);
if (likely(ops != unix_prot_saved)) {
unix_bpf_rebuild_protos(&unix_bpf_prot, ops);
smp_store_release(&unix_prot_saved, ops);
}
spin_unlock_bh(&unix_prot_lock);
}
}
int unix_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
if (restore) {
sk->sk_write_space = psock->saved_write_space;
WRITE_ONCE(sk->sk_prot, psock->sk_proto);
return 0;
}
unix_bpf_check_needs_rebuild(psock->sk_proto);
WRITE_ONCE(sk->sk_prot, &unix_bpf_prot);
return 0;
}
void __init unix_bpf_build_proto(void)
{
unix_bpf_rebuild_protos(&unix_bpf_prot, &unix_proto);
}
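For context, the BPF side that drives this path is an ordinary sk_skb verdict program attached to a sockmap that now may hold AF_UNIX datagram sockets; the selftest changes below exercise exactly that. A hedged minimal sketch (map and program names are illustrative):

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 2);
	__type(key, __u32);
	__type(value, __u64);
} sock_map SEC(".maps");

SEC("sk_skb/verdict")
int prog_skb_verdict(struct __sk_buff *skb)
{
	/* redirect to the AF_UNIX datagram socket stored at index 0 */
	return bpf_sk_redirect_map(skb, &sock_map, 0, BPF_F_INGRESS);
}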


@ -792,13 +792,23 @@ int main(int argc, char **argv)
n_cpus = get_nprocs_conf();
/* Notice: choosing he queue size is very important with the
* ixgbe driver, because it's driver page recycling trick is
* dependend on pages being returned quickly. The number of
* out-standing packets in the system must be less-than 2x
* RX-ring size.
/* Notice: Choosing the queue size is very important when the CPU is
* configured with power-saving states.
*
* If the deepest state takes 133 usec to wake up from (133/10^6 sec) and the
* link speed is 10 Gbit/s ((10*10^9/8) bytes/sec), then (10*10^9/8)*(133/10^6)
* = 166250 bytes can arrive within 133 usec at that speed. With MTU-size
* packets this is 110 packets, and with minimum-size Ethernet frames
* (84 bytes incl. MAC preamble + intergap) it is 1979 packets.
*
* Setting the default cpumap queue to 2048 covers this worst case (small
* packets); adding the +64 packets from the kthread wakeup call (due to
* xdp_do_flush), the worst case is 2043 packets.
*
* A sysadmin can configure the system to avoid deep sleep via:
*   tuned-adm profile network-latency
*/
qsize = 128+64;
qsize = 2048;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
prog_load_attr.file = filename;


@ -547,6 +547,7 @@ class PrinterHelpers(Printer):
'struct inode',
'struct socket',
'struct file',
'struct bpf_timer',
]
known_types = {
'...',
@ -594,6 +595,7 @@ class PrinterHelpers(Printer):
'struct inode',
'struct socket',
'struct file',
'struct bpf_timer',
}
mapped_types = {
'u8': '__u8',


@ -324,9 +324,6 @@ union bpf_iter_link_info {
* **BPF_PROG_TYPE_SK_LOOKUP**
* *data_in* and *data_out* must be NULL.
*
* **BPF_PROG_TYPE_XDP**
* *ctx_in* and *ctx_out* must be NULL.
*
* **BPF_PROG_TYPE_RAW_TRACEPOINT**,
* **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE**
*
@ -3249,7 +3246,7 @@ union bpf_attr {
* long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
* Description
* Select a **SO_REUSEPORT** socket from a
* **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
* **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
* It checks the selected socket is matching the incoming
* request in the socket buffer.
* Return
@ -4780,6 +4777,76 @@ union bpf_attr {
* Execute close syscall for given FD.
* Return
* A syscall result.
*
* long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
* Description
* Initialize the timer.
* First 4 bits of *flags* specify clockid.
* Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
* All other bits of *flags* are reserved.
* The verifier will reject the program if *timer* is not from
* the same *map*.
* Return
* 0 on success.
* **-EBUSY** if *timer* is already initialized.
* **-EINVAL** if invalid *flags* are passed.
* **-EPERM** if *timer* is in a map that doesn't have any user references.
* User space should either hold a file descriptor to a map with timers
* or pin such a map in bpffs. When the map is unpinned or the file
* descriptor is closed, all timers in the map will be cancelled and freed.
*
* long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
* Description
* Configure the timer to call the *callback_fn* static function.
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
* **-EPERM** if *timer* is in a map that doesn't have any user references.
* User space should either hold a file descriptor to a map with timers
* or pin such a map in bpffs. When the map is unpinned or the file
* descriptor is closed, all timers in the map will be cancelled and freed.
*
* long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
* Description
* Set the timer expiration to *nsecs* nanoseconds from the current time.
* The configured callback will be invoked in soft irq context on some cpu
* and will not repeat unless another bpf_timer_start() call is made.
* In such a case the next invocation can migrate to a different cpu.
* Since struct bpf_timer is a field inside a map element, the map owns
* the timer. bpf_timer_set_callback() increments the refcnt of the BPF
* program to make sure that the callback_fn code stays valid.
* When the user space reference to a map reaches zero, all timers in the
* map are cancelled and the corresponding programs' refcnts are
* decremented. This is done to make sure that Ctrl-C of a user
* process doesn't leave any timers running. If the map is pinned in
* bpffs, the callback_fn can re-arm itself indefinitely.
* bpf_map_update/delete_elem() helpers and user space sys_bpf commands
* cancel and free the timer in the given map element.
* The map can contain timers that invoke callback_fn-s from different
* programs. The same callback_fn can serve different timers from
* different maps if key/value layout matches across maps.
* Every bpf_timer_set_callback() can have different callback_fn.
*
* Return
* 0 on success.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
* or invalid *flags* are passed.
*
* long bpf_timer_cancel(struct bpf_timer *timer)
* Description
* Cancel the timer and wait for callback_fn to finish if it was running.
* Return
* 0 if the timer was not active.
* 1 if the timer was active.
* **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
* **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
* own timer which would have led to a deadlock otherwise.
*
* u64 bpf_get_func_ip(void *ctx)
* Description
* Get address of the traced function (for tracing and kprobe programs).
* Return
* Address of the traced function.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@ -4951,6 +5018,11 @@ union bpf_attr {
FN(sys_bpf), \
FN(btf_find_by_name_kind), \
FN(sys_close), \
FN(timer_init), \
FN(timer_set_callback), \
FN(timer_start), \
FN(timer_cancel), \
FN(get_func_ip), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@ -6077,6 +6149,11 @@ struct bpf_spin_lock {
__u32 val;
};
struct bpf_timer {
__u64 :64;
__u64 :64;
} __attribute__((aligned(8)));
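Taken together, the intended flow of the new helpers looks roughly as follows — a hedged BPF-side sketch reusing the illustrative timer_map/map_elem/timer_cb names from the verifier notes above, and assuming libbpf's bpf_helpers.h/bpf_tracing.h; bpf_fentry_test1 is just an example attach point, and 1 is CLOCK_MONOTONIC, one of the clockids the documentation allows:

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(arm_timer)
{
	struct map_elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&timer_map, &key);
	if (!val)
		return 0;

	bpf_timer_init(&val->timer, &timer_map, 1 /* CLOCK_MONOTONIC */);	/* -EBUSY if already initialized */
	bpf_timer_set_callback(&val->timer, timer_cb);
	bpf_timer_start(&val->timer, 1000000 /* 1 ms in ns */, 0);
	return 0;
}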
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.


@ -3894,6 +3894,42 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map)
return 0;
}
static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info)
{
char file[PATH_MAX], buff[4096];
FILE *fp;
__u32 val;
int err;
snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd);
memset(info, 0, sizeof(*info));
fp = fopen(file, "r");
if (!fp) {
err = -errno;
pr_warn("failed to open %s: %d. No procfs support?\n", file,
err);
return err;
}
while (fgets(buff, sizeof(buff), fp)) {
if (sscanf(buff, "map_type:\t%u", &val) == 1)
info->type = val;
else if (sscanf(buff, "key_size:\t%u", &val) == 1)
info->key_size = val;
else if (sscanf(buff, "value_size:\t%u", &val) == 1)
info->value_size = val;
else if (sscanf(buff, "max_entries:\t%u", &val) == 1)
info->max_entries = val;
else if (sscanf(buff, "map_flags:\t%i", &val) == 1)
info->map_flags = val;
}
fclose(fp);
return 0;
}
int bpf_map__reuse_fd(struct bpf_map *map, int fd)
{
struct bpf_map_info info = {};
@ -3902,6 +3938,8 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd)
char *new_name;
err = bpf_obj_get_info_by_fd(fd, &info, &len);
if (err && errno == EINVAL)
err = bpf_get_map_info_from_fdinfo(fd, &info);
if (err)
return libbpf_err(err);
@ -4381,12 +4419,16 @@ static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd)
struct bpf_map_info map_info = {};
char msg[STRERR_BUFSIZE];
__u32 map_info_len;
int err;
map_info_len = sizeof(map_info);
if (bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len)) {
pr_warn("failed to get map info for map FD %d: %s\n",
map_fd, libbpf_strerror_r(errno, msg, sizeof(msg)));
err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len);
if (err && errno == EINVAL)
err = bpf_get_map_info_from_fdinfo(map_fd, &map_info);
if (err) {
pr_warn("failed to get map info for map FD %d: %s\n", map_fd,
libbpf_strerror_r(errno, msg, sizeof(msg)));
return false;
}
@ -10304,19 +10346,25 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
return pfd;
}
struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog,
bool retprobe,
const char *func_name)
struct bpf_program_attach_kprobe_opts {
bool retprobe;
unsigned long offset;
};
static struct bpf_link*
bpf_program__attach_kprobe_opts(struct bpf_program *prog,
const char *func_name,
struct bpf_program_attach_kprobe_opts *opts)
{
char errmsg[STRERR_BUFSIZE];
struct bpf_link *link;
int pfd, err;
pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name,
0 /* offset */, -1 /* pid */);
pfd = perf_event_open_probe(false /* uprobe */, opts->retprobe, func_name,
opts->offset, -1 /* pid */);
if (pfd < 0) {
pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n",
prog->name, retprobe ? "kretprobe" : "kprobe", func_name,
prog->name, opts->retprobe ? "kretprobe" : "kprobe", func_name,
libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
return libbpf_err_ptr(pfd);
}
@ -10325,23 +10373,53 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog,
if (err) {
close(pfd);
pr_warn("prog '%s': failed to attach to %s '%s': %s\n",
prog->name, retprobe ? "kretprobe" : "kprobe", func_name,
prog->name, opts->retprobe ? "kretprobe" : "kprobe", func_name,
libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
return libbpf_err_ptr(err);
}
return link;
}
struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog,
bool retprobe,
const char *func_name)
{
struct bpf_program_attach_kprobe_opts opts = {
.retprobe = retprobe,
};
return bpf_program__attach_kprobe_opts(prog, func_name, &opts);
}
static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec,
struct bpf_program *prog)
{
struct bpf_program_attach_kprobe_opts opts;
unsigned long offset = 0;
struct bpf_link *link;
const char *func_name;
bool retprobe;
char *func;
int n, err;
func_name = prog->sec_name + sec->len;
retprobe = strcmp(sec->sec, "kretprobe/") == 0;
opts.retprobe = strcmp(sec->sec, "kretprobe/") == 0;
return bpf_program__attach_kprobe(prog, retprobe, func_name);
n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%lx", &func, &offset);
if (n < 1) {
err = -EINVAL;
pr_warn("kprobe name is invalid: %s\n", func_name);
return libbpf_err_ptr(err);
}
if (opts.retprobe && offset != 0) {
err = -EINVAL;
pr_warn("kretprobes do not support offset specification\n");
return libbpf_err_ptr(err);
}
opts.offset = offset;
link = bpf_program__attach_kprobe_opts(prog, func, &opts);
free(func);
return link;
}
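On the program side, the new parsing means the SEC() name itself can carry the probe offset — a hedged example (kernel_clone is illustrative); kretprobe sections still reject an offset suffix, as enforced above:

SEC("kprobe/kernel_clone+0x10")		/* func = "kernel_clone", offset = 0x10 */
int BPF_KPROBE(clone_plus_0x10)
{
	return 0;
}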
struct bpf_link *bpf_program__attach_uprobe(struct bpf_program *prog,


@ -0,0 +1,53 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include "get_func_ip_test.skel.h"
void test_get_func_ip_test(void)
{
struct get_func_ip_test *skel = NULL;
__u32 duration = 0, retval;
int err, prog_fd;
skel = get_func_ip_test__open();
if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open"))
return;
/* test6 is x86_64 specific because of the instruction
* offset, disabling it for all other archs
*/
#ifndef __x86_64__
bpf_program__set_autoload(skel->progs.test6, false);
#endif
err = get_func_ip_test__load(skel);
if (!ASSERT_OK(err, "get_func_ip_test__load"))
goto cleanup;
err = get_func_ip_test__attach(skel);
if (!ASSERT_OK(err, "get_func_ip_test__attach"))
goto cleanup;
prog_fd = bpf_program__fd(skel->progs.test1);
err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
NULL, NULL, &retval, &duration);
ASSERT_OK(err, "test_run");
ASSERT_EQ(retval, 0, "test_run");
prog_fd = bpf_program__fd(skel->progs.test5);
err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
NULL, NULL, &retval, &duration);
ASSERT_OK(err, "test_run");
ASSERT_EQ(skel->bss->test1_result, 1, "test1_result");
ASSERT_EQ(skel->bss->test2_result, 1, "test2_result");
ASSERT_EQ(skel->bss->test3_result, 1, "test3_result");
ASSERT_EQ(skel->bss->test4_result, 1, "test4_result");
ASSERT_EQ(skel->bss->test5_result, 1, "test5_result");
#ifdef __x86_64__
ASSERT_EQ(skel->bss->test6_result, 1, "test6_result");
#endif
cleanup:
get_func_ip_test__destroy(skel);
}


@ -351,9 +351,11 @@ static void test_insert_opened(int family, int sotype, int mapfd)
errno = 0;
value = s;
err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
if (!err || errno != EOPNOTSUPP)
FAIL_ERRNO("map_update: expected EOPNOTSUPP");
if (sotype == SOCK_STREAM) {
if (!err || errno != EOPNOTSUPP)
FAIL_ERRNO("map_update: expected EOPNOTSUPP");
} else if (err)
FAIL_ERRNO("map_update: expected success");
xclose(s);
}
@ -919,6 +921,23 @@ static const char *redir_mode_str(enum redir_mode mode)
}
}
static int add_to_sockmap(int sock_mapfd, int fd1, int fd2)
{
u64 value;
u32 key;
int err;
key = 0;
value = fd1;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
if (err)
return err;
key = 1;
value = fd2;
return xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
}
static void redir_to_connected(int family, int sotype, int sock_mapfd,
int verd_mapfd, enum redir_mode mode)
{
@ -928,7 +947,6 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
unsigned int pass;
socklen_t len;
int err, n;
u64 value;
u32 key;
char b;
@ -965,15 +983,7 @@ static void redir_to_connected(int family, int sotype, int sock_mapfd,
if (p1 < 0)
goto close_cli1;
key = 0;
value = p0;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
if (err)
goto close_peer1;
key = 1;
value = p1;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
err = add_to_sockmap(sock_mapfd, p0, p1);
if (err)
goto close_peer1;
@ -1061,7 +1071,6 @@ static void redir_to_listening(int family, int sotype, int sock_mapfd,
int s, c, p, err, n;
unsigned int drop;
socklen_t len;
u64 value;
u32 key;
zero_verdict_count(verd_mapfd);
@ -1086,15 +1095,7 @@ static void redir_to_listening(int family, int sotype, int sock_mapfd,
if (p < 0)
goto close_cli;
key = 0;
value = s;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
if (err)
goto close_peer;
key = 1;
value = p;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
err = add_to_sockmap(sock_mapfd, s, p);
if (err)
goto close_peer;
@ -1346,7 +1347,6 @@ static void test_reuseport_mixed_groups(int family, int sotype, int sock_map,
int s1, s2, c, err;
unsigned int drop;
socklen_t len;
u64 value;
u32 key;
zero_verdict_count(verd_map);
@ -1360,16 +1360,10 @@ static void test_reuseport_mixed_groups(int family, int sotype, int sock_map,
if (s2 < 0)
goto close_srv1;
key = 0;
value = s1;
err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST);
err = add_to_sockmap(sock_map, s1, s2);
if (err)
goto close_srv2;
key = 1;
value = s2;
err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST);
/* Connect to s2, reuseport BPF selects s1 via sock_map[0] */
len = sizeof(addr);
err = xgetsockname(s2, sockaddr(&addr), &len);
@ -1441,6 +1435,8 @@ static const char *family_str(sa_family_t family)
return "IPv4";
case AF_INET6:
return "IPv6";
case AF_UNIX:
return "Unix";
default:
return "unknown";
}
@ -1563,6 +1559,99 @@ static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
}
}
static void unix_redir_to_connected(int sotype, int sock_mapfd,
int verd_mapfd, enum redir_mode mode)
{
const char *log_prefix = redir_mode_str(mode);
int c0, c1, p0, p1;
unsigned int pass;
int retries = 100;
int err, n;
int sfd[2];
u32 key;
char b;
zero_verdict_count(verd_mapfd);
if (socketpair(AF_UNIX, sotype | SOCK_NONBLOCK, 0, sfd))
return;
c0 = sfd[0], p0 = sfd[1];
if (socketpair(AF_UNIX, sotype | SOCK_NONBLOCK, 0, sfd))
goto close0;
c1 = sfd[0], p1 = sfd[1];
err = add_to_sockmap(sock_mapfd, p0, p1);
if (err)
goto close;
n = write(c1, "a", 1);
if (n < 0)
FAIL_ERRNO("%s: write", log_prefix);
if (n == 0)
FAIL("%s: incomplete write", log_prefix);
if (n < 1)
goto close;
key = SK_PASS;
err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
if (err)
goto close;
if (pass != 1)
FAIL("%s: want pass count 1, have %d", log_prefix, pass);
again:
n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
if (n < 0) {
if (errno == EAGAIN && retries--)
goto again;
FAIL_ERRNO("%s: read", log_prefix);
}
if (n == 0)
FAIL("%s: incomplete read", log_prefix);
close:
xclose(c1);
xclose(p1);
close0:
xclose(c0);
xclose(p0);
}
static void unix_skb_redir_to_connected(struct test_sockmap_listen *skel,
struct bpf_map *inner_map, int sotype)
{
int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
int verdict_map = bpf_map__fd(skel->maps.verdict_map);
int sock_map = bpf_map__fd(inner_map);
int err;
err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
if (err)
return;
skel->bss->test_ingress = false;
unix_redir_to_connected(sotype, sock_map, verdict_map, REDIR_EGRESS);
skel->bss->test_ingress = true;
unix_redir_to_connected(sotype, sock_map, verdict_map, REDIR_INGRESS);
xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
}
static void test_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
int sotype)
{
const char *family_name, *map_name;
char s[MAX_TEST_NAME];
family_name = family_str(AF_UNIX);
map_name = map_type_str(map);
snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
if (!test__start_subtest(s))
return;
unix_skb_redir_to_connected(skel, map, sotype);
}
static void test_reuseport(struct test_sockmap_listen *skel,
struct bpf_map *map, int family, int sotype)
{
@ -1603,33 +1692,27 @@ static void test_reuseport(struct test_sockmap_listen *skel,
}
}
static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
int verd_mapfd, enum redir_mode mode)
static int udp_socketpair(int family, int *s, int *c)
{
const char *log_prefix = redir_mode_str(mode);
struct sockaddr_storage addr;
int c0, c1, p0, p1;
unsigned int pass;
int retries = 100;
socklen_t len;
int err, n;
u64 value;
u32 key;
char b;
int p0, c0;
int err;
zero_verdict_count(verd_mapfd);
p0 = socket_loopback(family, sotype | SOCK_NONBLOCK);
p0 = socket_loopback(family, SOCK_DGRAM | SOCK_NONBLOCK);
if (p0 < 0)
return;
return p0;
len = sizeof(addr);
err = xgetsockname(p0, sockaddr(&addr), &len);
if (err)
goto close_peer0;
c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0);
if (c0 < 0)
c0 = xsocket(family, SOCK_DGRAM | SOCK_NONBLOCK, 0);
if (c0 < 0) {
err = c0;
goto close_peer0;
}
err = xconnect(c0, sockaddr(&addr), len);
if (err)
goto close_cli0;
@ -1640,35 +1723,38 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
if (err)
goto close_cli0;
p1 = socket_loopback(family, sotype | SOCK_NONBLOCK);
if (p1 < 0)
goto close_cli0;
err = xgetsockname(p1, sockaddr(&addr), &len);
*s = p0;
*c = c0;
return 0;
close_cli0:
xclose(c0);
close_peer0:
xclose(p0);
return err;
}
static void udp_redir_to_connected(int family, int sock_mapfd, int verd_mapfd,
enum redir_mode mode)
{
const char *log_prefix = redir_mode_str(mode);
int c0, c1, p0, p1;
unsigned int pass;
int retries = 100;
int err, n;
u32 key;
char b;
zero_verdict_count(verd_mapfd);
err = udp_socketpair(family, &p0, &c0);
if (err)
return;
err = udp_socketpair(family, &p1, &c1);
if (err)
goto close_cli0;
c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0);
if (c1 < 0)
goto close_peer1;
err = xconnect(c1, sockaddr(&addr), len);
if (err)
goto close_cli1;
err = xgetsockname(c1, sockaddr(&addr), &len);
if (err)
goto close_cli1;
err = xconnect(p1, sockaddr(&addr), len);
if (err)
goto close_cli1;
key = 0;
value = p0;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
if (err)
goto close_cli1;
key = 1;
value = p1;
err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
err = add_to_sockmap(sock_mapfd, p0, p1);
if (err)
goto close_cli1;
@ -1699,11 +1785,9 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd,
close_cli1:
xclose(c1);
close_peer1:
xclose(p1);
close_cli0:
xclose(c0);
close_peer0:
xclose(p0);
}
@ -1720,11 +1804,9 @@ static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel,
return;
skel->bss->test_ingress = false;
udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
REDIR_EGRESS);
udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS);
skel->bss->test_ingress = true;
udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map,
REDIR_INGRESS);
udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS);
xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
}
@ -1743,6 +1825,175 @@ static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map
udp_skb_redir_to_connected(skel, map, family);
}
static void udp_unix_redir_to_connected(int family, int sock_mapfd,
int verd_mapfd, enum redir_mode mode)
{
const char *log_prefix = redir_mode_str(mode);
int c0, c1, p0, p1;
unsigned int pass;
int retries = 100;
int err, n;
int sfd[2];
u32 key;
char b;
zero_verdict_count(verd_mapfd);
if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd))
return;
c0 = sfd[0], p0 = sfd[1];
err = udp_socketpair(family, &p1, &c1);
if (err)
goto close;
err = add_to_sockmap(sock_mapfd, p0, p1);
if (err)
goto close_cli1;
n = write(c1, "a", 1);
if (n < 0)
FAIL_ERRNO("%s: write", log_prefix);
if (n == 0)
FAIL("%s: incomplete write", log_prefix);
if (n < 1)
goto close_cli1;
key = SK_PASS;
err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
if (err)
goto close_cli1;
if (pass != 1)
FAIL("%s: want pass count 1, have %d", log_prefix, pass);
again:
n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
if (n < 0) {
if (errno == EAGAIN && retries--)
goto again;
FAIL_ERRNO("%s: read", log_prefix);
}
if (n == 0)
FAIL("%s: incomplete read", log_prefix);
close_cli1:
xclose(c1);
xclose(p1);
close:
xclose(c0);
xclose(p0);
}
static void udp_unix_skb_redir_to_connected(struct test_sockmap_listen *skel,
struct bpf_map *inner_map, int family)
{
int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
int verdict_map = bpf_map__fd(skel->maps.verdict_map);
int sock_map = bpf_map__fd(inner_map);
int err;
err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
if (err)
return;
skel->bss->test_ingress = false;
udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS);
skel->bss->test_ingress = true;
udp_unix_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS);
xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
}
static void unix_udp_redir_to_connected(int family, int sock_mapfd,
int verd_mapfd, enum redir_mode mode)
{
const char *log_prefix = redir_mode_str(mode);
int c0, c1, p0, p1;
unsigned int pass;
int err, n;
int sfd[2];
u32 key;
char b;
zero_verdict_count(verd_mapfd);
err = udp_socketpair(family, &p0, &c0);
if (err)
return;
if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sfd))
goto close_cli0;
c1 = sfd[0], p1 = sfd[1];
err = add_to_sockmap(sock_mapfd, p0, p1);
if (err)
goto close;
n = write(c1, "a", 1);
if (n < 0)
FAIL_ERRNO("%s: write", log_prefix);
if (n == 0)
FAIL("%s: incomplete write", log_prefix);
if (n < 1)
goto close;
key = SK_PASS;
err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
if (err)
goto close;
if (pass != 1)
FAIL("%s: want pass count 1, have %d", log_prefix, pass);
n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1);
if (n < 0)
FAIL_ERRNO("%s: read", log_prefix);
if (n == 0)
FAIL("%s: incomplete read", log_prefix);
close:
xclose(c1);
xclose(p1);
close_cli0:
xclose(c0);
xclose(p0);
}
static void unix_udp_skb_redir_to_connected(struct test_sockmap_listen *skel,
struct bpf_map *inner_map, int family)
{
int verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
int verdict_map = bpf_map__fd(skel->maps.verdict_map);
int sock_map = bpf_map__fd(inner_map);
int err;
err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0);
if (err)
return;
skel->bss->test_ingress = false;
unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_EGRESS);
skel->bss->test_ingress = true;
unix_udp_redir_to_connected(family, sock_map, verdict_map, REDIR_INGRESS);
xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT);
}
static void test_udp_unix_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
int family)
{
const char *family_name, *map_name;
char s[MAX_TEST_NAME];
family_name = family_str(family);
map_name = map_type_str(map);
snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__);
if (!test__start_subtest(s))
return;
udp_unix_skb_redir_to_connected(skel, map, family);
unix_udp_skb_redir_to_connected(skel, map, family);
}
static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
int family)
{
@ -1752,6 +2003,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
test_reuseport(skel, map, family, SOCK_STREAM);
test_reuseport(skel, map, family, SOCK_DGRAM);
test_udp_redir(skel, map, family);
test_udp_unix_redir(skel, map, family);
}
void test_sockmap_listen(void)
@ -1767,10 +2019,12 @@ void test_sockmap_listen(void)
skel->bss->test_sockmap = true;
run_tests(skel, skel->maps.sock_map, AF_INET);
run_tests(skel, skel->maps.sock_map, AF_INET6);
test_unix_redir(skel, skel->maps.sock_map, SOCK_DGRAM);
skel->bss->test_sockmap = false;
run_tests(skel, skel->maps.sock_hash, AF_INET);
run_tests(skel, skel->maps.sock_hash, AF_INET6);
test_unix_redir(skel, skel->maps.sock_hash, SOCK_DGRAM);
test_sockmap_listen__destroy(skel);
}

View File

@ -0,0 +1,55 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <test_progs.h>
#include "timer.skel.h"
static int timer(struct timer *timer_skel)
{
int err, prog_fd;
__u32 duration = 0, retval;
err = timer__attach(timer_skel);
if (!ASSERT_OK(err, "timer_attach"))
return err;
ASSERT_EQ(timer_skel->data->callback_check, 52, "callback_check1");
ASSERT_EQ(timer_skel->data->callback2_check, 52, "callback2_check1");
prog_fd = bpf_program__fd(timer_skel->progs.test1);
err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
NULL, NULL, &retval, &duration);
ASSERT_OK(err, "test_run");
ASSERT_EQ(retval, 0, "test_run");
timer__detach(timer_skel);
usleep(50); /* 10 usecs should be enough, but give it extra */
/* check that timer_cb2() was executed 10+10 times */
ASSERT_EQ(timer_skel->data->callback_check, 42, "callback_check2");
ASSERT_EQ(timer_skel->data->callback2_check, 42, "callback2_check2");
/* check that timer_cb1() was executed twice */
ASSERT_EQ(timer_skel->bss->bss_data, 10, "bss_data");
/* check that there were no errors in timer execution */
ASSERT_EQ(timer_skel->bss->err, 0, "err");
/* check that code paths completed */
ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok");
return 0;
}
void test_timer(void)
{
struct timer *timer_skel = NULL;
int err;
timer_skel = timer__open_and_load();
if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
goto cleanup;
err = timer(timer_skel);
ASSERT_OK(err, "timer");
cleanup:
timer__destroy(timer_skel);
}

View File

@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <test_progs.h>
#include "timer_mim.skel.h"
#include "timer_mim_reject.skel.h"
static int timer_mim(struct timer_mim *timer_skel)
{
__u32 duration = 0, retval;
__u64 cnt1, cnt2;
int err, prog_fd, key1 = 1;
err = timer_mim__attach(timer_skel);
if (!ASSERT_OK(err, "timer_attach"))
return err;
prog_fd = bpf_program__fd(timer_skel->progs.test1);
err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
NULL, NULL, &retval, &duration);
ASSERT_OK(err, "test_run");
ASSERT_EQ(retval, 0, "test_run");
timer_mim__detach(timer_skel);
/* check that timer_cb[12] are incrementing 'cnt' */
cnt1 = READ_ONCE(timer_skel->bss->cnt);
usleep(200); /* 100 times more than interval */
cnt2 = READ_ONCE(timer_skel->bss->cnt);
ASSERT_GT(cnt2, cnt1, "cnt");
ASSERT_EQ(timer_skel->bss->err, 0, "err");
/* check that code paths completed */
ASSERT_EQ(timer_skel->bss->ok, 1 | 2, "ok");
close(bpf_map__fd(timer_skel->maps.inner_htab));
err = bpf_map_delete_elem(bpf_map__fd(timer_skel->maps.outer_arr), &key1);
ASSERT_EQ(err, 0, "delete inner map");
/* check that timer_cb[12] are no longer running */
cnt1 = READ_ONCE(timer_skel->bss->cnt);
usleep(200);
cnt2 = READ_ONCE(timer_skel->bss->cnt);
ASSERT_EQ(cnt2, cnt1, "cnt");
return 0;
}
void test_timer_mim(void)
{
struct timer_mim_reject *timer_reject_skel = NULL;
libbpf_print_fn_t old_print_fn = NULL;
struct timer_mim *timer_skel = NULL;
int err;
old_print_fn = libbpf_set_print(NULL);
timer_reject_skel = timer_mim_reject__open_and_load();
libbpf_set_print(old_print_fn);
if (!ASSERT_ERR_PTR(timer_reject_skel, "timer_reject_skel_load"))
goto cleanup;
timer_skel = timer_mim__open_and_load();
if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
goto cleanup;
err = timer_mim(timer_skel);
ASSERT_OK(err, "timer_mim");
cleanup:
timer_mim__destroy(timer_skel);
timer_mim_reject__destroy(timer_reject_skel);
}

View File

@ -0,0 +1,105 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#include <network_helpers.h>
#include "test_xdp_context_test_run.skel.h"
void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts,
__u32 data_meta, __u32 data, __u32 data_end,
__u32 ingress_ifindex, __u32 rx_queue_index,
__u32 egress_ifindex)
{
struct xdp_md ctx = {
.data = data,
.data_end = data_end,
.data_meta = data_meta,
.ingress_ifindex = ingress_ifindex,
.rx_queue_index = rx_queue_index,
.egress_ifindex = egress_ifindex,
};
int err;
opts.ctx_in = &ctx;
opts.ctx_size_in = sizeof(ctx);
err = bpf_prog_test_run_opts(prog_fd, &opts);
ASSERT_EQ(errno, EINVAL, "errno-EINVAL");
ASSERT_ERR(err, "bpf_prog_test_run");
}
void test_xdp_context_test_run(void)
{
struct test_xdp_context_test_run *skel = NULL;
char data[sizeof(pkt_v4) + sizeof(__u32)];
char bad_ctx[sizeof(struct xdp_md) + 1];
struct xdp_md ctx_in, ctx_out;
DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
.data_in = &data,
.data_size_in = sizeof(data),
.ctx_out = &ctx_out,
.ctx_size_out = sizeof(ctx_out),
.repeat = 1,
);
int err, prog_fd;
skel = test_xdp_context_test_run__open_and_load();
if (!ASSERT_OK_PTR(skel, "skel"))
return;
prog_fd = bpf_program__fd(skel->progs.xdp_context);
/* Data past the end of the kernel's struct xdp_md must be 0 */
bad_ctx[sizeof(bad_ctx) - 1] = 1;
opts.ctx_in = bad_ctx;
opts.ctx_size_in = sizeof(bad_ctx);
err = bpf_prog_test_run_opts(prog_fd, &opts);
ASSERT_EQ(errno, E2BIG, "extradata-errno");
ASSERT_ERR(err, "bpf_prog_test_run(extradata)");
*(__u32 *)data = XDP_PASS;
*(struct ipv4_packet *)(data + sizeof(__u32)) = pkt_v4;
opts.ctx_in = &ctx_in;
opts.ctx_size_in = sizeof(ctx_in);
memset(&ctx_in, 0, sizeof(ctx_in));
ctx_in.data_meta = 0;
ctx_in.data = sizeof(__u32);
ctx_in.data_end = ctx_in.data + sizeof(pkt_v4);
err = bpf_prog_test_run_opts(prog_fd, &opts);
ASSERT_OK(err, "bpf_prog_test_run(valid)");
ASSERT_EQ(opts.retval, XDP_PASS, "valid-retval");
ASSERT_EQ(opts.data_size_out, sizeof(pkt_v4), "valid-datasize");
ASSERT_EQ(opts.ctx_size_out, opts.ctx_size_in, "valid-ctxsize");
ASSERT_EQ(ctx_out.data_meta, 0, "valid-datameta");
ASSERT_EQ(ctx_out.data, 0, "valid-data");
ASSERT_EQ(ctx_out.data_end, sizeof(pkt_v4), "valid-dataend");
/* Meta data's size must be a multiple of 4 */
test_xdp_context_error(prog_fd, opts, 0, 1, sizeof(data), 0, 0, 0);
/* data_meta must reference the start of data */
test_xdp_context_error(prog_fd, opts, 4, sizeof(__u32), sizeof(data),
0, 0, 0);
/* Meta data must be 32 bytes or smaller */
test_xdp_context_error(prog_fd, opts, 0, 36, sizeof(data), 0, 0, 0);
/* Total size of data must match data_end - data_meta */
test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32),
sizeof(data) - 1, 0, 0, 0);
test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32),
sizeof(data) + 1, 0, 0, 0);
/* RX queue cannot be specified without specifying an ingress */
test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data),
0, 1, 0);
/* Interface 1 is always the loopback interface, which has only one
 * RX queue (index 0). This makes index 1 an invalid RX queue index
 * for interface 1.
 */
test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data),
1, 1, 0);
/* The egress cannot be specified */
test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data),
0, 0, 1);
test_xdp_context_test_run__destroy(skel);
}
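For quick reference, the new capability this test exercises is passing a populated struct xdp_md into the test run. A minimal sketch only, assuming prog_fd and a data buffer prepared as in the test above:
/* hand a caller-supplied xdp_md context to BPF_PROG_RUN */
struct xdp_md ctx_in = {
	.data = sizeof(__u32),	/* first 4 bytes of 'data' are metadata; data_meta stays 0 */
	.data_end = sizeof(__u32) + sizeof(pkt_v4),
};
struct xdp_md ctx_out = {};
DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
		    .data_in = &data,
		    .data_size_in = sizeof(data),
		    .ctx_in = &ctx_in,
		    .ctx_size_in = sizeof(ctx_in),
		    .ctx_out = &ctx_out,
		    .ctx_size_out = sizeof(ctx_out),
		    .repeat = 1,
);
int err = bpf_prog_test_run_opts(prog_fd, &opts);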

View File

@ -7,64 +7,53 @@
#define IFINDEX_LO 1
void test_xdp_with_cpumap_helpers(void)
void test_xdp_cpumap_attach(void)
{
struct test_xdp_with_cpumap_helpers *skel;
struct bpf_prog_info info = {};
__u32 len = sizeof(info);
struct bpf_cpumap_val val = {
.qsize = 192,
};
__u32 duration = 0, idx = 0;
__u32 len = sizeof(info);
int err, prog_fd, map_fd;
__u32 idx = 0;
skel = test_xdp_with_cpumap_helpers__open_and_load();
if (CHECK_FAIL(!skel)) {
perror("test_xdp_with_cpumap_helpers__open_and_load");
if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_helpers__open_and_load"))
return;
}
/* can not attach program with cpumaps that allow programs
* as xdp generic
*/
prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
CHECK(err == 0, "Generic attach of program with 8-byte CPUMAP",
"should have failed\n");
if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP"))
goto out_close;
err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
ASSERT_OK(err, "XDP program detach");
prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
map_fd = bpf_map__fd(skel->maps.cpu_map);
err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
if (CHECK_FAIL(err))
if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
goto out_close;
val.bpf_prog.fd = prog_fd;
err = bpf_map_update_elem(map_fd, &idx, &val, 0);
CHECK(err, "Add program to cpumap entry", "err %d errno %d\n",
err, errno);
ASSERT_OK(err, "Add program to cpumap entry");
err = bpf_map_lookup_elem(map_fd, &idx, &val);
CHECK(err, "Read cpumap entry", "err %d errno %d\n", err, errno);
CHECK(info.id != val.bpf_prog.id, "Expected program id in cpumap entry",
"expected %u read %u\n", info.id, val.bpf_prog.id);
ASSERT_OK(err, "Read cpumap entry");
ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id");
/* can not attach BPF_XDP_CPUMAP program to a device */
err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
CHECK(err == 0, "Attach of BPF_XDP_CPUMAP program",
"should have failed\n");
if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_CPUMAP program"))
bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
val.qsize = 192;
val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
err = bpf_map_update_elem(map_fd, &idx, &val, 0);
CHECK(err == 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry",
"should have failed\n");
ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry");
out_close:
test_xdp_with_cpumap_helpers__destroy(skel);
}
void test_xdp_cpumap_attach(void)
{
if (test__start_subtest("cpumap_with_progs"))
test_xdp_with_cpumap_helpers();
}

View File

@ -16,50 +16,45 @@ void test_xdp_with_devmap_helpers(void)
.ifindex = IFINDEX_LO,
};
__u32 len = sizeof(info);
__u32 duration = 0, idx = 0;
int err, dm_fd, map_fd;
__u32 idx = 0;
skel = test_xdp_with_devmap_helpers__open_and_load();
if (CHECK_FAIL(!skel)) {
perror("test_xdp_with_devmap_helpers__open_and_load");
if (!ASSERT_OK_PTR(skel, "test_xdp_with_devmap_helpers__open_and_load"))
return;
}
/* can not attach program with DEVMAPs that allow programs
* as xdp generic
*/
dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
CHECK(err == 0, "Generic attach of program with 8-byte devmap",
"should have failed\n");
if (!ASSERT_OK(err, "Generic attach of program with 8-byte devmap"))
goto out_close;
err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
ASSERT_OK(err, "XDP program detach");
dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm);
map_fd = bpf_map__fd(skel->maps.dm_ports);
err = bpf_obj_get_info_by_fd(dm_fd, &info, &len);
if (CHECK_FAIL(err))
if (!ASSERT_OK(err, "bpf_obj_get_info_by_fd"))
goto out_close;
val.bpf_prog.fd = dm_fd;
err = bpf_map_update_elem(map_fd, &idx, &val, 0);
CHECK(err, "Add program to devmap entry",
"err %d errno %d\n", err, errno);
ASSERT_OK(err, "Add program to devmap entry");
err = bpf_map_lookup_elem(map_fd, &idx, &val);
CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno);
CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry",
"expected %u read %u\n", info.id, val.bpf_prog.id);
ASSERT_OK(err, "Read devmap entry");
ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to devmap entry prog_id");
/* can not attach BPF_XDP_DEVMAP program to a device */
err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program",
"should have failed\n");
if (!ASSERT_NEQ(err, 0, "Attach of BPF_XDP_DEVMAP program"))
bpf_set_link_xdp_fd(IFINDEX_LO, -1, XDP_FLAGS_SKB_MODE);
val.ifindex = 1;
val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
err = bpf_map_update_elem(map_fd, &idx, &val, 0);
CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry",
"should have failed\n");
ASSERT_NEQ(err, 0, "Add non-BPF_XDP_DEVMAP program to devmap entry");
out_close:
test_xdp_with_devmap_helpers__destroy(skel);
@ -68,12 +63,10 @@ void test_xdp_with_devmap_helpers(void)
void test_neg_xdp_devmap_helpers(void)
{
struct test_xdp_devmap_helpers *skel;
__u32 duration = 0;
skel = test_xdp_devmap_helpers__open_and_load();
if (CHECK(skel,
"Load of XDP program accessing egress ifindex without attach type",
"should have failed\n")) {
if (!ASSERT_EQ(skel, NULL,
"Load of XDP program accessing egress ifindex without attach type")) {
test_xdp_devmap_helpers__destroy(skel);
}
}

View File

@ -0,0 +1,73 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
extern const void bpf_fentry_test1 __ksym;
extern const void bpf_fentry_test2 __ksym;
extern const void bpf_fentry_test3 __ksym;
extern const void bpf_fentry_test4 __ksym;
extern const void bpf_modify_return_test __ksym;
extern const void bpf_fentry_test6 __ksym;
__u64 test1_result = 0;
SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
__u64 addr = bpf_get_func_ip(ctx);
test1_result = (const void *) addr == &bpf_fentry_test1;
return 0;
}
__u64 test2_result = 0;
SEC("fexit/bpf_fentry_test2")
int BPF_PROG(test2, int a)
{
__u64 addr = bpf_get_func_ip(ctx);
test2_result = (const void *) addr == &bpf_fentry_test2;
return 0;
}
__u64 test3_result = 0;
SEC("kprobe/bpf_fentry_test3")
int test3(struct pt_regs *ctx)
{
__u64 addr = bpf_get_func_ip(ctx);
test3_result = (const void *) addr == &bpf_fentry_test3;
return 0;
}
__u64 test4_result = 0;
SEC("kretprobe/bpf_fentry_test4")
int BPF_KRETPROBE(test4)
{
__u64 addr = bpf_get_func_ip(ctx);
test4_result = (const void *) addr == &bpf_fentry_test4;
return 0;
}
__u64 test5_result = 0;
SEC("fmod_ret/bpf_modify_return_test")
int BPF_PROG(test5, int a, int *b, int ret)
{
__u64 addr = bpf_get_func_ip(ctx);
test5_result = (const void *) addr == &bpf_modify_return_test;
return ret;
}
__u64 test6_result = 0;
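/* the kprobe below is attached 0x5 bytes into bpf_fentry_test6;
 * for kprobes bpf_get_func_ip() is expected to return the attach
 * address, hence the '+ 5' in the check.
 */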
SEC("kprobe/bpf_fentry_test6+0x5")
int test6(struct pt_regs *ctx)
{
__u64 addr = bpf_get_func_ip(ctx);
test6_result = (const void *) addr == &bpf_fentry_test6 + 5;
return 0;
}

View File

@ -528,7 +528,6 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb)
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
{
char buf[sizeof(struct v6hdr)];
struct gre_hdr greh;
struct udphdr udph;
int olen = len;

View File

@ -0,0 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
SEC("xdp")
int xdp_context(struct xdp_md *xdp)
{
void *data = (void *)(long)xdp->data;
__u32 *metadata = (void *)(long)xdp->data_meta;
__u32 ret;
if (metadata + 1 > data)
return XDP_ABORTED;
ret = *metadata;
if (bpf_xdp_adjust_meta(xdp, 4))
return XDP_ABORTED;
return ret;
}
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,297 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <linux/bpf.h>
#include <time.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include "bpf_tcp_helpers.h"
char _license[] SEC("license") = "GPL";
struct hmap_elem {
int counter;
struct bpf_timer timer;
struct bpf_spin_lock lock; /* unused */
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1000);
__type(key, int);
__type(value, struct hmap_elem);
} hmap SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(map_flags, BPF_F_NO_PREALLOC);
__uint(max_entries, 1000);
__type(key, int);
__type(value, struct hmap_elem);
} hmap_malloc SEC(".maps");
struct elem {
struct bpf_timer t;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 2);
__type(key, int);
__type(value, struct elem);
} array SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 4);
__type(key, int);
__type(value, struct elem);
} lru SEC(".maps");
__u64 bss_data;
__u64 err;
__u64 ok;
__u64 callback_check = 52;
__u64 callback2_check = 52;
#define ARRAY 1
#define HTAB 2
#define HTAB_MALLOC 3
#define LRU 4
/* callback for array and lru timers */
static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
{
/* increment bss variable twice.
* Once via array timer callback and once via lru timer callback
*/
bss_data += 5;
/* *key == ARRAY (1) - the callback was called for the array timer.
 * *key == LRU (4) - the callback was called from the lru timer.
 */
if (*key == ARRAY) {
struct bpf_timer *lru_timer;
int lru_key = LRU;
/* rearm array timer to be called again in ~35 seconds */
if (bpf_timer_start(timer, 1ull << 35, 0) != 0)
err |= 1;
lru_timer = bpf_map_lookup_elem(&lru, &lru_key);
if (!lru_timer)
return 0;
bpf_timer_set_callback(lru_timer, timer_cb1);
if (bpf_timer_start(lru_timer, 0, 0) != 0)
err |= 2;
} else if (*key == LRU) {
int lru_key, i;
for (i = LRU + 1;
i <= 100 /* for current LRU eviction algorithm this number
* should be larger than ~ lru->max_entries * 2
*/;
i++) {
struct elem init = {};
/* lru_key cannot be used as loop induction variable
* otherwise the loop will be unbounded.
*/
lru_key = i;
/* add more elements into lru map to push out current
* element and force deletion of this timer
*/
bpf_map_update_elem(map, &lru_key, &init, 0);
/* look it up to bump it into active list */
bpf_map_lookup_elem(map, &lru_key);
/* keep adding until *key changes underneath,
* which means that key/timer memory was reused
*/
if (*key != LRU)
break;
}
/* check that the timer was removed */
if (bpf_timer_cancel(timer) != -EINVAL)
err |= 4;
ok |= 1;
}
return 0;
}
SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
struct bpf_timer *arr_timer, *lru_timer;
struct elem init = {};
int lru_key = LRU;
int array_key = ARRAY;
arr_timer = bpf_map_lookup_elem(&array, &array_key);
if (!arr_timer)
return 0;
bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC);
bpf_map_update_elem(&lru, &lru_key, &init, 0);
lru_timer = bpf_map_lookup_elem(&lru, &lru_key);
if (!lru_timer)
return 0;
bpf_timer_init(lru_timer, &lru, CLOCK_MONOTONIC);
bpf_timer_set_callback(arr_timer, timer_cb1);
bpf_timer_start(arr_timer, 0 /* call timer_cb1 asap */, 0);
/* init more timers to check that array destruction
* doesn't leak timer memory.
*/
array_key = 0;
arr_timer = bpf_map_lookup_elem(&array, &array_key);
if (!arr_timer)
return 0;
bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC);
return 0;
}
/* callback for prealloc and non-prealloc hashtab timers */
static int timer_cb2(void *map, int *key, struct hmap_elem *val)
{
if (*key == HTAB)
callback_check--;
else
callback2_check--;
if (val->counter > 0 && --val->counter) {
/* re-arm the timer again to execute after 1 usec */
bpf_timer_start(&val->timer, 1000, 0);
} else if (*key == HTAB) {
struct bpf_timer *arr_timer;
int array_key = ARRAY;
/* cancel arr_timer otherwise bpf_fentry_test1 prog
* will stay alive forever.
*/
arr_timer = bpf_map_lookup_elem(&array, &array_key);
if (!arr_timer)
return 0;
if (bpf_timer_cancel(arr_timer) != 1)
/* bpf_timer_cancel should return 1 to indicate
* that arr_timer was active at this time
*/
err |= 8;
/* try to cancel ourself. It shouldn't deadlock. */
if (bpf_timer_cancel(&val->timer) != -EDEADLK)
err |= 16;
/* delete this key and this timer anyway.
* It shouldn't deadlock either.
*/
bpf_map_delete_elem(map, key);
/* in preallocated hashmap both 'key' and 'val' could have been
* reused to store another map element (like in LRU above),
* but in controlled test environment the below test works.
* It's not a use-after-free. The memory is owned by the map.
*/
if (bpf_timer_start(&val->timer, 1000, 0) != -EINVAL)
err |= 32;
ok |= 2;
} else {
if (*key != HTAB_MALLOC)
err |= 64;
/* try to cancel ourself. It shouldn't deadlock. */
if (bpf_timer_cancel(&val->timer) != -EDEADLK)
err |= 128;
/* delete this key and this timer anyway.
* It shouldn't deadlock either.
*/
bpf_map_delete_elem(map, key);
/* in non-preallocated hashmap both 'key' and 'val' are RCU
* protected and still valid though this element was deleted
* from the map. Arm this timer for ~35 seconds. When callback
* finishes the call_rcu will invoke:
* htab_elem_free_rcu
* check_and_free_timer
* bpf_timer_cancel_and_free
* to cancel this 35 second sleep and delete the timer for real.
*/
if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0)
err |= 256;
ok |= 4;
}
return 0;
}
int bpf_timer_test(void)
{
struct hmap_elem *val;
int key = HTAB, key_malloc = HTAB_MALLOC;
val = bpf_map_lookup_elem(&hmap, &key);
if (val) {
if (bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME) != 0)
err |= 512;
bpf_timer_set_callback(&val->timer, timer_cb2);
bpf_timer_start(&val->timer, 1000, 0);
}
val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
if (val) {
if (bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME) != 0)
err |= 1024;
bpf_timer_set_callback(&val->timer, timer_cb2);
bpf_timer_start(&val->timer, 1000, 0);
}
return 0;
}
SEC("fentry/bpf_fentry_test2")
int BPF_PROG(test2, int a, int b)
{
struct hmap_elem init = {}, *val;
int key = HTAB, key_malloc = HTAB_MALLOC;
init.counter = 10; /* number of times to trigger timer_cb2 */
bpf_map_update_elem(&hmap, &key, &init, 0);
val = bpf_map_lookup_elem(&hmap, &key);
if (val)
bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME);
/* update the same key to free the timer */
bpf_map_update_elem(&hmap, &key, &init, 0);
bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
if (val)
bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME);
/* update the same key to free the timer */
bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
/* init more timers to check that htab operations
* don't leak timer memory.
*/
key = 0;
bpf_map_update_elem(&hmap, &key, &init, 0);
val = bpf_map_lookup_elem(&hmap, &key);
if (val)
bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME);
bpf_map_delete_elem(&hmap, &key);
bpf_map_update_elem(&hmap, &key, &init, 0);
val = bpf_map_lookup_elem(&hmap, &key);
if (val)
bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME);
/* and with non-prealloc htab */
key_malloc = 0;
bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
if (val)
bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME);
bpf_map_delete_elem(&hmap_malloc, &key_malloc);
bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
if (val)
bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME);
return bpf_timer_test();
}
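Condensed from the test above, the bpf_timer usage it exercises boils down to the pattern below; a minimal sketch only, with placeholder names (my_map, my_cb) and the same includes as the file above assumed:
struct my_val {
	struct bpf_timer timer;
};
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct my_val);
} my_map SEC(".maps");
/* invoked asynchronously once the timer expires; may re-arm itself */
static int my_cb(void *map, int *key, struct my_val *val)
{
	return 0;
}
SEC("fentry/bpf_fentry_test1")
int BPF_PROG(arm_timer)
{
	int key = 0;
	struct my_val *val;
	val = bpf_map_lookup_elem(&my_map, &key);
	if (!val)
		return 0;
	bpf_timer_init(&val->timer, &my_map, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&val->timer, my_cb);
	bpf_timer_start(&val->timer, 1000 /* ns */, 0);
	return 0;
}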

View File

@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <linux/bpf.h>
#include <time.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include "bpf_tcp_helpers.h"
char _license[] SEC("license") = "GPL";
struct hmap_elem {
int pad; /* unused */
struct bpf_timer timer;
};
struct inner_map {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1024);
__type(key, int);
__type(value, struct hmap_elem);
} inner_htab SEC(".maps");
#define ARRAY_KEY 1
#define HASH_KEY 1234
struct outer_arr {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 2);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
__array(values, struct inner_map);
} outer_arr SEC(".maps") = {
.values = { [ARRAY_KEY] = &inner_htab },
};
__u64 err;
__u64 ok;
__u64 cnt;
static int timer_cb1(void *map, int *key, struct hmap_elem *val);
static int timer_cb2(void *map, int *key, struct hmap_elem *val)
{
cnt++;
bpf_timer_set_callback(&val->timer, timer_cb1);
if (bpf_timer_start(&val->timer, 1000, 0))
err |= 1;
ok |= 1;
return 0;
}
/* callback for inner hash map */
static int timer_cb1(void *map, int *key, struct hmap_elem *val)
{
cnt++;
bpf_timer_set_callback(&val->timer, timer_cb2);
if (bpf_timer_start(&val->timer, 1000, 0))
err |= 2;
/* Do a lookup to make sure 'map' and 'key' pointers are correct */
bpf_map_lookup_elem(map, key);
ok |= 2;
return 0;
}
SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
struct hmap_elem init = {};
struct bpf_map *inner_map;
struct hmap_elem *val;
int array_key = ARRAY_KEY;
int hash_key = HASH_KEY;
inner_map = bpf_map_lookup_elem(&outer_arr, &array_key);
if (!inner_map)
return 0;
bpf_map_update_elem(inner_map, &hash_key, &init, 0);
val = bpf_map_lookup_elem(inner_map, &hash_key);
if (!val)
return 0;
bpf_timer_init(&val->timer, inner_map, CLOCK_MONOTONIC);
if (bpf_timer_set_callback(&val->timer, timer_cb1))
err |= 4;
if (bpf_timer_start(&val->timer, 0, 0))
err |= 8;
return 0;
}

View File

@ -0,0 +1,74 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <linux/bpf.h>
#include <time.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include "bpf_tcp_helpers.h"
char _license[] SEC("license") = "GPL";
struct hmap_elem {
int pad; /* unused */
struct bpf_timer timer;
};
struct inner_map {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1024);
__type(key, int);
__type(value, struct hmap_elem);
} inner_htab SEC(".maps");
#define ARRAY_KEY 1
#define ARRAY_KEY2 2
#define HASH_KEY 1234
struct outer_arr {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 2);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
__array(values, struct inner_map);
} outer_arr SEC(".maps") = {
.values = { [ARRAY_KEY] = &inner_htab },
};
__u64 err;
__u64 ok;
__u64 cnt;
/* callback for inner hash map */
static int timer_cb(void *map, int *key, struct hmap_elem *val)
{
return 0;
}
SEC("fentry/bpf_fentry_test1")
int BPF_PROG(test1, int a)
{
struct hmap_elem init = {};
struct bpf_map *inner_map, *inner_map2;
struct hmap_elem *val;
int array_key = ARRAY_KEY;
int array_key2 = ARRAY_KEY2;
int hash_key = HASH_KEY;
inner_map = bpf_map_lookup_elem(&outer_arr, &array_key);
if (!inner_map)
return 0;
inner_map2 = bpf_map_lookup_elem(&outer_arr, &array_key2);
if (!inner_map2)
return 0;
bpf_map_update_elem(inner_map, &hash_key, &init, 0);
val = bpf_map_lookup_elem(inner_map, &hash_key);
if (!val)
return 0;
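/* the timer element lives in inner_map, but inner_map2 is passed to
 * bpf_timer_init() below; the verifier is expected to reject this
 * mismatch, which is what ASSERT_ERR_PTR() in prog_tests/timer_mim.c
 * checks at load time.
 */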
bpf_timer_init(&val->timer, inner_map2, CLOCK_MONOTONIC);
if (bpf_timer_set_callback(&val->timer, timer_cb))
err |= 4;
if (bpf_timer_start(&val->timer, 0, 0))
err |= 8;
return 0;
}