bpf: add lookup/update support for per-cpu hash and array maps
The functions bpf_map_lookup_elem(map, key, value) and bpf_map_update_elem(map, key, value, flags) need to get/set the values of all CPUs for per-cpu hash and array maps, so that user space can aggregate or update them as necessary.

Example of single counter aggregation in user space:

  unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
  long values[nr_cpus];
  long value = 0;

  bpf_lookup_elem(fd, key, values);
  for (i = 0; i < nr_cpus; i++)
          value += values[i];

User space must provide an array of round_up(value_size, 8) * nr_cpus bytes to get/set values, because the kernel copies each per-cpu value with 'long'-sized accesses in an attempt to read and write counters atomically. This is best-effort only, since bpf programs and user space are racing to access the same memory.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
a10423b87a
commit
15a07b3381
|
@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog);
|
||||||
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
|
int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
|
||||||
int bpf_obj_get_user(const char __user *pathname);
|
int bpf_obj_get_user(const char __user *pathname);
|
||||||
|
|
||||||
|
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
|
||||||
|
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
|
||||||
|
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
|
||||||
|
u64 flags);
|
||||||
|
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
|
||||||
|
u64 flags);
|
||||||
|
|
||||||
|
/* Word-wise copy used to move per-cpu map values to and from the buffer
 * handled by the bpf syscall.
 *
 * @dst:  destination, must be suitably aligned for 'long' stores
 * @src:  source, must be suitably aligned for 'long' loads
 * @size: number of bytes to copy; expected to be a multiple of 8 (the
 *        callers pass round_up(value_size, 8)). Only size / sizeof(long)
 *        whole words are copied, so a trailing remainder is left untouched.
 *
 * Every access is forced to be a 'long' read/write to try to atomically
 * copy long counters. Best-effort only. No barriers here, since it _will_
 * race with concurrent updates from BPF programs. Called from bpf syscall
 * and mostly used with size 8 or 16 bytes, so ask compiler to inline it.
 */
static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
{
	const long *lsrc = src;
	long *ldst = dst;
	u32 words = size / sizeof(long);

	while (words--)
		*ldst++ = *lsrc++;
}
|
||||||
|
|
||||||
/* verify correctness of eBPF program */
|
/* verify correctness of eBPF program */
|
||||||
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
|
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
|
||||||
return this_cpu_ptr(array->pptrs[index]);
|
return this_cpu_ptr(array->pptrs[index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
|
||||||
|
{
|
||||||
|
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
||||||
|
u32 index = *(u32 *)key;
|
||||||
|
void __percpu *pptr;
|
||||||
|
int cpu, off = 0;
|
||||||
|
u32 size;
|
||||||
|
|
||||||
|
if (unlikely(index >= array->map.max_entries))
|
||||||
|
return -ENOENT;
|
||||||
|
|
||||||
|
/* per_cpu areas are zero-filled and bpf programs can only
|
||||||
|
* access 'value_size' of them, so copying rounded areas
|
||||||
|
* will not leak any kernel data
|
||||||
|
*/
|
||||||
|
size = round_up(map->value_size, 8);
|
||||||
|
rcu_read_lock();
|
||||||
|
pptr = array->pptrs[index];
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
|
||||||
|
off += size;
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/* Called from syscall */
|
/* Called from syscall */
|
||||||
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
|
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
|
||||||
{
|
{
|
||||||
|
@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
|
||||||
|
u64 map_flags)
|
||||||
|
{
|
||||||
|
struct bpf_array *array = container_of(map, struct bpf_array, map);
|
||||||
|
u32 index = *(u32 *)key;
|
||||||
|
void __percpu *pptr;
|
||||||
|
int cpu, off = 0;
|
||||||
|
u32 size;
|
||||||
|
|
||||||
|
if (unlikely(map_flags > BPF_EXIST))
|
||||||
|
/* unknown flags */
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (unlikely(index >= array->map.max_entries))
|
||||||
|
/* all elements were pre-allocated, cannot insert a new one */
|
||||||
|
return -E2BIG;
|
||||||
|
|
||||||
|
if (unlikely(map_flags == BPF_NOEXIST))
|
||||||
|
/* all elements already exist */
|
||||||
|
return -EEXIST;
|
||||||
|
|
||||||
|
/* the user space will provide round_up(value_size, 8) bytes that
|
||||||
|
* will be copied into per-cpu area. bpf programs can only access
|
||||||
|
* value_size of it. During lookup the same extra bytes will be
|
||||||
|
* returned or zeros which were zero-filled by percpu_alloc,
|
||||||
|
* so no kernel data leaks possible
|
||||||
|
*/
|
||||||
|
size = round_up(map->value_size, 8);
|
||||||
|
rcu_read_lock();
|
||||||
|
pptr = array->pptrs[index];
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
|
||||||
|
off += size;
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/* Called from syscall or from eBPF program */
|
/* Called from syscall or from eBPF program */
|
||||||
static int array_map_delete_elem(struct bpf_map *map, void *key)
|
static int array_map_delete_elem(struct bpf_map *map, void *key)
|
||||||
{
|
{
|
||||||
|
|
|
@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
|
||||||
|
|
||||||
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
||||||
void *value, u32 key_size, u32 hash,
|
void *value, u32 key_size, u32 hash,
|
||||||
bool percpu)
|
bool percpu, bool onallcpus)
|
||||||
{
|
{
|
||||||
u32 size = htab->map.value_size;
|
u32 size = htab->map.value_size;
|
||||||
struct htab_elem *l_new;
|
struct htab_elem *l_new;
|
||||||
|
@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* copy true value_size bytes */
|
if (!onallcpus) {
|
||||||
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
|
/* copy true value_size bytes */
|
||||||
|
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
|
||||||
|
} else {
|
||||||
|
int off = 0, cpu;
|
||||||
|
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
|
||||||
|
value + off, size);
|
||||||
|
off += size;
|
||||||
|
}
|
||||||
|
}
|
||||||
htab_elem_set_ptr(l_new, key_size, pptr);
|
htab_elem_set_ptr(l_new, key_size, pptr);
|
||||||
} else {
|
} else {
|
||||||
memcpy(l_new->key + round_up(key_size, 8), value, size);
|
memcpy(l_new->key + round_up(key_size, 8), value, size);
|
||||||
|
@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
|
||||||
/* allocate new element outside of the lock, since
|
/* allocate new element outside of the lock, since
|
||||||
* we're most likely going to insert it
|
* we're most likely going to insert it
|
||||||
*/
|
*/
|
||||||
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
|
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
|
||||||
if (!l_new)
|
if (!l_new)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
|
@ -402,8 +412,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
|
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
|
||||||
void *value, u64 map_flags)
|
void *value, u64 map_flags,
|
||||||
|
bool onallcpus)
|
||||||
{
|
{
|
||||||
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
|
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
|
||||||
struct htab_elem *l_new = NULL, *l_old;
|
struct htab_elem *l_new = NULL, *l_old;
|
||||||
|
@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
|
||||||
goto err;
|
goto err;
|
||||||
|
|
||||||
if (l_old) {
|
if (l_old) {
|
||||||
|
void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
|
||||||
|
u32 size = htab->map.value_size;
|
||||||
|
|
||||||
/* per-cpu hash map can update value in-place */
|
/* per-cpu hash map can update value in-place */
|
||||||
memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
|
if (!onallcpus) {
|
||||||
value, htab->map.value_size);
|
memcpy(this_cpu_ptr(pptr), value, size);
|
||||||
|
} else {
|
||||||
|
int off = 0, cpu;
|
||||||
|
|
||||||
|
size = round_up(size, 8);
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
|
||||||
|
value + off, size);
|
||||||
|
off += size;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
l_new = alloc_htab_elem(htab, key, value, key_size,
|
l_new = alloc_htab_elem(htab, key, value, key_size,
|
||||||
hash, true);
|
hash, true, onallcpus);
|
||||||
if (!l_new) {
|
if (!l_new) {
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto err;
|
goto err;
|
||||||
|
@ -455,6 +479,12 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Per-cpu hash map update variant that forwards to the shared
 * implementation with 'onallcpus' disabled, i.e. 'value' is a single
 * value_size-byte value and not an all-CPU array.
 */
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	const bool onallcpus = false;

	return __htab_percpu_map_update_elem(map, key, value, map_flags,
					     onallcpus);
}
|
||||||
|
|
||||||
/* Called from syscall or from eBPF program */
|
/* Called from syscall or from eBPF program */
|
||||||
static int htab_map_delete_elem(struct bpf_map *map, void *key)
|
static int htab_map_delete_elem(struct bpf_map *map, void *key)
|
||||||
{
|
{
|
||||||
|
@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
|
||||||
|
{
|
||||||
|
struct htab_elem *l;
|
||||||
|
void __percpu *pptr;
|
||||||
|
int ret = -ENOENT;
|
||||||
|
int cpu, off = 0;
|
||||||
|
u32 size;
|
||||||
|
|
||||||
|
/* per_cpu areas are zero-filled and bpf programs can only
|
||||||
|
* access 'value_size' of them, so copying rounded areas
|
||||||
|
* will not leak any kernel data
|
||||||
|
*/
|
||||||
|
size = round_up(map->value_size, 8);
|
||||||
|
rcu_read_lock();
|
||||||
|
l = __htab_map_lookup_elem(map, key);
|
||||||
|
if (!l)
|
||||||
|
goto out;
|
||||||
|
pptr = htab_elem_get_ptr(l, map->key_size);
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
bpf_long_memcpy(value + off,
|
||||||
|
per_cpu_ptr(pptr, cpu), size);
|
||||||
|
off += size;
|
||||||
|
}
|
||||||
|
ret = 0;
|
||||||
|
out:
|
||||||
|
rcu_read_unlock();
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
|
||||||
|
u64 map_flags)
|
||||||
|
{
|
||||||
|
return __htab_percpu_map_update_elem(map, key, value, map_flags, true);
|
||||||
|
}
|
||||||
|
|
||||||
static const struct bpf_map_ops htab_percpu_ops = {
|
static const struct bpf_map_ops htab_percpu_ops = {
|
||||||
.map_alloc = htab_map_alloc,
|
.map_alloc = htab_map_alloc,
|
||||||
.map_free = htab_map_free,
|
.map_free = htab_map_free,
|
||||||
|
|
|
@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr)
|
||||||
int ufd = attr->map_fd;
|
int ufd = attr->map_fd;
|
||||||
struct bpf_map *map;
|
struct bpf_map *map;
|
||||||
void *key, *value, *ptr;
|
void *key, *value, *ptr;
|
||||||
|
u32 value_size;
|
||||||
struct fd f;
|
struct fd f;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
|
@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr)
|
||||||
if (copy_from_user(key, ukey, map->key_size) != 0)
|
if (copy_from_user(key, ukey, map->key_size) != 0)
|
||||||
goto free_key;
|
goto free_key;
|
||||||
|
|
||||||
|
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||||
|
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
|
||||||
|
value_size = round_up(map->value_size, 8) * num_possible_cpus();
|
||||||
|
else
|
||||||
|
value_size = map->value_size;
|
||||||
|
|
||||||
err = -ENOMEM;
|
err = -ENOMEM;
|
||||||
value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
|
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
|
||||||
if (!value)
|
if (!value)
|
||||||
goto free_key;
|
goto free_key;
|
||||||
|
|
||||||
rcu_read_lock();
|
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
|
||||||
ptr = map->ops->map_lookup_elem(map, key);
|
err = bpf_percpu_hash_copy(map, key, value);
|
||||||
if (ptr)
|
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
|
||||||
memcpy(value, ptr, map->value_size);
|
err = bpf_percpu_array_copy(map, key, value);
|
||||||
rcu_read_unlock();
|
} else {
|
||||||
|
rcu_read_lock();
|
||||||
|
ptr = map->ops->map_lookup_elem(map, key);
|
||||||
|
if (ptr)
|
||||||
|
memcpy(value, ptr, value_size);
|
||||||
|
rcu_read_unlock();
|
||||||
|
err = ptr ? 0 : -ENOENT;
|
||||||
|
}
|
||||||
|
|
||||||
err = -ENOENT;
|
if (err)
|
||||||
if (!ptr)
|
|
||||||
goto free_value;
|
goto free_value;
|
||||||
|
|
||||||
err = -EFAULT;
|
err = -EFAULT;
|
||||||
if (copy_to_user(uvalue, value, map->value_size) != 0)
|
if (copy_to_user(uvalue, value, value_size) != 0)
|
||||||
goto free_value;
|
goto free_value;
|
||||||
|
|
||||||
err = 0;
|
err = 0;
|
||||||
|
@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr)
|
||||||
int ufd = attr->map_fd;
|
int ufd = attr->map_fd;
|
||||||
struct bpf_map *map;
|
struct bpf_map *map;
|
||||||
void *key, *value;
|
void *key, *value;
|
||||||
|
u32 value_size;
|
||||||
struct fd f;
|
struct fd f;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
|
@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr)
|
||||||
if (copy_from_user(key, ukey, map->key_size) != 0)
|
if (copy_from_user(key, ukey, map->key_size) != 0)
|
||||||
goto free_key;
|
goto free_key;
|
||||||
|
|
||||||
|
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||||
|
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
|
||||||
|
value_size = round_up(map->value_size, 8) * num_possible_cpus();
|
||||||
|
else
|
||||||
|
value_size = map->value_size;
|
||||||
|
|
||||||
err = -ENOMEM;
|
err = -ENOMEM;
|
||||||
value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
|
value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
|
||||||
if (!value)
|
if (!value)
|
||||||
goto free_key;
|
goto free_key;
|
||||||
|
|
||||||
err = -EFAULT;
|
err = -EFAULT;
|
||||||
if (copy_from_user(value, uvalue, map->value_size) != 0)
|
if (copy_from_user(value, uvalue, value_size) != 0)
|
||||||
goto free_value;
|
goto free_value;
|
||||||
|
|
||||||
/* eBPF program that use maps are running under rcu_read_lock(),
|
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
|
||||||
* therefore all map accessors rely on this fact, so do the same here
|
err = bpf_percpu_hash_update(map, key, value, attr->flags);
|
||||||
*/
|
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
|
||||||
rcu_read_lock();
|
err = bpf_percpu_array_update(map, key, value, attr->flags);
|
||||||
err = map->ops->map_update_elem(map, key, value, attr->flags);
|
} else {
|
||||||
rcu_read_unlock();
|
rcu_read_lock();
|
||||||
|
err = map->ops->map_update_elem(map, key, value, attr->flags);
|
||||||
|
rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
free_value:
|
free_value:
|
||||||
kfree(value);
|
kfree(value);
|
||||||
|
|
Loading…
Reference in New Issue