From c7603cfa04e7c3a435b31d065f7cbdc829428f6e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 12 Jul 2021 16:06:15 -0700
Subject: [PATCH 01/64] bpf: Add ambient BPF runtime context stored in current

b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed the problem with cgroup-local storage use in BPF by pre-allocating a per-CPU array of 8 cgroup storage pointers to accommodate possible BPF program preemptions and nested executions.

While this seems to work well in practice, it introduces a new and unnecessary failure mode in which not all BPF programs might be executed if we fail to find an unused slot for cgroup storage, however unlikely that is. It might also become less unlikely when/if we allow sleepable cgroup BPF programs in the future.

Further, the way that cgroup storage is implemented as an ambiently-available property during the entire BPF program execution is a convenient way to pass extra information to BPF programs and helpers without requiring user code to pass around extra arguments explicitly. So it would be good to have a generic solution that allows implementing this without arbitrary restrictions. Ideally, such a solution would work for both preemptible and sleepable BPF programs in exactly the same way.

This patch introduces such a solution, bpf_run_ctx. It adds one pointer field (bpf_ctx) to task_struct. This field is maintained by the BPF_PROG_RUN family of macros in such a way that it always stays valid throughout BPF program execution. BPF program preemption is handled by remembering the previous current->bpf_ctx value locally while executing a nested BPF program and restoring the old value after the nested BPF program finishes. This is handled by two helper functions, bpf_set_run_ctx() and bpf_reset_run_ctx(), which are supposed to be used before and after BPF program runs, respectively.

Restoring the old value of the pointer handles preemption, while the bpf_run_ctx pointer being a property of the current task_struct naturally solves the problem for sleepable BPF programs by "following" BPF program execution as it is scheduled in and out of a CPU. It would even allow CPU migration of BPF programs, even though that is not currently allowed by BPF infra.

This patch cleans up cgroup-local storage handling as a first application. The design itself is generic, though, with bpf_run_ctx being an empty struct that is supposed to be embedded into a specific struct for a given BPF program type (bpf_cg_run_ctx in this case). Follow-up patches are planned that will expand this mechanism for other uses within tracing BPF programs.

To verify that this change doesn't revert the fix for the original cgroup storage issue, I ran the same repro as in the original report ([0]) and didn't get any problems. Replacing bpf_reset_run_ctx(old_run_ctx) with bpf_reset_run_ctx(NULL) triggers the issue pretty quickly (so the repro does work).
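To make the pattern concrete, below is a minimal, illustrative kernel-side sketch (not buildable on its own, and not taken from the patch verbatim): run_prog_array() and cur_cgroup_storage() are made-up stand-ins for the BPF_PROG_RUN_ARRAY* macros and the bpf_get_local_storage() helper, while bpf_set_run_ctx(), bpf_reset_run_ctx(), struct bpf_cg_run_ctx and current->bpf_ctx are the names this patch introduces.

/* Made-up stand-in for the BPF_PROG_RUN_ARRAY* macros, showing how the
 * run context is published around program execution.
 */
static u32 run_prog_array(struct bpf_prog_array_item *item, void *ctx)
{
	struct bpf_cg_run_ctx run_ctx;	/* lives on the runner's stack */
	struct bpf_run_ctx *old_run_ctx;
	struct bpf_prog *prog;
	u32 ret = 1;

	/* remember the outer context so nesting/preemption can be undone */
	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
	while ((prog = READ_ONCE(item->prog))) {
		run_ctx.prog_item = item;	/* what helpers will look up */
		ret &= BPF_PROG_RUN(prog, ctx);
		item++;
	}
	bpf_reset_run_ctx(old_run_ctx);	/* restore the outer context */
	return ret;
}

/* Helper side: cgroup storage is found via current->bpf_ctx instead of
 * per-CPU slots.
 */
static struct bpf_cgroup_storage *
cur_cgroup_storage(enum bpf_cgroup_storage_type stype)
{
	struct bpf_cg_run_ctx *run_ctx;

	run_ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
	return run_ctx->prog_item->cgroup_storage[stype];
}

Because the context lives on the runner's stack and is reachable through current, nested and preempting program runs simply stack their contexts, and sleepable programs keep theirs across scheduling.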
[0] https://lore.kernel.org/bpf/YEEvBUiJl2pJkxTd@krava/ Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210712230615.3525979-1-andrii@kernel.org --- include/linux/bpf-cgroup.h | 54 -------------------------------------- include/linux/bpf.h | 54 ++++++++++++++++++++++++-------------- include/linux/sched.h | 3 +++ kernel/bpf/helpers.c | 16 ++++------- kernel/bpf/local_storage.c | 3 --- kernel/fork.c | 1 + net/bpf/test_run.c | 23 ++++++++-------- 7 files changed, 54 insertions(+), 100 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..a74cd1c3bd87 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -27,19 +27,6 @@ struct task_struct; extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) -#define BPF_CGROUP_STORAGE_NEST_MAX 8 - -struct bpf_cgroup_storage_info { - struct task_struct *task; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; -}; - -/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks - * to use bpf cgroup storage simultaneously. - */ -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -172,44 +159,6 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - enum bpf_cgroup_storage_type stype; - int i, err = 0; - - preempt_disable(); - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, current); - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], - storage[stype]); - goto out; - } - err = -EBUSY; - WARN_ON_ONCE(1); - -out: - preempt_enable(); - return err; -} - -static inline void bpf_cgroup_storage_unset(void) -{ - int i; - - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); - return; - } -} - struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); @@ -487,9 +436,6 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline int bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } -static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0edff8f5177e..978ebd16ae60 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1142,38 +1142,40 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +struct bpf_run_ctx {}; + +struct bpf_cg_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_prog_array_item 
*prog_item; +}; + /* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ #define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) -/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, - * if bpf_cgroup_storage_set() failed, the rest of programs - * will not execute. This should be a really rare scenario - * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of - * preemptions all between bpf_cgroup_storage_set() and - * bpf_cgroup_storage_unset() on the same cpu. - */ #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ ({ \ struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ u32 func_ret; \ migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ + run_ctx.prog_item = _item; \ func_ret = func(_prog, ctx); \ _ret &= (func_ret & 1); \ - *(ret_flags) |= (func_ret >> 1); \ - bpf_cgroup_storage_unset(); \ + *(ret_flags) |= (func_ret >> 1); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ rcu_read_unlock(); \ migrate_enable(); \ _ret; \ @@ -1184,6 +1186,8 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog_array_item *_item; \ struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ + struct bpf_run_ctx *old_run_ctx; \ + struct bpf_cg_run_ctx run_ctx; \ u32 _ret = 1; \ migrate_disable(); \ rcu_read_lock(); \ @@ -1191,17 +1195,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, if (unlikely(check_non_null && !_array))\ goto _out; \ _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - if (!set_cg_storage) { \ - _ret &= func(_prog, ctx); \ - } else { \ - if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ - break; \ - _ret &= func(_prog, ctx); \ - bpf_cgroup_storage_unset(); \ - } \ + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\ + while ((_prog = READ_ONCE(_item->prog))) { \ + run_ctx.prog_item = _item; \ + _ret &= func(_prog, ctx); \ _item++; \ } \ + bpf_reset_run_ctx(old_run_ctx); \ _out: \ rcu_read_unlock(); \ migrate_enable(); \ @@ -1284,6 +1284,20 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx) +{ + struct bpf_run_ctx *old_ctx; + + old_ctx = current->bpf_ctx; + current->bpf_ctx = new_ctx; + return old_ctx; +} + +static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) +{ + current->bpf_ctx = old_ctx; +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641..c64119aa2e60 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct backing_dev_info; struct bio_list; struct blk_plug; struct bpf_local_storage; +struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1379,6 +1380,8 @@ struct task_struct { #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK diff --git 
a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9fe846ec6bd1..15746f779fe1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -393,8 +393,6 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -403,17 +401,13 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage; + struct bpf_cg_run_ctx *ctx; void *ptr; - int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) - continue; - - storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); - break; - } + /* get current cgroup storage from BPF run context */ + ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 95d70a08325d..362e81481594 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -11,9 +11,6 @@ #ifdef CONFIG_CGROUP_BPF -DEFINE_PER_CPU(struct bpf_cgroup_storage_info, - bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); - #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/kernel/fork.c b/kernel/fork.c index bc94b2cc5995..e8b41e212110 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2083,6 +2083,7 @@ static __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); + p->bpf_ctx = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index cda8375bbbaf..8d46e2962786 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -88,17 +88,19 @@ static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *e static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -107,22 +109,19 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, repeat = 1; bpf_test_timer_enter(&t); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; - + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - - bpf_cgroup_storage_unset(); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } From 8886534747c2e56d9d3b978c99e92dc479cd9426 Mon Sep 17 00:00:00 2001 From: "Roy, UjjaL" Date: Fri, 16 Jul 2021 22:04:41 +0200 Subject: [PATCH 02/64] bpf, doc: Add heading and example for extensions in cbpf Add new heading for extensions to make it more readable. Also, add one more example of filtering interface index for better understanding. Signed-off-by: Roy, UjjaL Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/CAADnVQJ=DoRDcVkaXmY3EmNdLoO7gq1mkJOn5G=00wKH8qUtZQ@mail.gmail.com --- Documentation/networking/filter.rst | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 3e2221f4abe4..5f13905b12e0 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -320,13 +320,6 @@ Examples for low-level BPF: ret #-1 drop: ret #0 -**(Accelerated) VLAN w/ id 10**:: - - ld vlan_tci - jneq #10, drop - ret #-1 - drop: ret #0 - **icmp random packet sampling, 1 in 4**:: ldh [12] @@ -358,6 +351,22 @@ Examples for low-level BPF: bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ +Examples for low-level BPF extension: + +**Packet for interface index 13**:: + + ld ifidx + jneq #13, drop + ret #-1 + drop: ret #0 + +**(Accelerated) VLAN w/ id 10**:: + + ld vlan_tci + jneq #10, drop + ret #-1 + drop: ret #0 + The above example code can be placed into a file (here called "foo"), and then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf and cls_bpf understands and can directly be loaded with. 
Example with above From 1373ff59955621b7e71e7a1152036c93a5780c11 Mon Sep 17 00:00:00 2001 From: Shuyi Cheng Date: Tue, 13 Jul 2021 20:42:37 +0800 Subject: [PATCH 03/64] libbpf: Introduce 'btf_custom_path' to 'bpf_obj_open_opts' btf_custom_path allows developers to load custom BTF which libbpf will subsequently use for CO-RE relocation instead of vmlinux BTF. Having btf_custom_path in bpf_object_open_opts one can directly use the skeleton's _bpf__open_opts() API to pass in the btf_custom_path parameter, as opposed to using bpf_object__load_xattr() which is slated to be deprecated ([0]). This work continues previous work started by another developer ([1]). [0] https://lore.kernel.org/bpf/CAEf4BzbJZLjNoiK8_VfeVg_Vrg=9iYFv+po-38SMe=UzwDKJ=Q@mail.gmail.com/#t [1] https://yhbt.net/lore/all/CAEf4Bzbgw49w2PtowsrzKQNcxD4fZRE6AKByX-5-dMo-+oWHHA@mail.gmail.com/ Signed-off-by: Shuyi Cheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626180159-112996-2-git-send-email-chengshuyi@linux.alibaba.com --- tools/lib/bpf/libbpf.c | 28 ++++++++++++++++++++++++---- tools/lib/bpf/libbpf.h | 9 ++++++++- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4ccfae30e681..8bff28fc3fc9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -498,6 +498,10 @@ struct bpf_object { * it at load time. */ struct btf *btf_vmlinux; + /* Path to the custom BTF to be used for BPF CO-RE relocations as an + * override for vmlinux BTF. + */ + char *btf_custom_path; /* vmlinux BTF override for CO-RE relocations */ struct btf *btf_vmlinux_override; /* Lazily initialized kernel module BTFs */ @@ -2645,8 +2649,10 @@ static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) struct bpf_program *prog; int i; - /* CO-RE relocations need kernel BTF */ - if (obj->btf_ext && obj->btf_ext->core_relo_info.len) + /* CO-RE relocations need kernel BTF, only when btf_custom_path + * is not specified + */ + if (obj->btf_ext && obj->btf_ext->core_relo_info.len && !obj->btf_custom_path) return true; /* Support for typed ksyms needs kernel BTF */ @@ -7596,7 +7602,7 @@ static struct bpf_object * __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig; + const char *obj_name, *kconfig, *btf_tmp_path; struct bpf_program *prog; struct bpf_object *obj; char tmp_name[64]; @@ -7627,6 +7633,19 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, if (IS_ERR(obj)) return obj; + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); + if (btf_tmp_path) { + if (strlen(btf_tmp_path) >= PATH_MAX) { + err = -ENAMETOOLONG; + goto out; + } + obj->btf_custom_path = strdup(btf_tmp_path); + if (!obj->btf_custom_path) { + err = -ENOMEM; + goto out; + } + } + kconfig = OPTS_GET(opts, kconfig, NULL); if (kconfig) { obj->kconfig = strdup(kconfig); @@ -8097,7 +8116,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__create_maps(obj); - err = err ? : bpf_object__relocate(obj, attr->target_btf_path); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : attr->target_btf_path); err = err ? 
: bpf_object__load_progs(obj, attr->log_level); if (obj->gen_loader) { @@ -8744,6 +8763,7 @@ void bpf_object__close(struct bpf_object *obj) for (i = 0; i < obj->nr_maps; i++) bpf_map__destroy(&obj->maps[i]); + zfree(&obj->btf_custom_path); zfree(&obj->kconfig); zfree(&obj->externs); obj->nr_extern = 0; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6e61342ba56c..6b08c1023609 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -94,8 +94,15 @@ struct bpf_object_open_opts { * system Kconfig for CONFIG_xxx externs. */ const char *kconfig; + /* Path to the custom BTF to be used for BPF CO-RE relocations. + * This custom BTF completely replaces the use of vmlinux BTF + * for the purpose of CO-RE relocations. + * NOTE: any other BPF feature (e.g., fentry/fexit programs, + * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux. + */ + const char *btf_custom_path; }; -#define bpf_object_open_opts__last_field kconfig +#define bpf_object_open_opts__last_field btf_custom_path LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * From 18353c87e0e0440d4c7c746ed740738bbc1b538e Mon Sep 17 00:00:00 2001 From: Shuyi Cheng Date: Tue, 13 Jul 2021 20:42:38 +0800 Subject: [PATCH 04/64] libbpf: Fix the possible memory leak on error If the strdup() fails then we need to call bpf_object__close(obj) to avoid a resource leak. Fixes: 166750bc1dd2 ("libbpf: Support libbpf-provided extern variables") Signed-off-by: Shuyi Cheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626180159-112996-3-git-send-email-chengshuyi@linux.alibaba.com --- tools/lib/bpf/libbpf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8bff28fc3fc9..17bc922f543f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7649,8 +7649,10 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, kconfig = OPTS_GET(opts, kconfig, NULL); if (kconfig) { obj->kconfig = strdup(kconfig); - if (!obj->kconfig) - return ERR_PTR(-ENOMEM); + if (!obj->kconfig) { + err = -ENOMEM; + goto out; + } } err = bpf_object__elf_init(obj); From f0b7d119254247cc373d1695889e5216f13fddcd Mon Sep 17 00:00:00 2001 From: Shuyi Cheng Date: Tue, 13 Jul 2021 20:42:39 +0800 Subject: [PATCH 05/64] selftests/bpf: Switch existing selftests to using open_opts for custom BTF This patch mainly replaces the bpf_object_load_attr of the core_autosize.c and core_reloc.c files with bpf_object_open_opts. 
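For orientation, here is a hedged userspace sketch of what the new flow from patches 03-05 looks like for a skeleton user; my_prog.skel.h, the my_prog__* functions and the btf_path argument are placeholders, while btf_custom_path is the bpf_object_open_opts field this series adds.

#include <errno.h>
#include <bpf/libbpf.h>
#include "my_prog.skel.h"	/* hypothetical bpftool-generated skeleton */

static int load_with_custom_btf(const char *btf_path)
{
	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts,
		.btf_custom_path = btf_path);	/* custom BTF for CO-RE relocs */
	struct my_prog *skel;
	int err;

	skel = my_prog__open_opts(&open_opts);
	if (!skel)
		return -errno;

	/* CO-RE relocations are done against btf_path; no separate
	 * bpf_object__load_xattr() call with target_btf_path is needed.
	 */
	err = my_prog__load(skel);
	my_prog__destroy(skel);
	return err;
}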
Signed-off-by: Shuyi Cheng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626180159-112996-4-git-send-email-chengshuyi@linux.alibaba.com --- .../selftests/bpf/prog_tests/core_autosize.c | 22 ++++++++-------- .../selftests/bpf/prog_tests/core_reloc.c | 25 ++++++++----------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c index 981c251453d9..3d4b2a358d47 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_autosize.c +++ b/tools/testing/selftests/bpf/prog_tests/core_autosize.c @@ -53,8 +53,8 @@ void test_core_autosize(void) char btf_file[] = "/tmp/core_autosize.btf.XXXXXX"; int err, fd = -1, zero = 0; int char_id, short_id, int_id, long_long_id, void_ptr_id, id; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); struct test_core_autosize* skel = NULL; - struct bpf_object_load_attr load_attr = {}; struct bpf_program *prog; struct bpf_map *bss_map; struct btf *btf = NULL; @@ -125,9 +125,10 @@ void test_core_autosize(void) fd = -1; /* open and load BPF program with custom BTF as the kernel BTF */ - skel = test_core_autosize__open(); + open_opts.btf_custom_path = btf_file; + skel = test_core_autosize__open_opts(&open_opts); if (!ASSERT_OK_PTR(skel, "skel_open")) - return; + goto cleanup; /* disable handle_signed() for now */ prog = bpf_object__find_program_by_name(skel->obj, "handle_signed"); @@ -135,9 +136,7 @@ void test_core_autosize(void) goto cleanup; bpf_program__set_autoload(prog, false); - load_attr.obj = skel->obj; - load_attr.target_btf_path = btf_file; - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(skel->obj); if (!ASSERT_OK(err, "prog_load")) goto cleanup; @@ -204,14 +203,13 @@ void test_core_autosize(void) skel = NULL; /* now re-load with handle_signed() enabled, it should fail loading */ - skel = test_core_autosize__open(); + open_opts.btf_custom_path = btf_file; + skel = test_core_autosize__open_opts(&open_opts); if (!ASSERT_OK_PTR(skel, "skel_open")) - return; + goto cleanup; - load_attr.obj = skel->obj; - load_attr.target_btf_path = btf_file; - err = bpf_object__load_xattr(&load_attr); - if (!ASSERT_ERR(err, "bad_prog_load")) + err = test_core_autosize__load(skel); + if (!ASSERT_ERR(err, "skel_load")) goto cleanup; cleanup: diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index d02e064c535f..4739b15b2a97 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -816,7 +816,7 @@ static size_t roundup_page(size_t sz) void test_core_reloc(void) { const size_t mmap_sz = roundup_page(sizeof(struct data)); - struct bpf_object_load_attr load_attr = {}; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); struct core_reloc_test_case *test_case; const char *tp_name, *probe_name; int err, i, equal; @@ -846,9 +846,16 @@ void test_core_reloc(void) continue; } - obj = bpf_object__open_file(test_case->bpf_obj_file, NULL); + if (test_case->btf_src_file) { + err = access(test_case->btf_src_file, R_OK); + if (!ASSERT_OK(err, "btf_src_file")) + goto cleanup; + } + + open_opts.btf_custom_path = test_case->btf_src_file; + obj = bpf_object__open_file(test_case->bpf_obj_file, &open_opts); if (!ASSERT_OK_PTR(obj, "obj_open")) - continue; + goto cleanup; probe_name = "raw_tracepoint/sys_enter"; tp_name = "sys_enter"; @@ -862,17 +869,7 @@ void test_core_reloc(void) "prog '%s' not found\n", 
probe_name)) goto cleanup; - - if (test_case->btf_src_file) { - err = access(test_case->btf_src_file, R_OK); - if (!ASSERT_OK(err, "btf_src_file")) - goto cleanup; - } - - load_attr.obj = obj; - load_attr.log_level = 0; - load_attr.target_btf_path = test_case->btf_src_file; - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(obj); if (err) { if (!test_case->fails) ASSERT_OK(err, "obj_load"); From 920d16af9b42adfc8524d880b0e8dba66a6cb87d Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 15 Jul 2021 16:15:24 +0100 Subject: [PATCH 06/64] libbpf: BTF dumper support for typed data Add a BTF dumper for typed data, so that the user can dump a typed version of the data provided. The API is int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, void *data, size_t data_sz, const struct btf_dump_type_data_opts *opts); ...where the id is the BTF id of the data pointed to by the "void *" argument; for example the BTF id of "struct sk_buff" for a "struct skb *" data pointer. Options supported are - a starting indent level (indent_lvl) - a user-specified indent string which will be printed once per indent level; if NULL, tab is chosen but any string <= 32 chars can be provided. - a set of boolean options to control dump display, similar to those used for BPF helper bpf_snprintf_btf(). Options are - compact : omit newlines and other indentation - skip_names: omit member names - emit_zeroes: show zero-value members Default output format is identical to that dumped by bpf_snprintf_btf(), for example a "struct sk_buff" representation would look like this: struct sk_buff){ (union){ (struct){ .next = (struct sk_buff *)0xffffffffffffffff, .prev = (struct sk_buff *)0xffffffffffffffff, (union){ .dev = (struct net_device *)0xffffffffffffffff, .dev_scratch = (long unsigned int)18446744073709551615, }, }, ... If the data structure is larger than the *data_sz* number of bytes that are available in *data*, as much of the data as possible will be dumped and -E2BIG will be returned. This is useful as tracers will sometimes not be able to capture all of the data associated with a type; for example a "struct task_struct" is ~16k. Being able to specify that only a subset is available is important for such cases. On success, the amount of data dumped is returned. 
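As a usage illustration (not part of the patch), a minimal caller might look like the sketch below; the my_printf callback, the dump_skb wrapper and the source of data/data_sz (e.g. a sample captured by a tracer) are assumptions, while btf_dump__dump_type_data() and btf_dump_type_data_opts are the API added here.

#include <stdarg.h>
#include <stdio.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>

/* Illustrative caller, not from the patch. */
static void my_printf(void *ctx, const char *fmt, va_list args)
{
	vprintf(fmt, args);	/* emit dumped text to stdout */
}

static int dump_skb(const struct btf *btf, const void *data, size_t data_sz)
{
	DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts,
		.indent_level = 1,
		.emit_zeroes = true);
	struct btf_dump *d;
	__s32 id;
	int err;

	d = btf_dump__new(btf, NULL, NULL, my_printf);
	err = libbpf_get_error(d);
	if (err)
		return err;

	id = btf__find_by_name(btf, "sk_buff");	/* BTF id of struct sk_buff */
	if (id < 0) {
		err = id;
		goto out;
	}
	/* returns bytes dumped, or -E2BIG if data_sz covers only part of the type */
	err = btf_dump__dump_type_data(d, id, data, data_sz, &opts);
out:
	btf_dump__free(d);
	return err;
}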
Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626362126-27775-2-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf.h | 19 + tools/lib/bpf/btf_dump.c | 819 ++++++++++++++++++++++++++++++++++++++- tools/lib/bpf/libbpf.map | 1 + 3 files changed, 834 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b54f1c3ebd57..374e9f15de2e 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -184,6 +184,25 @@ LIBBPF_API int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, const struct btf_dump_emit_type_decl_opts *opts); + +struct btf_dump_type_data_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + const char *indent_str; + int indent_level; + /* below match "show" flags for bpf_show_snprintf() */ + bool compact; /* no newlines/indentation */ + bool skip_names; /* skip member/type names */ + bool emit_zeroes; /* show 0-valued fields */ + size_t :0; +}; +#define btf_dump_type_data_opts__last_field emit_zeroes + +LIBBPF_API int +btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts); + /* * A set of helpers for easier BTF types handling */ diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 5dc6b5172bb3..929cf931ed66 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -53,6 +55,26 @@ struct btf_dump_type_aux_state { __u8 referenced: 1; }; +/* indent string length; one indent string is added for each indent level */ +#define BTF_DATA_INDENT_STR_LEN 32 + +/* + * Common internal data for BTF type data dump operations. + */ +struct btf_dump_data { + const void *data_end; /* end of valid data to show */ + bool compact; + bool skip_names; + bool emit_zeroes; + __u8 indent_lvl; /* base indent level */ + char indent_str[BTF_DATA_INDENT_STR_LEN]; + /* below are used during iteration */ + int depth; + bool is_array_member; + bool is_array_terminated; + bool is_array_char; +}; + struct btf_dump { const struct btf *btf; const struct btf_ext *btf_ext; @@ -60,6 +82,7 @@ struct btf_dump { struct btf_dump_opts opts; int ptr_sz; bool strip_mods; + bool skip_anon_defs; int last_id; /* per-type auxiliary state */ @@ -89,6 +112,10 @@ struct btf_dump { * name occurrences */ struct hashmap *ident_names; + /* + * data for typed display; allocated if needed. + */ + struct btf_dump_data *typed_dump; }; static size_t str_hash_fn(const void *key, void *ctx) @@ -765,11 +792,11 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) break; case BTF_KIND_FUNC_PROTO: { const struct btf_param *p = btf_params(t); - __u16 vlen = btf_vlen(t); + __u16 n = btf_vlen(t); int i; btf_dump_emit_type(d, t->type, cont_id); - for (i = 0; i < vlen; i++, p++) + for (i = 0; i < n; i++, p++) btf_dump_emit_type(d, p->type, cont_id); break; @@ -852,8 +879,9 @@ static void btf_dump_emit_bit_padding(const struct btf_dump *d, static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, const struct btf_type *t) { - btf_dump_printf(d, "%s %s", + btf_dump_printf(d, "%s%s%s", btf_is_struct(t) ? "struct" : "union", + t->name_off ? 
" " : "", btf_dump_type_name(d, id)); } @@ -1259,7 +1287,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, case BTF_KIND_UNION: btf_dump_emit_mods(d, decls); /* inline anonymous struct/union */ - if (t->name_off == 0) + if (t->name_off == 0 && !d->skip_anon_defs) btf_dump_emit_struct_def(d, id, t, lvl); else btf_dump_emit_struct_fwd(d, id, t); @@ -1267,7 +1295,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, case BTF_KIND_ENUM: btf_dump_emit_mods(d, decls); /* inline anonymous enum */ - if (t->name_off == 0) + if (t->name_off == 0 && !d->skip_anon_defs) btf_dump_emit_enum_def(d, id, t, lvl); else btf_dump_emit_enum_fwd(d, id, t); @@ -1392,6 +1420,39 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, btf_dump_emit_name(d, fname, last_was_ptr); } +/* show type name as (type_name) */ +static void btf_dump_emit_type_cast(struct btf_dump *d, __u32 id, + bool top_level) +{ + const struct btf_type *t; + + /* for array members, we don't bother emitting type name for each + * member to avoid the redundancy of + * .name = (char[4])[(char)'f',(char)'o',(char)'o',] + */ + if (d->typed_dump->is_array_member) + return; + + /* avoid type name specification for variable/section; it will be done + * for the associated variable value(s). + */ + t = btf__type_by_id(d->btf, id); + if (btf_is_var(t) || btf_is_datasec(t)) + return; + + if (top_level) + btf_dump_printf(d, "("); + + d->skip_anon_defs = true; + d->strip_mods = true; + btf_dump_emit_type_decl(d, id, "", 0); + d->strip_mods = false; + d->skip_anon_defs = false; + + if (top_level) + btf_dump_printf(d, ")"); +} + /* return number of duplicates (occurrences) of a given name */ static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, const char *orig_name) @@ -1442,3 +1503,751 @@ static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id) { return btf_dump_resolve_name(d, id, d->ident_names); } + +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz); + +static const char *btf_dump_data_newline(struct btf_dump *d) +{ + return d->typed_dump->compact || d->typed_dump->depth == 0 ? "" : "\n"; +} + +static const char *btf_dump_data_delim(struct btf_dump *d) +{ + return d->typed_dump->depth == 0 ? "" : ","; +} + +static void btf_dump_data_pfx(struct btf_dump *d) +{ + int i, lvl = d->typed_dump->indent_lvl + d->typed_dump->depth; + + if (d->typed_dump->compact) + return; + + for (i = 0; i < lvl; i++) + btf_dump_printf(d, "%s", d->typed_dump->indent_str); +} + +/* A macro is used here as btf_type_value[s]() appends format specifiers + * to the format specifier passed in; these do the work of appending + * delimiters etc while the caller simply has to specify the type values + * in the format specifier + value(s). + */ +#define btf_dump_type_values(d, fmt, ...) 
\ + btf_dump_printf(d, fmt "%s%s", \ + ##__VA_ARGS__, \ + btf_dump_data_delim(d), \ + btf_dump_data_newline(d)) + +static int btf_dump_unsupported_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id) +{ + btf_dump_printf(d, "", btf_kind(t)); + return -ENOTSUP; +} + +static void btf_dump_int128(struct btf_dump *d, + const struct btf_type *t, + const void *data) +{ + __int128 num = *(__int128 *)data; + + if ((num >> 64) == 0) + btf_dump_type_values(d, "0x%llx", (long long)num); + else + btf_dump_type_values(d, "0x%llx%016llx", (long long)num >> 32, + (long long)num); +} + +static unsigned __int128 btf_dump_bitfield_get_data(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u16 left_shift_bits, right_shift_bits; + __u8 nr_copy_bits, nr_copy_bytes; + unsigned __int128 num = 0, ret; + const __u8 *bytes = data; + int i; + + /* Bitfield value retrieval is done in two steps; first relevant bytes are + * stored in num, then we left/right shift num to eliminate irrelevant bits. + */ + nr_copy_bits = bit_sz + bits_offset; + nr_copy_bytes = t->size; +#if __BYTE_ORDER == __LITTLE_ENDIAN + for (i = nr_copy_bytes - 1; i >= 0; i--) + num = num * 256 + bytes[i]; +#elif __BYTE_ORDER == __BIG_ENDIAN + for (i = 0; i < nr_copy_bytes; i++) + num = num * 256 + bytes[i]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + left_shift_bits = 128 - nr_copy_bits; + right_shift_bits = 128 - bit_sz; + + ret = (num << left_shift_bits) >> right_shift_bits; + + return ret; +} + +static int btf_dump_bitfield_check_zero(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __int128 check_num; + + check_num = btf_dump_bitfield_get_data(d, t, data, bits_offset, bit_sz); + if (check_num == 0) + return -ENODATA; + return 0; +} + +static int btf_dump_bitfield_data(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + unsigned __int128 print_num; + + print_num = btf_dump_bitfield_get_data(d, t, data, bits_offset, bit_sz); + btf_dump_int128(d, t, &print_num); + + return 0; +} + +/* ints, floats and ptrs */ +static int btf_dump_base_type_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + static __u8 bytecmp[16] = {}; + int nr_bytes; + + /* For pointer types, pointer size is not defined on a per-type basis. + * On dump creation however, we store the pointer size. + */ + if (btf_kind(t) == BTF_KIND_PTR) + nr_bytes = d->ptr_sz; + else + nr_bytes = t->size; + + if (nr_bytes < 1 || nr_bytes > 16) { + pr_warn("unexpected size %d for id [%u]\n", nr_bytes, id); + return -EINVAL; + } + + if (memcmp(data, bytecmp, nr_bytes) == 0) + return -ENODATA; + return 0; +} + +static int btf_dump_int_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data, + __u8 bits_offset) +{ + __u8 encoding = btf_int_encoding(t); + bool sign = encoding & BTF_INT_SIGNED; + int sz = t->size; + + if (sz == 0) { + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + + /* handle packed int data - accesses of integers not aligned on + * int boundaries can cause problems on some platforms. 
+ */ + if (((uintptr_t)data) % sz) + return btf_dump_bitfield_data(d, t, data, 0, 0); + + switch (sz) { + case 16: + btf_dump_int128(d, t, data); + break; + case 8: + if (sign) + btf_dump_type_values(d, "%lld", *(long long *)data); + else + btf_dump_type_values(d, "%llu", *(unsigned long long *)data); + break; + case 4: + if (sign) + btf_dump_type_values(d, "%d", *(__s32 *)data); + else + btf_dump_type_values(d, "%u", *(__u32 *)data); + break; + case 2: + if (sign) + btf_dump_type_values(d, "%d", *(__s16 *)data); + else + btf_dump_type_values(d, "%u", *(__u16 *)data); + break; + case 1: + if (d->typed_dump->is_array_char) { + /* check for null terminator */ + if (d->typed_dump->is_array_terminated) + break; + if (*(char *)data == '\0') { + d->typed_dump->is_array_terminated = true; + break; + } + if (isprint(*(char *)data)) { + btf_dump_type_values(d, "'%c'", *(char *)data); + break; + } + } + if (sign) + btf_dump_type_values(d, "%d", *(__s8 *)data); + else + btf_dump_type_values(d, "%u", *(__u8 *)data); + break; + default: + pr_warn("unexpected sz %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +union float_data { + long double ld; + double d; + float f; +}; + +static int btf_dump_float_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data) +{ + const union float_data *flp = data; + union float_data fl; + int sz = t->size; + + /* handle unaligned data; copy to local union */ + if (((uintptr_t)data) % sz) { + memcpy(&fl, data, sz); + flp = &fl; + } + + switch (sz) { + case 16: + btf_dump_type_values(d, "%Lf", flp->ld); + break; + case 8: + btf_dump_type_values(d, "%lf", flp->d); + break; + case 4: + btf_dump_type_values(d, "%f", flp->f); + break; + default: + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +static int btf_dump_var_data(struct btf_dump *d, + const struct btf_type *v, + __u32 id, + const void *data) +{ + enum btf_func_linkage linkage = btf_var(v)->linkage; + const struct btf_type *t; + const char *l; + __u32 type_id; + + switch (linkage) { + case BTF_FUNC_STATIC: + l = "static "; + break; + case BTF_FUNC_EXTERN: + l = "extern "; + break; + case BTF_FUNC_GLOBAL: + default: + l = ""; + break; + } + + /* format of output here is [linkage] [type] [varname] = (type)value, + * for example "static int cpu_profile_flip = (int)1" + */ + btf_dump_printf(d, "%s", l); + type_id = v->type; + t = btf__type_by_id(d->btf, type_id); + btf_dump_emit_type_cast(d, type_id, false); + btf_dump_printf(d, " %s = ", btf_name_of(d, v->name_off)); + return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0); +} + +static int btf_dump_array_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 i, elem_size = 0, elem_type_id; + bool is_array_member; + + elem_type_id = array->type; + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + elem_size = btf__resolve_size(d->btf, elem_type_id); + if (elem_size <= 0) { + pr_warn("unexpected elem size %d for array type [%u]\n", elem_size, id); + return -EINVAL; + } + + if (btf_is_int(elem_type)) { + /* + * BTF_INT_CHAR encoding never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + d->typed_dump->is_array_char = true; + } + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. 
btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "[%s", btf_dump_data_newline(d)); + + /* may be a multidimensional array, so store current "is array member" + * status so we can restore it correctly later. + */ + is_array_member = d->typed_dump->is_array_member; + d->typed_dump->is_array_member = true; + for (i = 0; i < array->nelems; i++, data += elem_size) { + if (d->typed_dump->is_array_terminated) + break; + btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); + } + d->typed_dump->is_array_member = is_array_member; + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "]"); + + return 0; +} + +static int btf_dump_struct_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + int i, err; + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "{%s", btf_dump_data_newline(d)); + + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + const char *mname; + __u32 moffset; + __u8 bit_sz; + + mtype = btf__type_by_id(d->btf, m->type); + mname = btf_name_of(d, m->name_off); + moffset = btf_member_bit_offset(t, i); + + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_dump_type_data(d, mname, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err < 0) + return err; + } + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "}"); + return err; +} + +static int btf_dump_ptr_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + btf_dump_type_values(d, "%p", *(void **)data); + return 0; +} + +static int btf_dump_get_enum_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u32 id, + __s64 *value) +{ + int sz = t->size; + + /* handle unaligned enum value */ + if (((uintptr_t)data) % sz) { + *value = (__s64)btf_dump_bitfield_get_data(d, t, data, 0, 0); + return 0; + } + switch (t->size) { + case 8: + *value = *(__s64 *)data; + return 0; + case 4: + *value = *(__s32 *)data; + return 0; + case 2: + *value = *(__s16 *)data; + return 0; + case 1: + *value = *(__s8 *)data; + return 0; + default: + pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); + return -EINVAL; + } +} + +static int btf_dump_enum_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_enum *e; + __s64 value; + int i, err; + + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, "%d", value); + return 0; +} + +static int btf_dump_datasec_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_var_secinfo *vsi; + const struct 
btf_type *var; + __u32 i; + int err; + + btf_dump_type_values(d, "SEC(\"%s\") ", btf_name_of(d, t->name_off)); + + for (i = 0, vsi = btf_var_secinfos(t); i < btf_vlen(t); i++, vsi++) { + var = btf__type_by_id(d->btf, vsi->type); + err = btf_dump_dump_type_data(d, NULL, var, vsi->type, data + vsi->offset, 0, 0); + if (err < 0) + return err; + btf_dump_printf(d, ";"); + } + return 0; +} + +/* return size of type, or if base type overflows, return -E2BIG. */ +static int btf_dump_type_data_check_overflow(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset) +{ + __s64 size = btf__resolve_size(d->btf, id); + + if (size < 0 || size >= INT_MAX) { + pr_warn("unexpected size [%lld] for id [%u]\n", + size, id); + return -EINVAL; + } + + /* Only do overflow checking for base types; we do not want to + * avoid showing part of a struct, union or array, even if we + * do not have enough data to show the full object. By + * restricting overflow checking to base types we can ensure + * that partial display succeeds, while avoiding overflowing + * and using bogus data for display. + */ + t = skip_mods_and_typedefs(d->btf, id, NULL); + if (!t) { + pr_warn("unexpected error skipping mods/typedefs for id [%u]\n", + id); + return -EINVAL; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + case BTF_KIND_ENUM: + if (data + bits_offset / 8 + size > d->typed_dump->data_end) + return -E2BIG; + break; + default: + break; + } + return (int)size; +} + +static int btf_dump_type_data_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 value; + int i, err; + + /* toplevel exceptions; we show zero values if + * - we ask for them (emit_zeros) + * - if we are at top-level so we see "struct empty { }" + * - or if we are an array member and the array is non-empty and + * not a char array; we don't want to be in a situation where we + * have an integer array 0, 1, 0, 1 and only show non-zero values. + * If the array contains zeroes only, or is a char array starting + * with a '\0', the array-level check_zero() will prevent showing it; + * we are concerned with determining zero value at the array member + * level here. + */ + if (d->typed_dump->emit_zeroes || d->typed_dump->depth == 0 || + (d->typed_dump->is_array_member && + !d->typed_dump->is_array_char)) + return 0; + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + if (bit_sz) + return btf_dump_bitfield_check_zero(d, t, data, bits_offset, bit_sz); + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_ARRAY: { + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 elem_type_id, elem_size; + bool ischar; + + elem_type_id = array->type; + elem_size = btf__resolve_size(d->btf, elem_type_id); + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + + ischar = btf_is_int(elem_type) && elem_size == 1; + + /* check all elements; if _any_ element is nonzero, all + * of array is displayed. We make an exception however + * for char arrays where the first element is 0; these + * are considered zeroed also, even if later elements are + * non-zero because the string is terminated. 
+ */ + for (i = 0; i < array->nelems; i++) { + if (i == 0 && ischar && *(char *)data == 0) + return -ENODATA; + err = btf_dump_type_data_check_zero(d, elem_type, + elem_type_id, + data + + (i * elem_size), + bits_offset, 0); + if (err != -ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + + /* if any struct/union member is non-zero, the struct/union + * is considered non-zero and dumped. + */ + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + __u32 moffset; + + mtype = btf__type_by_id(d->btf, m->type); + moffset = btf_member_bit_offset(t, i); + + /* btf_int_bits() does not store member bitfield size; + * bitfield size needs to be stored here so int display + * of member can retrieve it. + */ + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_type_data_check_zero(d, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err != ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_ENUM: + if (btf_dump_get_enum_value(d, t, data, id, &value)) + return 0; + if (value == 0) + return -ENODATA; + return 0; + default: + return 0; + } +} + +/* returns size of data dumped, or error. */ +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + int size, err; + + size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset); + if (size < 0) + return size; + err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz); + if (err) { + /* zeroed data is expected and not an error, so simply skip + * dumping such data. Record other errors however. + */ + if (err == -ENODATA) + return size; + return err; + } + btf_dump_data_pfx(d); + + if (!d->typed_dump->skip_names) { + if (fname && strlen(fname) > 0) + btf_dump_printf(d, ".%s = ", fname); + btf_dump_emit_type_cast(d, id, true); + } + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_UNKN: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + err = btf_dump_unsupported_data(d, t, id); + break; + case BTF_KIND_INT: + if (bit_sz) + err = btf_dump_bitfield_data(d, t, data, bits_offset, bit_sz); + else + err = btf_dump_int_data(d, t, id, data, bits_offset); + break; + case BTF_KIND_FLOAT: + err = btf_dump_float_data(d, t, id, data); + break; + case BTF_KIND_PTR: + err = btf_dump_ptr_data(d, t, id, data); + break; + case BTF_KIND_ARRAY: + err = btf_dump_array_data(d, t, id, data); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + err = btf_dump_struct_data(d, t, id, data); + break; + case BTF_KIND_ENUM: + /* handle bitfield and int enum values */ + if (bit_sz) { + unsigned __int128 print_num; + __s64 enum_val; + + print_num = btf_dump_bitfield_get_data(d, t, data, bits_offset, bit_sz); + enum_val = (__s64)print_num; + err = btf_dump_enum_data(d, t, id, &enum_val); + } else + err = btf_dump_enum_data(d, t, id, data); + break; + case BTF_KIND_VAR: + err = btf_dump_var_data(d, t, id, data); + break; + case BTF_KIND_DATASEC: + err = btf_dump_datasec_data(d, t, id, data); + break; + default: + pr_warn("unexpected kind [%u] for id [%u]\n", + BTF_INFO_KIND(t->info), id); + return -EINVAL; + } + if (err < 0) + return err; + return size; +} + +int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts) +{ + const struct 
btf_type *t; + int ret; + + if (!OPTS_VALID(opts, btf_dump_type_data_opts)) + return libbpf_err(-EINVAL); + + t = btf__type_by_id(d->btf, id); + if (!t) + return libbpf_err(-ENOENT); + + d->typed_dump = calloc(1, sizeof(struct btf_dump_data)); + if (!d->typed_dump) + return libbpf_err(-ENOMEM); + + d->typed_dump->data_end = data + data_sz; + d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); + /* default indent string is a tab */ + if (!opts->indent_str) + d->typed_dump->indent_str[0] = '\t'; + else + strncat(d->typed_dump->indent_str, opts->indent_str, + sizeof(d->typed_dump->indent_str) - 1); + + d->typed_dump->compact = OPTS_GET(opts, compact, false); + d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false); + d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false); + + ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0); + + free(d->typed_dump); + + return libbpf_err(ret); +} diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 944c99d1ded3..5bfc10722647 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -373,5 +373,6 @@ LIBBPF_0.5.0 { bpf_map__initial_value; bpf_map_lookup_and_delete_elem_flags; bpf_object__gen_loader; + btf_dump__dump_type_data; libbpf_set_strict_mode; } LIBBPF_0.4.0; From 17283337ff86a1956940bc6bfb3f7dafe82304f3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 15 Jul 2021 16:15:25 +0100 Subject: [PATCH 07/64] selftests/bpf: Add ASSERT_STRNEQ() variant for test_progs It will support strncmp()-style string comparisons. Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626362126-27775-3-git-send-email-alan.maguire@oracle.com --- tools/testing/selftests/bpf/test_progs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 8ef7f334e715..c8c2bf878f67 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -221,6 +221,18 @@ extern int test__join_cgroup(const char *path); ___ok; \ }) +#define ASSERT_STRNEQ(actual, expected, len, name) ({ \ + static int duration = 0; \ + const char *___act = actual; \ + const char *___exp = expected; \ + int ___len = len; \ + bool ___ok = strncmp(___act, ___exp, ___len) == 0; \ + CHECK(!___ok, (name), \ + "unexpected %s: actual '%.*s' != expected '%.*s'\n", \ + (name), ___len, ___act, ___len, ___exp); \ + ___ok; \ +}) + #define ASSERT_OK(res, name) ({ \ static int duration = 0; \ long long ___res = (res); \ From 70a9241fbce5398965bf05a595ee892845ae31e9 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 15 Jul 2021 16:15:26 +0100 Subject: [PATCH 08/64] selftests/bpf: Add dump type data tests to btf dump tests Test various type data dumping operations by comparing expected format with the dumped string; an snprintf-style printf function is used to record the string dumped. Also verify overflow handling where the data passed does not cover the full size of a type, such as would occur if a tracer has a portion of the 8k "struct task_struct". 
Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626362126-27775-4-git-send-email-alan.maguire@oracle.com --- .../selftests/bpf/prog_tests/btf_dump.c | 598 ++++++++++++++++++ 1 file changed, 598 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 1b90e684ff13..0b4ba53b5961 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -232,7 +232,576 @@ void test_btf_dump_incremental(void) btf__free(btf); } +#define STRSIZE 4096 + +static void btf_dump_snprintf(void *ctx, const char *fmt, va_list args) +{ + char *s = ctx, new[STRSIZE]; + + vsnprintf(new, STRSIZE, fmt, args); + if (strlen(s) < STRSIZE) + strncat(s, new, STRSIZE - strlen(s) - 1); +} + +static int btf_dump_data(struct btf *btf, struct btf_dump *d, + char *name, char *prefix, __u64 flags, void *ptr, + size_t ptr_sz, char *str, const char *expected_val) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + size_t type_sz; + __s32 type_id; + int ret = 0; + + if (flags & BTF_F_COMPACT) + opts.compact = true; + if (flags & BTF_F_NONAME) + opts.skip_names = true; + if (flags & BTF_F_ZERO) + opts.emit_zeroes = true; + if (prefix) { + ASSERT_STRNEQ(name, prefix, strlen(prefix), + "verify prefix match"); + name += strlen(prefix) + 1; + } + type_id = btf__find_by_name(btf, name); + if (!ASSERT_GE(type_id, 0, "find type id")) + return -ENOENT; + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + ret = btf_dump__dump_type_data(d, type_id, ptr, ptr_sz, &opts); + if (type_sz <= ptr_sz) { + if (!ASSERT_EQ(ret, type_sz, "failed/unexpected type_sz")) + return -EINVAL; + } else { + if (!ASSERT_EQ(ret, -E2BIG, "failed to return -E2BIG")) + return -EINVAL; + } + if (!ASSERT_STREQ(str, expected_val, "ensure expected/actual match")) + return -EFAULT; + return 0; +} + +#define TEST_BTF_DUMP_DATA(_b, _d, _prefix, _str, _type, _flags, \ + _expected, ...) \ + do { \ + char __ptrtype[64] = #_type; \ + char *_ptrtype = (char *)__ptrtype; \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _ptrtype, _prefix, _flags, \ + _ptr, sizeof(_type), _str, \ + _expected); \ + } while (0) + +/* Use where expected data string matches its stringified declaration */ +#define TEST_BTF_DUMP_DATA_C(_b, _d, _prefix, _str, _type, _flags, \ + ...) \ + TEST_BTF_DUMP_DATA(_b, _d, _prefix, _str, _type, _flags, \ + "(" #_type ")" #__VA_ARGS__, __VA_ARGS__) + +/* overflow test; pass typesize < expected type size, ensure E2BIG returned */ +#define TEST_BTF_DUMP_DATA_OVER(_b, _d, _prefix, _str, _type, _type_sz, \ + _expected, ...) \ + do { \ + char __ptrtype[64] = #_type; \ + char *_ptrtype = (char *)__ptrtype; \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _ptrtype, _prefix, 0, \ + _ptr, _type_sz, _str, _expected); \ + } while (0) + +#define TEST_BTF_DUMP_VAR(_b, _d, _prefix, _str, _var, _type, _flags, \ + _expected, ...) 
\ + do { \ + _type _ptrdata = __VA_ARGS__; \ + void *_ptr = &_ptrdata; \ + \ + (void) btf_dump_data(_b, _d, _var, _prefix, _flags, \ + _ptr, sizeof(_type), _str, \ + _expected); \ + } while (0) + +static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple int */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, 1234); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "1234", 1234); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)1234", 1234); + + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT, "(int)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_ZERO, + "(int)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, -4567); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, + "-4567", -4567); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)-4567", -4567); + + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, int, sizeof(int)-1, "", 1); +} + +static void test_btf_dump_float_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + float t1 = 1.234567; + float t2 = -1.234567; + float t3 = 0.0; + double t4 = 5.678912; + double t5 = -5.678912; + double t6 = 0.0; + long double t7 = 9.876543; + long double t8 = -9.876543; + long double t9 = 0.0; + + /* since the kernel does not likely have any float types in its BTF, we + * will need to add some of various sizes. + */ + + ASSERT_GT(btf__add_float(btf, "test_float", 4), 0, "add float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t1, 4, str, + "(test_float)1.234567"), "dump float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t2, 4, str, + "(test_float)-1.234567"), "dump float"); + ASSERT_OK(btf_dump_data(btf, d, "test_float", NULL, 0, &t3, 4, str, + "(test_float)0.000000"), "dump float"); + + ASSERT_GT(btf__add_float(btf, "test_double", 8), 0, "add_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t4, 8, str, + "(test_double)5.678912"), "dump double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t5, 8, str, + "(test_double)-5.678912"), "dump double"); + ASSERT_OK(btf_dump_data(btf, d, "test_double", NULL, 0, &t6, 8, str, + "(test_double)0.000000"), "dump double"); + + ASSERT_GT(btf__add_float(btf, "test_long_double", 16), 0, "add long double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t7, 16, + str, "(test_long_double)9.876543"), + "dump long_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t8, 16, + str, "(test_long_double)-9.876543"), + "dump long_double"); + ASSERT_OK(btf_dump_data(btf, d, "test_long_double", NULL, 0, &t9, 16, + str, "(test_long_double)0.000000"), + "dump long_double"); +} + +static void test_btf_dump_char_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple char */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, char, BTF_F_COMPACT, 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME, + "100", 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, 0, "(char)100", 100); + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT, + "(char)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + 
TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_ZERO, + "(char)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, char, 0, "(char)0", 0); + + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, char, sizeof(char)-1, "", 100); +} + +static void test_btf_dump_typedef_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* simple typedef */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, uint64_t, BTF_F_COMPACT, 100); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_NONAME, + "1", 1); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, 0, "(u64)1", 1); + /* zero value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT, "(u64)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_NONAME, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, BTF_F_COMPACT | BTF_F_ZERO, + "(u64)0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "0", 0); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, u64, 0, "(u64)0", 0); + + /* typedef struct */ + TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, atomic_t, BTF_F_COMPACT, + {.counter = (int)1,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_NONAME, + "{1,}", { .counter = 1 }); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, 0, +"(atomic_t){\n" +" .counter = (int)1,\n" +"}", + {.counter = 1,}); + /* typedef with 0 value should be printed at toplevel */ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT, "(atomic_t){}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_NONAME, + "{}", {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, 0, +"(atomic_t){\n" +"}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_COMPACT | BTF_F_ZERO, + "(atomic_t){.counter = (int)0,}", + {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "{0,}", {.counter = 0,}); + TEST_BTF_DUMP_DATA(btf, d, NULL, str, atomic_t, BTF_F_ZERO, +"(atomic_t){\n" +" .counter = (int)0,\n" +"}", + { .counter = 0,}); + + /* overflow should show type but not value since it overflows */ + TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, atomic_t, sizeof(atomic_t)-1, + "(atomic_t){\n", { .counter = 1}); +} + +static void test_btf_dump_enum_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + /* enum where enum value does (and does not) exist */ + TEST_BTF_DUMP_DATA_C(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, + "(enum bpf_cmd)BPF_MAP_CREATE", 0); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME, + "BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, 0, + "(enum bpf_cmd)BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "BPF_MAP_CREATE", 0); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_ZERO, + "(enum bpf_cmd)BPF_MAP_CREATE", + BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "BPF_MAP_CREATE", BPF_MAP_CREATE); + TEST_BTF_DUMP_DATA_C(btf, d, "enum", str, enum bpf_cmd, BTF_F_COMPACT, 2000); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, + 
BTF_F_COMPACT | BTF_F_NONAME, + "2000", 2000); + TEST_BTF_DUMP_DATA(btf, d, "enum", str, enum bpf_cmd, 0, + "(enum bpf_cmd)2000", 2000); + + TEST_BTF_DUMP_DATA_OVER(btf, d, "enum", str, enum bpf_cmd, + sizeof(enum bpf_cmd) - 1, "", BPF_MAP_CREATE); +} + +static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + char zero_data[512] = { }; + char type_data[512]; + void *fops = type_data; + void *skb = type_data; + size_t type_sz; + __s32 type_id; + char *cmpstr; + int ret; + + memset(type_data, 255, sizeof(type_data)); + + /* simple struct */ + TEST_BTF_DUMP_DATA_C(btf, d, "struct", str, struct btf_enum, BTF_F_COMPACT, + {.name_off = (__u32)3,.val = (__s32)-1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{3,-1,}", + { .name_off = 3, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, 0, +"(struct btf_enum){\n" +" .name_off = (__u32)3,\n" +" .val = (__s32)-1,\n" +"}", + { .name_off = 3, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{-1,}", + { .name_off = 0, .val = -1,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME | BTF_F_ZERO, + "{0,-1,}", + { .name_off = 0, .val = -1,}); + /* empty struct should be printed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, BTF_F_COMPACT, + "(struct btf_enum){}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_NONAME, + "{}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, 0, +"(struct btf_enum){\n" +"}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_COMPACT | BTF_F_ZERO, + "(struct btf_enum){.name_off = (__u32)0,.val = (__s32)0,}", + { .name_off = 0, .val = 0,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct btf_enum, + BTF_F_ZERO, +"(struct btf_enum){\n" +" .name_off = (__u32)0,\n" +" .val = (__s32)0,\n" +"}", + { .name_off = 0, .val = 0,}); + + /* struct with pointers */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, BTF_F_COMPACT, + "(struct list_head){.next = (struct list_head *)0x1,}", + { .next = (struct list_head *)1 }); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, 0, +"(struct list_head){\n" +" .next = (struct list_head *)0x1,\n" +"}", + { .next = (struct list_head *)1 }); + /* NULL pointer should not be displayed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, BTF_F_COMPACT, + "(struct list_head){}", + { .next = (struct list_head *)0 }); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct list_head, 0, +"(struct list_head){\n" +"}", + { .next = (struct list_head *)0 }); + + /* struct with function pointers */ + type_id = btf__find_by_name(btf, "file_operations"); + if (ASSERT_GT(type_id, 0, "find type id")) { + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + + ret = btf_dump__dump_type_data(d, type_id, fops, type_sz, &opts); + ASSERT_EQ(ret, type_sz, + "unexpected return value dumping file_operations"); + cmpstr = +"(struct file_operations){\n" +" .owner = (struct module *)0xffffffffffffffff,\n" +" .llseek = (loff_t (*)(struct file *, loff_t, int))0xffffffffffffffff,"; + + ASSERT_STRNEQ(str, cmpstr, strlen(cmpstr), "file_operations"); + } + + /* struct with char array */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct 
bpf_prog_info, BTF_F_COMPACT, + "(struct bpf_prog_info){.name = (char[16])['f','o','o',],}", + { .name = "foo",}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, + BTF_F_COMPACT | BTF_F_NONAME, + "{['f','o','o',],}", + {.name = "foo",}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, 0, +"(struct bpf_prog_info){\n" +" .name = (char[16])[\n" +" 'f',\n" +" 'o',\n" +" 'o',\n" +" ],\n" +"}", + {.name = "foo",}); + /* leading null char means do not display string */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT, + "(struct bpf_prog_info){}", + {.name = {'\0', 'f', 'o', 'o'}}); + /* handle non-printable characters */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_prog_info, BTF_F_COMPACT, + "(struct bpf_prog_info){.name = (char[16])[1,2,3,],}", + { .name = {1, 2, 3, 0}}); + + /* struct with non-char array */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, BTF_F_COMPACT, + "(struct __sk_buff){.cb = (__u32[5])[1,2,3,4,5,],}", + { .cb = {1, 2, 3, 4, 5,},}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, + BTF_F_COMPACT | BTF_F_NONAME, + "{[1,2,3,4,5,],}", + { .cb = { 1, 2, 3, 4, 5},}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, 0, +"(struct __sk_buff){\n" +" .cb = (__u32[5])[\n" +" 1,\n" +" 2,\n" +" 3,\n" +" 4,\n" +" 5,\n" +" ],\n" +"}", + { .cb = { 1, 2, 3, 4, 5},}); + /* For non-char, arrays, show non-zero values only */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, BTF_F_COMPACT, + "(struct __sk_buff){.cb = (__u32[5])[0,0,1,0,0,],}", + { .cb = { 0, 0, 1, 0, 0},}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct __sk_buff, 0, +"(struct __sk_buff){\n" +" .cb = (__u32[5])[\n" +" 0,\n" +" 0,\n" +" 1,\n" +" 0,\n" +" 0,\n" +" ],\n" +"}", + { .cb = { 0, 0, 1, 0, 0},}); + + /* struct with bitfields */ + TEST_BTF_DUMP_DATA_C(btf, d, "struct", str, struct bpf_insn, BTF_F_COMPACT, + {.code = (__u8)1,.dst_reg = (__u8)0x2,.src_reg = (__u8)0x3,.off = (__s16)4,.imm = (__s32)5,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, + BTF_F_COMPACT | BTF_F_NONAME, + "{1,0x2,0x3,4,5,}", + { .code = 1, .dst_reg = 0x2, .src_reg = 0x3, .off = 4, + .imm = 5,}); + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, 0, +"(struct bpf_insn){\n" +" .code = (__u8)1,\n" +" .dst_reg = (__u8)0x2,\n" +" .src_reg = (__u8)0x3,\n" +" .off = (__s16)4,\n" +" .imm = (__s32)5,\n" +"}", + {.code = 1, .dst_reg = 2, .src_reg = 3, .off = 4, .imm = 5}); + + /* zeroed bitfields should not be displayed */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_insn, BTF_F_COMPACT, + "(struct bpf_insn){.dst_reg = (__u8)0x1,}", + { .code = 0, .dst_reg = 1}); + + /* struct with enum bitfield */ + type_id = btf__find_by_name(btf, "fs_context"); + if (ASSERT_GT(type_id, 0, "find fs_context")) { + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + + opts.emit_zeroes = true; + ret = btf_dump__dump_type_data(d, type_id, zero_data, type_sz, &opts); + ASSERT_EQ(ret, type_sz, + "unexpected return value dumping fs_context"); + + ASSERT_NEQ(strstr(str, "FS_CONTEXT_FOR_MOUNT"), NULL, + "bitfield value not present"); + } + + /* struct with nested anon union */ + TEST_BTF_DUMP_DATA(btf, d, "struct", str, struct bpf_sock_ops, BTF_F_COMPACT, + "(struct bpf_sock_ops){.op = (__u32)1,(union){.args = (__u32[4])[1,2,3,4,],.reply = (__u32)1,.replylong = (__u32[4])[1,2,3,4,],},}", + { .op = 1, .args = { 1, 2, 3, 4}}); + + /* union with nested struct */ + TEST_BTF_DUMP_DATA(btf, d, "union", str, 
union bpf_iter_link_info, BTF_F_COMPACT, + "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},}", + { .map = { .map_fd = 1 }}); + + /* struct skb with nested structs/unions; because type output is so + * complex, we don't do a string comparison, just verify we return + * the type size as the amount of data displayed. + */ + type_id = btf__find_by_name(btf, "sk_buff"); + if (ASSERT_GT(type_id, 0, "find struct sk_buff")) { + type_sz = btf__resolve_size(btf, type_id); + str[0] = '\0'; + + ret = btf_dump__dump_type_data(d, type_id, skb, type_sz, &opts); + ASSERT_EQ(ret, type_sz, + "unexpected return value dumping sk_buff"); + } + + /* overflow bpf_sock_ops struct with final element nonzero/zero. + * Regardless of the value of the final field, we don't have all the + * data we need to display it, so we should trigger an overflow. + * In other words oveflow checking should trump "is field zero?" + * checks because if we've overflowed, it shouldn't matter what the + * field is - we can't trust its value so shouldn't display it. + */ + TEST_BTF_DUMP_DATA_OVER(btf, d, "struct", str, struct bpf_sock_ops, + sizeof(struct bpf_sock_ops) - 1, + "(struct bpf_sock_ops){\n\t.op = (__u32)1,\n", + { .op = 1, .skb_tcp_flags = 2}); + TEST_BTF_DUMP_DATA_OVER(btf, d, "struct", str, struct bpf_sock_ops, + sizeof(struct bpf_sock_ops) - 1, + "(struct bpf_sock_ops){\n\t.op = (__u32)1,\n", + { .op = 1, .skb_tcp_flags = 0}); +} + +static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + char *str) +{ + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, + "static int cpu_profile_flip = (int)2", 2); +} + +static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, + const char *name, const char *expected_val, + void *data, size_t data_sz) +{ + DECLARE_LIBBPF_OPTS(btf_dump_type_data_opts, opts); + int ret = 0, cmp; + size_t secsize; + __s32 type_id; + + opts.compact = true; + + type_id = btf__find_by_name(btf, name); + if (!ASSERT_GT(type_id, 0, "find type id")) + return; + + secsize = btf__resolve_size(btf, type_id); + ASSERT_EQ(secsize, 0, "verify section size"); + + str[0] = '\0'; + ret = btf_dump__dump_type_data(d, type_id, data, data_sz, &opts); + ASSERT_EQ(ret, 0, "unexpected return value"); + + cmp = strcmp(str, expected_val); + ASSERT_EQ(cmp, 0, "ensure expected/actual match"); +} + +static void test_btf_dump_datasec_data(char *str) +{ + struct btf *btf = btf__parse("xdping_kern.o", NULL); + struct btf_dump_opts opts = { .ctx = str }; + char license[4] = "GPL"; + struct btf_dump *d; + + if (!ASSERT_OK_PTR(btf, "xdping_kern.o BTF not found")) + return; + + d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf); + if (!ASSERT_OK_PTR(d, "could not create BTF dump")) + return; + + test_btf_datasec(btf, d, str, "license", + "SEC(\"license\") char[4] _license = (char[4])['G','P','L',];", + license, sizeof(license)); +} + void test_btf_dump() { + char str[STRSIZE]; + struct btf_dump_opts opts = { .ctx = str }; + struct btf_dump *d; + struct btf *btf; int i; for (i = 0; i < ARRAY_SIZE(btf_dump_test_cases); i++) { @@ -245,4 +814,33 @@ void test_btf_dump() { } if (test__start_subtest("btf_dump: incremental")) test_btf_dump_incremental(); + + btf = libbpf_find_kernel_btf(); + if (!ASSERT_OK_PTR(btf, "no kernel BTF found")) + return; + + d = btf_dump__new(btf, NULL, &opts, btf_dump_snprintf); + if (!ASSERT_OK_PTR(d, "could not create BTF 
dump")) + return; + + /* Verify type display for various types. */ + if (test__start_subtest("btf_dump: int_data")) + test_btf_dump_int_data(btf, d, str); + if (test__start_subtest("btf_dump: float_data")) + test_btf_dump_float_data(btf, d, str); + if (test__start_subtest("btf_dump: char_data")) + test_btf_dump_char_data(btf, d, str); + if (test__start_subtest("btf_dump: typedef_data")) + test_btf_dump_typedef_data(btf, d, str); + if (test__start_subtest("btf_dump: enum_data")) + test_btf_dump_enum_data(btf, d, str); + if (test__start_subtest("btf_dump: struct_data")) + test_btf_dump_struct_data(btf, d, str); + if (test__start_subtest("btf_dump: var_data")) + test_btf_dump_var_data(btf, d, str); + btf_dump__free(d); + btf__free(btf); + + if (test__start_subtest("btf_dump: datasec_data")) + test_btf_dump_datasec_data(str); } From 8d44c3578b48d5f605eddcfd6a644e3944455a6b Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 16 Jul 2021 23:46:55 +0100 Subject: [PATCH 09/64] libbpf: Clarify/fix unaligned data issues for btf typed dump If data is packed, data structures can store it outside of usual boundaries. For example a 4-byte int can be stored on a unaligned boundary in a case like this: struct s { char f1; int f2; } __attribute((packed)); ...the int is stored at an offset of one byte. Some platforms have problems dereferencing data that is not aligned with its size, and code exists to handle most cases of this for BTF typed data display. However pointer display was missed, and a simple function to test if "ptr_is_aligned(data, data_sz)" would help clarify this code. Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626475617-25984-2-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf_dump.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 929cf931ed66..bf5bb4b127ed 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1654,6 +1654,11 @@ static int btf_dump_base_type_check_zero(struct btf_dump *d, return 0; } +static bool ptr_is_aligned(const void *data, int data_sz) +{ + return ((uintptr_t)data) % data_sz == 0; +} + static int btf_dump_int_data(struct btf_dump *d, const struct btf_type *t, __u32 type_id, @@ -1672,7 +1677,7 @@ static int btf_dump_int_data(struct btf_dump *d, /* handle packed int data - accesses of integers not aligned on * int boundaries can cause problems on some platforms. 
*/ - if (((uintptr_t)data) % sz) + if (!ptr_is_aligned(data, sz)) return btf_dump_bitfield_data(d, t, data, 0, 0); switch (sz) { @@ -1739,7 +1744,7 @@ static int btf_dump_float_data(struct btf_dump *d, int sz = t->size; /* handle unaligned data; copy to local union */ - if (((uintptr_t)data) % sz) { + if (!ptr_is_aligned(data, sz)) { memcpy(&fl, data, sz); flp = &fl; } @@ -1892,12 +1897,27 @@ static int btf_dump_struct_data(struct btf_dump *d, return err; } +union ptr_data { + unsigned int p; + unsigned long long lp; +}; + static int btf_dump_ptr_data(struct btf_dump *d, const struct btf_type *t, __u32 id, const void *data) { - btf_dump_type_values(d, "%p", *(void **)data); + if (ptr_is_aligned(data, d->ptr_sz) && d->ptr_sz == sizeof(void *)) { + btf_dump_type_values(d, "%p", *(void **)data); + } else { + union ptr_data pt; + + memcpy(&pt, data, d->ptr_sz); + if (d->ptr_sz == 4) + btf_dump_type_values(d, "0x%x", pt.p); + else + btf_dump_type_values(d, "0x%llx", pt.lp); + } return 0; } @@ -1910,7 +1930,7 @@ static int btf_dump_get_enum_value(struct btf_dump *d, int sz = t->size; /* handle unaligned enum value */ - if (((uintptr_t)data) % sz) { + if (!ptr_is_aligned(data, sz)) { *value = (__s64)btf_dump_bitfield_get_data(d, t, data, 0, 0); return 0; } From 04eb4dff6a64d842f7f2c85c7cb1affc5ab3ebc9 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 16 Jul 2021 23:46:56 +0100 Subject: [PATCH 10/64] libbpf: Fix compilation errors on ppc64le for btf dump typed data __s64 can be defined as either long or long long, depending on the architecture. On ppc64le it's defined as long, giving this error: In file included from btf_dump.c:22: btf_dump.c: In function 'btf_dump_type_data_check_overflow': libbpf_internal.h:111:22: error: format '%lld' expects argument of type 'long long int', but argument 3 has type '__s64' {aka 'long int'} [-Werror=format=] 111 | libbpf_print(level, "libbpf: " fmt, ##__VA_ARGS__); \ | ^~~~~~~~~~ libbpf_internal.h:114:27: note: in expansion of macro '__pr' 114 | #define pr_warn(fmt, ...) __pr(LIBBPF_WARN, fmt, ##__VA_ARGS__) | ^~~~ btf_dump.c:1992:3: note: in expansion of macro 'pr_warn' 1992 | pr_warn("unexpected size [%lld] for id [%u]\n", | ^~~~~~~ btf_dump.c:1992:32: note: format string is defined here 1992 | pr_warn("unexpected size [%lld] for id [%u]\n", | ~~~^ | | | long long int | %ld Cast to size_t and use %zu instead. Reported-by: Andrii Nakryiko Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626475617-25984-3-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index bf5bb4b127ed..aa695ab9b826 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2009,8 +2009,8 @@ static int btf_dump_type_data_check_overflow(struct btf_dump *d, __s64 size = btf__resolve_size(d->btf, id); if (size < 0 || size >= INT_MAX) { - pr_warn("unexpected size [%lld] for id [%u]\n", - size, id); + pr_warn("unexpected size [%zu] for id [%u]\n", + (size_t)size, id); return -EINVAL; } From add192f81ab21b58471577c75e7be9c9add98223 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 16 Jul 2021 23:46:57 +0100 Subject: [PATCH 11/64] libbpf: Btf typed dump does not need to allocate dump data By using the stack for this small structure, we avoid the need for freeing memory in error paths. 
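In other words (a sketch of the pattern, not the hunk itself):

  /* before: heap allocation that has to be freed on every exit path */
  d->typed_dump = calloc(1, sizeof(struct btf_dump_data));
  if (!d->typed_dump)
          return libbpf_err(-ENOMEM);
  ...
  free(d->typed_dump);

  /* after: short-lived scratch state lives on the caller's stack */
  struct btf_dump_data typed_dump = {};

  d->typed_dump = &typed_dump;
  ...
  d->typed_dump = NULL;   /* nothing to free, even on error returns */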
Suggested-by: Andrii Nakryiko Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626475617-25984-4-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf_dump.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index aa695ab9b826..accf6fea57da 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2238,6 +2238,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, const void *data, size_t data_sz, const struct btf_dump_type_data_opts *opts) { + struct btf_dump_data typed_dump = {}; const struct btf_type *t; int ret; @@ -2248,12 +2249,10 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, if (!t) return libbpf_err(-ENOENT); - d->typed_dump = calloc(1, sizeof(struct btf_dump_data)); - if (!d->typed_dump) - return libbpf_err(-ENOMEM); - + d->typed_dump = &typed_dump; d->typed_dump->data_end = data + data_sz; d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); + /* default indent string is a tab */ if (!opts->indent_str) d->typed_dump->indent_str[0] = '\t'; @@ -2267,7 +2266,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0); - free(d->typed_dump); + d->typed_dump = NULL; return libbpf_err(ret); } From a21ab4c59e09c2a9994a6e393b7484e3b3f78a99 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Mon, 19 Jul 2021 19:38:37 +0200 Subject: [PATCH 12/64] libbpf: Fix removal of inner map in bpf_object__create_map If creating an outer map of a BTF-defined map-in-map fails (via bpf_object__create_map()), then the previously created its inner map won't be destroyed. Fix this by ensuring that the destroy routines are not bypassed in the case of a failure. Fixes: 646f02ffdd49c ("libbpf: Add BTF-defined map-in-map support") Reported-by: Andrii Nakryiko Signed-off-by: Martynas Pumputis Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210719173838.423148-2-m@lambda.lt --- tools/lib/bpf/libbpf.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 17bc922f543f..4c153c379989 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4527,6 +4527,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b { struct bpf_create_map_attr create_attr; struct bpf_map_def *def = &map->def; + int err = 0; memset(&create_attr, 0, sizeof(create_attr)); @@ -4569,8 +4570,6 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b if (bpf_map_type__is_map_in_map(def->type)) { if (map->inner_map) { - int err; - err = bpf_object__create_map(obj, map->inner_map, true); if (err) { pr_warn("map '%s': failed to create inner map: %d\n", @@ -4595,8 +4594,8 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { char *cp, errmsg[STRERR_BUFSIZE]; - int err = -errno; + err = -errno; cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, cp, err); @@ -4608,8 +4607,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b map->fd = bpf_create_map_xattr(&create_attr); } - if (map->fd < 0) - return -errno; + err = map->fd < 0 ? 
-errno : 0; if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { if (obj->gen_loader) @@ -4618,7 +4616,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b zfree(&map->inner_map); } - return 0; + return err; } static int init_map_slots(struct bpf_object *obj, struct bpf_map *map) From 08f71a1e39a1f07a464ac782d9b612d6a74c7015 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Mon, 19 Jul 2021 19:38:38 +0200 Subject: [PATCH 13/64] selftests/bpf: Check inner map deletion Add a test case to check whether an unsuccessful creation of an outer map of a BTF-defined map-in-map destroys the inner map. As bpf_object__create_map() is a static function, we cannot just call it from the test case and then check whether a map accessible via map->inner_map_fd has been closed. Instead, we iterate over all maps and check whether the map "$MAP_NAME.inner" does not exist. Signed-off-by: Martynas Pumputis Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210719173838.423148-3-m@lambda.lt --- .../bpf/progs/test_map_in_map_invalid.c | 26 ++++++++ tools/testing/selftests/bpf/test_maps.c | 63 ++++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c diff --git a/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c b/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c new file mode 100644 index 000000000000..703c08e06442 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_map_in_map_invalid.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Isovalent, Inc. */ +#include +#include + +struct inner { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, int); + __uint(max_entries, 4); +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 0); /* This will make map creation to fail */ + __uint(key_size, sizeof(__u32)); + __array(values, struct inner); +} mim SEC(".maps"); + +SEC("xdp") +int xdp_noop0(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 30cbf5d98f7d..d832d135211c 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1153,12 +1153,16 @@ static void test_sockmap(unsigned int tasks, void *data) } #define MAPINMAP_PROG "./test_map_in_map.o" +#define MAPINMAP_INVALID_PROG "./test_map_in_map_invalid.o" static void test_map_in_map(void) { struct bpf_object *obj; struct bpf_map *map; int mim_fd, fd, err; int pos = 0; + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 id = 0; obj = bpf_object__open(MAPINMAP_PROG); @@ -1228,11 +1232,68 @@ static void test_map_in_map(void) } close(fd); + fd = -1; bpf_object__close(obj); + + /* Test that failing bpf_object__create_map() destroys the inner map */ + obj = bpf_object__open(MAPINMAP_INVALID_PROG); + err = libbpf_get_error(obj); + if (err) { + printf("Failed to load %s program: %d %d", + MAPINMAP_INVALID_PROG, err, errno); + goto out_map_in_map; + } + + map = bpf_object__find_map_by_name(obj, "mim"); + if (!map) { + printf("Failed to load array of maps from test prog\n"); + goto out_map_in_map; + } + + err = bpf_object__load(obj); + if (!err) { + printf("Loading obj supposed to fail\n"); + goto out_map_in_map; + } + + /* Iterate over all maps to check whether the internal map + * ("mim.internal") has 
been destroyed. + */ + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + printf("Failed to get next map: %d", errno); + goto out_map_in_map; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + printf("Failed to get map by id %u: %d", id, errno); + goto out_map_in_map; + } + + err = bpf_obj_get_info_by_fd(fd, &info, &len); + if (err) { + printf("Failed to get map info by fd %d: %d", fd, + errno); + goto out_map_in_map; + } + + if (!strcmp(info.name, "mim.inner")) { + printf("Inner map mim.inner was not destroyed\n"); + goto out_map_in_map; + } + } + return; out_map_in_map: - close(fd); + if (fd >= 0) + close(fd); exit(1); } From c39aa21599748f3845a47645f482d94099b11460 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 20 Jul 2021 11:48:32 -0700 Subject: [PATCH 14/64] bpf, selftests: Fix test_maps now that sockmap supports UDP UDP socket support was added recently so testing UDP insert failure is no longer correct and causes test_maps failure. The fix is easy though, we simply need to test that UDP is correctly added instead of blocked. Fixes: 122e6c79efe1c ("sock_map: Update sock type checks for UDP") Reported-by: Andrii Nakryiko Signed-off-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210720184832.452430-1-john.fastabend@gmail.com --- tools/testing/selftests/bpf/test_maps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index d832d135211c..5a8e069e64fa 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -764,8 +764,8 @@ static void test_sockmap(unsigned int tasks, void *data) udp = socket(AF_INET, SOCK_DGRAM, 0); i = 0; err = bpf_map_update_elem(fd, &i, &udp, BPF_ANY); - if (!err) { - printf("Failed socket SOCK_DGRAM allowed '%i:%i'\n", + if (err) { + printf("Failed socket update SOCK_DGRAM '%i:%i'\n", i, udp); goto out_sockmap; } From 875fc315dbc3cdd3a5cab57b2bb7c1976ec8df44 Mon Sep 17 00:00:00 2001 From: Vincent Li Date: Mon, 19 Jul 2021 15:30:22 -0700 Subject: [PATCH 15/64] selftests, bpf: test_tc_tunnel.sh nc: Cannot use -p and -l When run test_tc_tunnel.sh, it complains following error ipip encap 192.168.1.1 to 192.168.1.2, type ipip, mac none len 100 test basic connectivity nc: cannot use -p and -l nc man page has: -l Listen for an incoming connection rather than initiating a connection to a remote host.Cannot be used together with any of the options -psxz. Additionally, any timeouts specified with the -w option are ignored. Correct nc in server_listen(). Signed-off-by: Vincent Li Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210719223022.66681-1-vincent.mc.li@gmail.com --- tools/testing/selftests/bpf/test_tc_tunnel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh index c9dde9b9d987..088fcad138c9 100755 --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh @@ -69,7 +69,7 @@ cleanup() { } server_listen() { - ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" & + ip netns exec "${ns2}" nc "${netcat_opt}" -l "${port}" > "${outfile}" & server_pid=$! 
sleep 0.2 } From a1d3cc3c5eca598cfabee3a35f30f34fbe2f709b Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 20 Jul 2021 09:49:51 +0100 Subject: [PATCH 16/64] libbpf: Avoid use of __int128 in typed dump display __int128 is not supported for some 32-bit platforms (arm and i386). __int128 was used in carrying out computations on bitfields which aid display, but the same calculations could be done with __u64 with the small effect of not supporting 128-bit bitfields. With these changes, a big-endian issue with casting 128-bit integers to 64-bit for enum bitfields is solved also, as we now use 64-bit integers for bitfield calculations. Reported-by: Naresh Kamboju Reported-by: Linux Kernel Functional Testing Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626770993-11073-2-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf_dump.c | 98 ++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index accf6fea57da..d52e546a515c 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1552,31 +1552,26 @@ static int btf_dump_unsupported_data(struct btf_dump *d, return -ENOTSUP; } -static void btf_dump_int128(struct btf_dump *d, - const struct btf_type *t, - const void *data) -{ - __int128 num = *(__int128 *)data; - - if ((num >> 64) == 0) - btf_dump_type_values(d, "0x%llx", (long long)num); - else - btf_dump_type_values(d, "0x%llx%016llx", (long long)num >> 32, - (long long)num); -} - -static unsigned __int128 btf_dump_bitfield_get_data(struct btf_dump *d, - const struct btf_type *t, - const void *data, - __u8 bits_offset, - __u8 bit_sz) +static int btf_dump_get_bitfield_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz, + __u64 *value) { __u16 left_shift_bits, right_shift_bits; __u8 nr_copy_bits, nr_copy_bytes; - unsigned __int128 num = 0, ret; const __u8 *bytes = data; + int sz = t->size; + __u64 num = 0; int i; + /* Maximum supported bitfield size is 64 bits */ + if (sz > 8) { + pr_warn("unexpected bitfield size %d\n", sz); + return -EINVAL; + } + /* Bitfield value retrieval is done in two steps; first relevant bytes are * stored in num, then we left/right shift num to eliminate irrelevant bits. 
*/ @@ -1591,12 +1586,12 @@ static unsigned __int128 btf_dump_bitfield_get_data(struct btf_dump *d, #else # error "Unrecognized __BYTE_ORDER__" #endif - left_shift_bits = 128 - nr_copy_bits; - right_shift_bits = 128 - bit_sz; + left_shift_bits = 64 - nr_copy_bits; + right_shift_bits = 64 - bit_sz; - ret = (num << left_shift_bits) >> right_shift_bits; + *value = (num << left_shift_bits) >> right_shift_bits; - return ret; + return 0; } static int btf_dump_bitfield_check_zero(struct btf_dump *d, @@ -1605,9 +1600,12 @@ static int btf_dump_bitfield_check_zero(struct btf_dump *d, __u8 bits_offset, __u8 bit_sz) { - __int128 check_num; + __u64 check_num; + int err; - check_num = btf_dump_bitfield_get_data(d, t, data, bits_offset, bit_sz); + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &check_num); + if (err) + return err; if (check_num == 0) return -ENODATA; return 0; @@ -1619,10 +1617,14 @@ static int btf_dump_bitfield_data(struct btf_dump *d, __u8 bits_offset, __u8 bit_sz) { - unsigned __int128 print_num; + __u64 print_num; + int err; - print_num = btf_dump_bitfield_get_data(d, t, data, bits_offset, bit_sz); - btf_dump_int128(d, t, &print_num); + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &print_num); + if (err) + return err; + + btf_dump_type_values(d, "0x%llx", (unsigned long long)print_num); return 0; } @@ -1681,9 +1683,29 @@ static int btf_dump_int_data(struct btf_dump *d, return btf_dump_bitfield_data(d, t, data, 0, 0); switch (sz) { - case 16: - btf_dump_int128(d, t, data); + case 16: { + const __u64 *ints = data; + __u64 lsi, msi; + + /* avoid use of __int128 as some 32-bit platforms do not + * support it. + */ +#if __BYTE_ORDER == __LITTLE_ENDIAN + lsi = ints[0]; + msi = ints[1]; +#elif __BYTE_ORDER == __BIG_ENDIAN + lsi = ints[1]; + msi = ints[0]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (msi == 0) + btf_dump_type_values(d, "0x%llx", (unsigned long long)lsi); + else + btf_dump_type_values(d, "0x%llx%016llx", (unsigned long long)msi, + (unsigned long long)lsi); break; + } case 8: if (sign) btf_dump_type_values(d, "%lld", *(long long *)data); @@ -1931,9 +1953,16 @@ static int btf_dump_get_enum_value(struct btf_dump *d, /* handle unaligned enum value */ if (!ptr_is_aligned(data, sz)) { - *value = (__s64)btf_dump_bitfield_get_data(d, t, data, 0, 0); + __u64 val; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, 0, 0, &val); + if (err) + return err; + *value = (__s64)val; return 0; } + switch (t->size) { case 8: *value = *(__s64 *)data; @@ -2209,10 +2238,13 @@ static int btf_dump_dump_type_data(struct btf_dump *d, case BTF_KIND_ENUM: /* handle bitfield and int enum values */ if (bit_sz) { - unsigned __int128 print_num; + __u64 print_num; __s64 enum_val; - print_num = btf_dump_bitfield_get_data(d, t, data, bits_offset, bit_sz); + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, + &print_num); + if (err) + break; enum_val = (__s64)print_num; err = btf_dump_enum_data(d, t, id, &enum_val); } else From a17553dde294289c5f45dd79b6cfd532df610b5f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 20 Jul 2021 09:49:52 +0100 Subject: [PATCH 17/64] selftests/bpf: Add __int128-specific tests for typed data dump Add tests for __int128 display for platforms that support it. __int128s are dumped as hex values. 
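For reference, the way a 16-byte integer ends up printed as hex without __int128 support can be sketched in plain userspace C (illustrative only; on big-endian targets the two halves are swapped, as the previous patch does):

  #include <stdio.h>
  #include <stdint.h>

  /* print a 16-byte little-endian integer as hex using two 64-bit words */
  static void print_u128_hex(const void *data)
  {
          const uint64_t *words = data;
          uint64_t lo = words[0], hi = words[1];

          if (hi == 0)
                  printf("0x%llx\n", (unsigned long long)lo);
          else
                  printf("0x%llx%016llx\n", (unsigned long long)hi,
                         (unsigned long long)lo);
  }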
Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626770993-11073-3-git-send-email-alan.maguire@oracle.com --- .../testing/selftests/bpf/prog_tests/btf_dump.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 0b4ba53b5961..52ccf0cf35e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -327,6 +327,14 @@ static int btf_dump_data(struct btf *btf, struct btf_dump *d, static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d, char *str) { +#ifdef __SIZEOF_INT128__ + __int128 i = 0xffffffffffffffff; + + /* this dance is required because we cannot directly initialize + * a 128-bit value to anything larger than a 64-bit value. + */ + i = (i << 64) | (i - 1); +#endif /* simple int */ TEST_BTF_DUMP_DATA_C(btf, d, NULL, str, int, BTF_F_COMPACT, 1234); TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, BTF_F_COMPACT | BTF_F_NONAME, @@ -348,6 +356,15 @@ static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d, TEST_BTF_DUMP_DATA(btf, d, NULL, str, int, 0, "(int)-4567", -4567); TEST_BTF_DUMP_DATA_OVER(btf, d, NULL, str, int, sizeof(int)-1, "", 1); + +#ifdef __SIZEOF_INT128__ + TEST_BTF_DUMP_DATA(btf, d, NULL, str, __int128, BTF_F_COMPACT, + "(__int128)0xffffffffffffffff", + 0xffffffffffffffff); + ASSERT_OK(btf_dump_data(btf, d, "__int128", NULL, 0, &i, 16, str, + "(__int128)0xfffffffffffffffffffffffffffffffe"), + "dump __int128"); +#endif } static void test_btf_dump_float_data(struct btf *btf, struct btf_dump *d, From 720c29fca9fb87c473148ff666b75314420cdda6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 20 Jul 2021 09:49:53 +0100 Subject: [PATCH 18/64] libbpf: Propagate errors when retrieving enum value for typed data display When retrieving the enum value associated with typed data during "is data zero?" checking in btf_dump_type_data_check_zero(), the return value of btf_dump_get_enum_value() is not passed to the caller if the function returns a non-zero (error) value. Currently, 0 is returned if the function returns an error. We should instead propagate the error to the caller. Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1626770993-11073-4-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf_dump.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index d52e546a515c..e4b483f15fb9 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2166,8 +2166,9 @@ static int btf_dump_type_data_check_zero(struct btf_dump *d, return -ENODATA; } case BTF_KIND_ENUM: - if (btf_dump_get_enum_value(d, t, data, id, &value)) - return 0; + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; if (value == 0) return -ENODATA; return 0; From 16c5900ba776c5acd6568abd60c40f948a96e496 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jul 2021 23:19:45 +0200 Subject: [PATCH 19/64] bpf: Fix pointer cast warning kp->addr is a pointer, so it cannot be cast directly to a 'u64' when it gets interpreted as an integer value: kernel/trace/bpf_trace.c: In function '____bpf_get_func_ip_kprobe': kernel/trace/bpf_trace.c:968:21: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 968 | return kp ? (u64) kp->addr : 0; Use the uintptr_t type instead. 
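The portable idiom, sketched outside the kernel context (some_pointer stands in for any pointer value such as kp->addr):

  void *p = some_pointer;
  u64 ip;

  ip = (u64)p;             /* warns where sizeof(void *) != sizeof(u64) */
  ip = (u64)(uintptr_t)p;  /* fine on both 32-bit and 64-bit targets */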
Fixes: 9ffd9f3ff719 ("bpf: Add bpf_get_func_ip helper for kprobe programs") Signed-off-by: Arnd Bergmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721212007.3876595-1-arnd@kernel.org --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08906007306d..1f22ce1fa971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -965,7 +965,7 @@ BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct kprobe *kp = kprobe_running(); - return kp ? (u64) kp->addr : 0; + return kp ? (uintptr_t)kp->addr : 0; } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { From 724f17b7d45d62c71e92471666647a823cb9baa9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 12:56:30 +0100 Subject: [PATCH 20/64] bpf: Remove redundant intiialization of variable stype The variable stype is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721115630.109279-1-colin.king@canonical.com --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 362e81481594..7ed2a14dc0de 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -406,7 +406,7 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { - enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + enum bpf_cgroup_storage_type stype; struct bpf_cgroup_storage *storage; int cpu; From 9907442fcddbdacf55fa22e31f2306ae0d6172d6 Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Wed, 21 Jul 2021 16:09:41 +0200 Subject: [PATCH 21/64] selftests/bpf: Mute expected invalid map creation error msg Previously, the newly introduced test case in test_map_in_map(), which checks whether the inner map is destroyed after unsuccessful creation of the outer map, logged the following harmless and expected error: libbpf: map 'mim': failed to create: Invalid argument(-22) libbpf: failed to load object './test_map_in_map_invalid.o' To avoid any possible confusion, mute the logging during loading of the prog. 
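The resulting pattern in the test looks roughly like this (a fragment; the actual hunk is below and jumps to its own error label):

  libbpf_print_fn_t old_print_fn;

  /* silence expected "failed to create"/"failed to load" messages */
  old_print_fn = libbpf_set_print(NULL);
  err = bpf_object__load(obj);
  libbpf_set_print(old_print_fn);

  if (!err) {
          /* loading was supposed to fail */
  }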
Fixes: 08f71a1e39a1 ("selftests/bpf: Check inner map deletion") Reported-by: Andrii Nakryiko Signed-off-by: Martynas Pumputis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721140941.563175-1-m@lambda.lt --- tools/testing/selftests/bpf/test_maps.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 5a8e069e64fa..14cea869235b 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1163,6 +1163,7 @@ static void test_map_in_map(void) struct bpf_map_info info = {}; __u32 len = sizeof(info); __u32 id = 0; + libbpf_print_fn_t old_print_fn; obj = bpf_object__open(MAPINMAP_PROG); @@ -1250,12 +1251,16 @@ static void test_map_in_map(void) goto out_map_in_map; } + old_print_fn = libbpf_set_print(NULL); + err = bpf_object__load(obj); if (!err) { printf("Loading obj supposed to fail\n"); goto out_map_in_map; } + libbpf_set_print(old_print_fn); + /* Iterate over all maps to check whether the internal map * ("mim.internal") has been destroyed. */ From 1f71a468a75ff4f13c55966c74284aa4a6bcc334 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 21 Jul 2021 23:58:08 +0200 Subject: [PATCH 22/64] libbpf: Fix func leak in attach_kprobe Add missing free() for func pointer in attach_kprobe function. Fixes: a2488b5f483f ("libbpf: Allow specification of "kprobe/function+offset"") Reported-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Alan Maguire Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20210721215810.889975-2-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4c153c379989..d46c2dd37be2 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10431,6 +10431,7 @@ static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, return libbpf_err_ptr(err); } if (opts.retprobe && offset != 0) { + free(func); err = -EINVAL; pr_warn("kretprobes do not support offset specification\n"); return libbpf_err_ptr(err); From e3f9bc35ea7e9871667d7f769cf0079211828e89 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 21 Jul 2021 23:58:09 +0200 Subject: [PATCH 23/64] libbpf: Allow decimal offset for kprobes Allow to specify decimal offset in SEC macro, like: SEC("kprobe/bpf_fentry_test7+5") Add selftest for that. 
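A standalone illustration of why "%li" accepts both spellings (plain userspace C, not part of the patch; the real code uses "%m[...]" to allocate the function name):

  #include <stdio.h>

  int main(void)
  {
          char func[128];
          long off = -1;

          /* %li auto-detects the base: decimal, 0x-prefixed hex, or octal */
          sscanf("bpf_fentry_test7+5", "%127[a-zA-Z0-9_.]+%li", func, &off);
          printf("%s %ld\n", func, off);          /* bpf_fentry_test7 5 */

          sscanf("bpf_fentry_test7+0x5", "%127[a-zA-Z0-9_.]+%li", func, &off);
          printf("%s %ld\n", func, off);          /* bpf_fentry_test7 5 */

          return 0;
  }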
Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Alan Maguire Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20210721215810.889975-3-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 2 +- .../selftests/bpf/prog_tests/get_func_ip_test.c | 2 ++ tools/testing/selftests/bpf/progs/get_func_ip_test.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index d46c2dd37be2..52f4f1d4f495 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10424,7 +10424,7 @@ static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, func_name = prog->sec_name + sec->len; opts.retprobe = strcmp(sec->sec, "kretprobe/") == 0; - n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%lx", &func, &offset); + n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); if (n < 1) { err = -EINVAL; pr_warn("kprobe name is invalid: %s\n", func_name); diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c index 088b3653610d..02a465f36d59 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -17,6 +17,7 @@ void test_get_func_ip_test(void) */ #ifndef __x86_64__ bpf_program__set_autoload(skel->progs.test6, false); + bpf_program__set_autoload(skel->progs.test7, false); #endif err = get_func_ip_test__load(skel); @@ -46,6 +47,7 @@ void test_get_func_ip_test(void) ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); #ifdef __x86_64__ ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); + ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); #endif cleanup: diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index acd587b6e859..a587aeca5ae0 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -11,6 +11,7 @@ extern const void bpf_fentry_test3 __ksym; extern const void bpf_fentry_test4 __ksym; extern const void bpf_modify_return_test __ksym; extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; __u64 test1_result = 0; SEC("fentry/bpf_fentry_test1") @@ -71,3 +72,13 @@ int test6(struct pt_regs *ctx) test6_result = (const void *) addr == &bpf_fentry_test6 + 5; return 0; } + +__u64 test7_result = 0; +SEC("kprobe/bpf_fentry_test7+5") +int test7(struct pt_regs *ctx) +{ + __u64 addr = bpf_get_func_ip(ctx); + + test7_result = (const void *) addr == &bpf_fentry_test7 + 5; + return 0; +} From da97553ec6e1ba229f5b90c0e25799ea8afede51 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 21 Jul 2021 23:58:10 +0200 Subject: [PATCH 24/64] libbpf: Export bpf_program__attach_kprobe_opts function Export bpf_program__attach_kprobe_opts as a public API. Rename bpf_program_attach_kprobe_opts to bpf_kprobe_opts and turn it into OPTS struct. 
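With the export, callers can attach with explicit options, roughly like this (a sketch; the probed function name and offset are illustrative):

  DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts,
          .retprobe = false,
          .offset = 0x30,                 /* 0x30 bytes into the function */
  );
  struct bpf_link *link;

  link = bpf_program__attach_kprobe_opts(prog, "do_sys_openat2", &opts);
  if (libbpf_get_error(link))
          return -1;                      /* attach failed */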
Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Alan Maguire Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20210721215810.889975-4-jolsa@kernel.org --- tools/lib/bpf/libbpf.c | 31 +++++++++++++++++-------------- tools/lib/bpf/libbpf.h | 15 +++++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 52f4f1d4f495..e595816b8b76 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10366,25 +10366,28 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, return pfd; } -struct bpf_program_attach_kprobe_opts { - bool retprobe; - unsigned long offset; -}; - -static struct bpf_link* +struct bpf_link * bpf_program__attach_kprobe_opts(struct bpf_program *prog, const char *func_name, - struct bpf_program_attach_kprobe_opts *opts) + struct bpf_kprobe_opts *opts) { char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; + unsigned long offset; + bool retprobe; int pfd, err; - pfd = perf_event_open_probe(false /* uprobe */, opts->retprobe, func_name, - opts->offset, -1 /* pid */); + if (!OPTS_VALID(opts, bpf_kprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + retprobe = OPTS_GET(opts, retprobe, false); + offset = OPTS_GET(opts, offset, 0); + + pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, + offset, -1 /* pid */); if (pfd < 0) { pr_warn("prog '%s': failed to create %s '%s' perf event: %s\n", - prog->name, opts->retprobe ? "kretprobe" : "kprobe", func_name, + prog->name, retprobe ? "kretprobe" : "kprobe", func_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); return libbpf_err_ptr(pfd); } @@ -10393,7 +10396,7 @@ bpf_program__attach_kprobe_opts(struct bpf_program *prog, if (err) { close(pfd); pr_warn("prog '%s': failed to attach to %s '%s': %s\n", - prog->name, opts->retprobe ? "kretprobe" : "kprobe", func_name, + prog->name, retprobe ? 
"kretprobe" : "kprobe", func_name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); return libbpf_err_ptr(err); } @@ -10404,9 +10407,9 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, bool retprobe, const char *func_name) { - struct bpf_program_attach_kprobe_opts opts = { + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, .retprobe = retprobe, - }; + ); return bpf_program__attach_kprobe_opts(prog, func_name, &opts); } @@ -10414,7 +10417,7 @@ struct bpf_link *bpf_program__attach_kprobe(struct bpf_program *prog, static struct bpf_link *attach_kprobe(const struct bpf_sec_def *sec, struct bpf_program *prog) { - struct bpf_program_attach_kprobe_opts opts; + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); unsigned long offset = 0; struct bpf_link *link; const char *func_name; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6b08c1023609..9ec6b7244889 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -104,6 +104,17 @@ struct bpf_object_open_opts { }; #define bpf_object_open_opts__last_field btf_custom_path +struct bpf_kprobe_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* function's offset to install kprobe to */ + unsigned long offset; + /* kprobe is return probe */ + bool retprobe; + size_t :0; +}; +#define bpf_kprobe_opts__last_field retprobe + LIBBPF_API struct bpf_object *bpf_object__open(const char *path); LIBBPF_API struct bpf_object * bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); @@ -250,6 +261,10 @@ LIBBPF_API struct bpf_link * bpf_program__attach_kprobe(struct bpf_program *prog, bool retprobe, const char *func_name); LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe_opts(struct bpf_program *prog, + const char *func_name, + struct bpf_kprobe_opts *opts); +LIBBPF_API struct bpf_link * bpf_program__attach_uprobe(struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, size_t func_offset); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 5bfc10722647..887d372a3f27 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -372,6 +372,7 @@ LIBBPF_0.5.0 { global: bpf_map__initial_value; bpf_map_lookup_and_delete_elem_flags; + bpf_program__attach_kprobe_opts; bpf_object__gen_loader; btf_dump__dump_type_data; libbpf_set_strict_mode; From 525e2f9fd0229eb10cb460a9e6d978257f24804e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:41 -0700 Subject: [PATCH 25/64] tcp: seq_file: Avoid skipping sk during tcp_seek_last_pos st->bucket stores the current bucket number. st->offset stores the offset within this bucket that is the sk to be seq_show(). Thus, st->offset only makes sense within the same st->bucket. These two variables are an optimization for the common no-lseek case. When resuming the seq_file iteration (i.e. seq_start()), tcp_seek_last_pos() tries to continue from the st->offset at bucket st->bucket. However, it is possible that the bucket pointed by st->bucket has changed and st->offset may end up skipping the whole st->bucket without finding a sk. In this case, tcp_seek_last_pos() currently continues to satisfy the offset condition in the next (and incorrect) bucket. Instead, regardless of the offset value, the first sk of the next bucket should be returned. Thus, "bucket == st->bucket" check is added to tcp_seek_last_pos(). The chance of hitting this is small and the issue is a decade old, so targeting for the next tree. 
Fixes: a8b690f98baf ("tcp: Fix slowness in read /proc/net/tcp") Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200541.1033917-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b9dc2d6197be..ee85abde968c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2451,6 +2451,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2461,7 +2462,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_next(seq, NULL); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2472,7 +2473,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } From ad2d61376a0517f19f49fc23de9e12d2b06484fc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:48 -0700 Subject: [PATCH 26/64] tcp: seq_file: Refactor net and family matching This patch refactors the net and family matching into two new helpers, seq_sk_match() and seq_file_family(). seq_file_family() is in the later part of the file to prepare the change of a following patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200548.1034629-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 68 ++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ee85abde968c..f2583c4699fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2277,6 +2277,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ +static unsigned short seq_file_family(const struct seq_file *seq); + +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) +{ + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + /* * Get next listener socket follow cur. 
If cur is NULL, get first socket * starting from bucket given in st->bucket; when st->bucket is zero the @@ -2284,18 +2295,11 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk = cur; - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); - if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; @@ -2311,10 +2315,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) sk = sk_nulls_next(sk); get_sk: sk_nulls_for_each_from(sk, node) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + if (seq_sk_match(seq, sk)) return sk; } spin_unlock(&ilb->lock); @@ -2351,15 +2352,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - void *rc = NULL; - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { @@ -2373,32 +2366,20 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if ((afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family) || - !net_eq(sock_net(sk), net)) { - continue; - } - rc = sk; - goto out; + if (seq_sk_match(seq, sk)) + return sk; } spin_unlock_bh(lock); } -out: - return rc; + + return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); ++st->num; ++st->offset; @@ -2406,9 +2387,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if ((afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) && - net_eq(sock_net(sk), net)) + if (seq_sk_match(seq, sk)) return sk; } @@ -2754,6 +2733,19 @@ static const struct seq_operations bpf_iter_tcp_seq_ops = { .stop = bpf_iter_tcp_seq_stop, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct tcp_iter_state *st = seq->private; + const struct tcp_seq_afinfo *afinfo = st->bpf_seq_afinfo; + + /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ + if (afinfo) + return AF_UNSPEC; + + /* Iterated from proc fs */ + afinfo = PDE_DATA(file_inode(seq->file)); + return afinfo->family; +} static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, From 62001372c2b6cdf2346afb2cf94ed3d950eee64c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:05:54 -0700 Subject: [PATCH 27/64] bpf: tcp: seq_file: Remove bpf_seq_afinfo from tcp_iter_state A following patch will create a separate struct to store extra bpf_iter state and it will embed the existing tcp_iter_state like this: struct bpf_tcp_iter_state { struct tcp_iter_state state; /* More bpf_iter specific states here ... */ } As a prep work, this patch removes the "struct tcp_seq_afinfo *bpf_seq_afinfo" where its purpose is to tell if it is iterating from bpf_iter instead of proc fs. Currently, if "*bpf_seq_afinfo" is not NULL, it is iterating from bpf_iter. The kernel should not filter by the addr family and leave this filtering decision to the bpf prog. Instead of adding a "*bpf_seq_afinfo" pointer, this patch uses the "seq->op == &bpf_iter_tcp_seq_ops" test to tell if it is iterating from the bpf iter. The bpf_iter_(init|fini)_tcp() is left here to prepare for the change of a following patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200554.1034982-1-kafai@fb.com --- include/net/tcp.h | 1 - net/ipv4/tcp_ipv4.c | 25 +++++-------------------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 17df9b047ee4..ba3034123e1d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1959,7 +1959,6 @@ struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; - struct tcp_seq_afinfo *bpf_seq_afinfo; int bucket, offset, sbucket, num; loff_t last_pos; }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f2583c4699fd..665f99d14436 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2735,12 +2735,13 @@ static const struct seq_operations bpf_iter_tcp_seq_ops = { #endif static unsigned short seq_file_family(const struct seq_file *seq) { - const struct tcp_iter_state *st = seq->private; - const struct tcp_seq_afinfo *afinfo = st->bpf_seq_afinfo; + const struct tcp_seq_afinfo *afinfo; +#ifdef CONFIG_BPF_SYSCALL /* Iterated from bpf_iter. Let the bpf prog to filter instead. 
*/ - if (afinfo) + if (seq->op == &bpf_iter_tcp_seq_ops) return AF_UNSPEC; +#endif /* Iterated from proc fs */ afinfo = PDE_DATA(file_inode(seq->file)); @@ -2998,27 +2999,11 @@ DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - struct tcp_iter_state *st = priv_data; - struct tcp_seq_afinfo *afinfo; - int ret; - - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) - return -ENOMEM; - - afinfo->family = AF_UNSPEC; - st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data, aux); - if (ret) - kfree(afinfo); - return ret; + return bpf_iter_init_seq_net(priv_data, aux); } static void bpf_iter_fini_tcp(void *priv_data) { - struct tcp_iter_state *st = priv_data; - - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); } From b72acf4501d7c31e96749f0f5052b3bcb25fc2cb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:00 -0700 Subject: [PATCH 28/64] tcp: seq_file: Add listening_get_first() The current listening_get_next() is overloaded by passing NULL to the 2nd arg, like listening_get_next(seq, NULL), to mean get_first(). This patch moves some logic from the listening_get_next() into a new function listening_get_first(). It will be equivalent to the current established_get_first() and established_get_next() setup. get_first() is to find a non empty bucket and return the first sk. get_next() is to find the next sk of the current bucket and then resorts to get_first() if the current bucket is exhausted. The next patch is to move the listener seq_file iteration from listening_hash (port only) to lhash2 (port+addr). Separating out listening_get_first() from listening_get_next() here will make the following lhash2 changes cleaner and easier to follow. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200600.1035353-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 59 ++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 665f99d14436..48a0a3873c7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2288,10 +2288,38 @@ static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) net_eq(sock_net(sk), seq_file_net(seq))); } -/* - * Get next listener socket follow cur. If cur is NULL, get first socket - * starting from bucket given in st->bucket; when st->bucket is zero the - * very first socket in the hash table is returned. +/* Find a non empty bucket (starting from st->bucket) + * and return the first sk from it. + */ +static void *listening_get_first(struct seq_file *seq) +{ + struct tcp_iter_state *st = seq->private; + + st->offset = 0; + for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + struct sock *sk; + + ilb = &tcp_hashinfo.listening_hash[st->bucket]; + if (hlist_nulls_empty(&ilb->nulls_head)) + continue; + + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + if (seq_sk_match(seq, sk)) + return sk; + } + spin_unlock(&ilb->lock); + } + + return NULL; +} + +/* Find the next sk of "cur" within the same bucket (i.e. st->bucket). + * If "cur" is the last one in the st->bucket, + * call listening_get_first() to return the first sk of the next + * non empty bucket. 
*/ static void *listening_get_next(struct seq_file *seq, void *cur) { @@ -2300,29 +2328,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct hlist_nulls_node *node; struct sock *sk = cur; - if (!sk) { -get_head: - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock(&ilb->lock); - sk = sk_nulls_head(&ilb->nulls_head); - st->offset = 0; - goto get_sk; - } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; ++st->num; ++st->offset; sk = sk_nulls_next(sk); -get_sk: + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } + + ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_unlock(&ilb->lock); - st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) - goto get_head; - return NULL; + ++st->bucket; + return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2332,7 +2351,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) st->bucket = 0; st->offset = 0; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2440,7 +2459,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket >= INET_LHTABLE_SIZE) break; st->state = TCP_SEQ_STATE_LISTENING; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) From 05c0b35709c58b83d4dc515d2ac52e9c0f197d03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:06 -0700 Subject: [PATCH 29/64] tcp: seq_file: Replace listening_hash with lhash2 This patch moves the tcp seq_file iteration on listeners from the port only listening_hash to the port+addr lhash2. When iterating from the bpf iter, the next patch will need to lock the socket such that the bpf iter can call setsockopt (e.g. to change the TCP_CONGESTION). To avoid locking the bucket and then locking the sock, the bpf iter will first batch some sockets from the same bucket and then unlock the bucket. If the bucket size is small (which usually is), it is easier to batch the whole bucket such that it is less likely to miss a setsockopt on a socket due to changes in the bucket. However, the port only listening_hash could have many listeners hashed to a bucket (e.g. many individual VIP(s):443 and also multiple by the number of SO_REUSEPORT). We have seen bucket size in tens of thousands range. Also, the chance of having changes in some popular port buckets (e.g. 443) is also high. The port+addr lhash2 was introduced to solve this large listener bucket issue. Also, the listening_hash usage has already been replaced with lhash2 in the fast path inet[6]_lookup_listener(). This patch follows the same direction on moving to lhash2 and iterates the lhash2 instead of listening_hash. 
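Purely as an illustration (a userspace toy with an arbitrary table size and a fake hash, not kernel code), the following shows why a port-only hash can produce one huge listener bucket while a port+addr hash spreads the same listeners out:

#include <stdio.h>

#define LHASH_SZ 256

static unsigned int hash_port(unsigned short port)
{
	return port % LHASH_SZ;
}

static unsigned int hash_port_addr(unsigned short port, unsigned int addr)
{
	/* stand-in mixing function; the kernel uses jhash-based hashing */
	return (port ^ (addr * 2654435761u)) % LHASH_SZ;
}

int main(void)
{
	unsigned int port_bucket[LHASH_SZ] = {0}, portaddr_bucket[LHASH_SZ] = {0};
	unsigned int i, max_port = 0, max_portaddr = 0;

	/* 10000 distinct VIPs all listening on :443 */
	for (i = 0; i < 10000; i++) {
		port_bucket[hash_port(443)]++;
		portaddr_bucket[hash_port_addr(443, 0x0a000000u + i)]++;
	}

	for (i = 0; i < LHASH_SZ; i++) {
		if (port_bucket[i] > max_port)
			max_port = port_bucket[i];
		if (portaddr_bucket[i] > max_portaddr)
			max_portaddr = portaddr_bucket[i];
	}

	printf("largest port-only bucket: %u\n", max_port);	/* 10000 */
	printf("largest port+addr bucket: %u\n", max_portaddr);	/* far smaller */
	return 0;
}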
Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200606.1035783-1-kafai@fb.com --- include/net/inet_hashtables.h | 6 ++++++ net/ipv4/tcp_ipv4.c | 35 ++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ca6a3ea9057e..f72ec113ae56 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -160,6 +160,12 @@ struct inet_hashinfo { ____cacheline_aligned_in_smp; }; +#define inet_lhash2_for_each_icsk_continue(__icsk) \ + hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node) + +#define inet_lhash2_for_each_icsk(__icsk, list) \ + hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node) + #define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 48a0a3873c7a..d38b4379dca4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2296,21 +2296,22 @@ static void *listening_get_first(struct seq_file *seq) struct tcp_iter_state *st = seq->private; st->offset = 0; - for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; + for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; struct sock *sk; - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - if (hlist_nulls_empty(&ilb->nulls_head)) + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + if (hlist_empty(&ilb2->head)) continue; - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { + spin_lock(&ilb2->lock); + inet_lhash2_for_each_icsk(icsk, &ilb2->head) { + sk = (struct sock *)icsk; if (seq_sk_match(seq, sk)) return sk; } - spin_unlock(&ilb->lock); + spin_unlock(&ilb2->lock); } return NULL; @@ -2324,22 +2325,22 @@ static void *listening_get_first(struct seq_file *seq) static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; struct sock *sk = cur; ++st->num; ++st->offset; - sk = sk_nulls_next(sk); - - sk_nulls_for_each_from(sk, node) { + icsk = inet_csk(sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; if (seq_sk_match(seq, sk)) return sk; } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_unlock(&ilb->lock); + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + spin_unlock(&ilb2->lock); ++st->bucket; return listening_get_first(seq); } @@ -2456,7 +2457,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) switch (st->state) { case TCP_SEQ_STATE_LISTENING: - if (st->bucket >= INET_LHTABLE_SIZE) + if (st->bucket > tcp_hashinfo.lhash2_mask) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_first(seq); @@ -2541,7 +2542,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) From 04c7820b776f1c4b48698574c47de9e940d368e8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:13 -0700 Subject: [PATCH 
30/64] bpf: tcp: Bpf iter batching and lock_sock This patch does batching and lock_sock for the bpf tcp iter. It does not affect the proc fs iteration. With bpf-tcp-cc, new algo rollout happens more often. Instead of restarting the application to pick up the new tcp-cc, the next patch will allow bpf iter to do setsockopt(TCP_CONGESTION). This requires locking the sock. Also, unlike the proc iteration (cat /proc/net/tcp[6]), the bpf iter can inspect all fields of a tcp_sock. It will be useful to have a consistent view on some of the fields (e.g. the ones reported in tcp_get_info() that also acquires the sock lock). Double lock: locking the bucket first and then locking the sock could lead to deadlock. This patch takes a batching approach similar to inet_diag. While holding the bucket lock, it batch a number of sockets into an array first and then unlock the bucket. Before doing show(), it then calls lock_sock_fast(). In a machine with ~400k connections, the maximum number of sk in a bucket of the established hashtable is 7. 0.02% of the established connections fall into this bucket size. For listen hash (port+addr lhash2), the bucket is usually very small also except for the SO_REUSEPORT use case which the userspace could have one SO_REUSEPORT socket per thread. While batching is used, it can also minimize the chance of missing sock in the setsockopt use case if the whole bucket is batched. This patch will start with a batch array with INIT_BATCH_SZ (16) which will be enough for the most common cases. bpf_iter_tcp_batch() will try to realloc to a larger array to handle exception case (e.g. the SO_REUSEPORT case in the lhash2). Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200613.1036157-1-kafai@fb.com --- net/ipv4/tcp_ipv4.c | 237 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 231 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d38b4379dca4..84ac0135d389 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2687,6 +2687,15 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) } #ifdef CONFIG_BPF_SYSCALL +struct bpf_tcp_iter_state { + struct tcp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); @@ -2705,16 +2714,204 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_tcp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct inet_connection_sock *icsk; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + 
iter->batch[iter->end_sk++] = start_sk; + + icsk = inet_csk(start_sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + + return expected; +} + +static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct hlist_nulls_node *node; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + + return expected; +} + +static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + unsigned int expected; + bool resized = false; + struct sock *sk; + + /* The st->bucket is done. Directly advance to the next + * bucket instead of having the tcp_seek_last_pos() to skip + * one by one in the current bucket and eventually find out + * it has to advance to the next bucket. + */ + if (iter->st_bucket_done) { + st->offset = 0; + st->bucket++; + if (st->state == TCP_SEQ_STATE_LISTENING && + st->bucket > tcp_hashinfo.lhash2_mask) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + } + } + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + + sk = tcp_seek_last_pos(seq); + if (!sk) + return NULL; /* Done */ + + if (st->state == TCP_SEQ_STATE_LISTENING) + expected = bpf_iter_tcp_listening_batch(seq, sk); + else + expected = bpf_iter_tcp_established_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + if (*pos) + return bpf_iter_tcp_batch(seq); + + return SEQ_START_TOKEN; +} + +static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) { + /* Keeping st->num consistent in tcp_iter_state. + * bpf_iter_tcp does not use st->num. + * meta.seq_num is used instead. + */ + st->num++; + /* Move st->offset to the next sk in the bucket such that + * the future start() will resume at st->offset in + * st->bucket. See tcp_seek_last_pos(). + */ + st->offset++; + sock_put(iter->batch[iter->cur_sk++]); + } + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_tcp_batch(seq); + + ++*pos; + /* Keeping st->last_pos consistent in tcp_iter_state. + * bpf iter does not do lseek, so st->last_pos always equals to *pos. 
+ */ + st->last_pos = *pos; + return sk; +} + static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; + bool slow; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + if (sk_fullsock(sk)) + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { @@ -2728,11 +2925,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return tcp_prog_seq_show(prog, &meta, v, uid); + ret = tcp_prog_seq_show(prog, &meta, v, uid); + +unlock: + if (sk_fullsock(sk)) + unlock_sock_fast(sk, slow); + return ret; + } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -2743,13 +2947,16 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) (void)tcp_prog_seq_show(prog, &meta, v, 0); } - tcp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_tcp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, - .start = tcp_seq_start, - .next = tcp_seq_next, + .start = bpf_iter_tcp_seq_start, + .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif @@ -3017,21 +3224,39 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) +#define INIT_BATCH_SZ 16 + static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - return bpf_iter_init_seq_net(priv_data, aux); + struct bpf_tcp_iter_state *iter = priv_data; + int err; + + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; + + err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; } static void bpf_iter_fini_tcp(void *priv_data) { + struct bpf_tcp_iter_state *iter = priv_data; + bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, - .seq_priv_size = sizeof(struct tcp_iter_state), + .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; static struct bpf_iter_reg tcp_reg_info = { From 3cee6fb8e69ecd79be891c89a94974c48a25a437 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:19 -0700 Subject: [PATCH 31/64] bpf: tcp: Support bpf_(get|set)sockopt in bpf tcp iter This patch allows bpf tcp iter to call bpf_(get|set)sockopt. To allow a specific bpf iter (tcp here) to call a set of helpers, get_func_proto function pointer is added to bpf_iter_reg. The bpf iter is a tracing prog which currently requires CAP_PERFMON or CAP_SYS_ADMIN, so this patch does not impose other capability checks for bpf_(get|set)sockopt. 
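As a hedged sketch (hypothetical, not part of this series), another iterator target could opt in to the same helpers through the new get_func_proto callback in the same way; only the tcp iter is wired up here:

static const struct bpf_func_proto *
bpf_iter_udp_get_func_proto(enum bpf_func_id func_id,
			    const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		/* NULL falls back to the generic tracing protos */
		return NULL;
	}
}

static struct bpf_iter_reg udp_reg_info = {
	.target		= "udp",
	/* ctx_arg_info, seq_info, etc. as for any other iter target */
	.get_func_proto	= bpf_iter_udp_get_func_proto,
};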
Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200619.1036715-1-kafai@fb.com --- include/linux/bpf.h | 8 ++++++++ kernel/bpf/bpf_iter.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 7 ++++++- net/core/filter.c | 34 ++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 15 +++++++++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 978ebd16ae60..c8cc09013210 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1442,6 +1442,9 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, struct seq_file *seq); typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +typedef const struct bpf_func_proto * +(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id, + const struct bpf_prog *prog); enum bpf_iter_feature { BPF_ITER_RESCHED = BIT(0), @@ -1454,6 +1457,7 @@ struct bpf_iter_reg { bpf_iter_detach_target_t detach_target; bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; + bpf_iter_get_func_proto_t get_func_proto; u32 ctx_arg_info_size; u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; @@ -1476,6 +1480,8 @@ struct bpf_iter__bpf_map_elem { int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog); int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); @@ -2050,6 +2056,8 @@ extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_proto; +extern const struct bpf_func_proto bpf_sk_getsockopt_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 2d4fbdbb194e..2e9d47bb40ff 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } +const struct bpf_func_proto * +bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_iter_target_info *tinfo; + const struct bpf_func_proto *fn = NULL; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog->aux->attach_btf_id) { + const struct bpf_iter_reg *reg_info; + + reg_info = tinfo->reg_info; + if (reg_info->get_func_proto) + fn = reg_info->get_func_proto(func_id, prog); + break; + } + } + mutex_unlock(&targets_mutex); + + return fn; +} + static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1f22ce1fa971..c5e0b6a64091 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1461,6 +1461,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct 
bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *fn; + switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: @@ -1501,7 +1503,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; default: - return raw_tp_prog_func_proto(func_id, prog); + fn = raw_tp_prog_func_proto(func_id, prog); + if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) + fn = bpf_iter_get_func_proto(func_id, prog); + return fn; } } diff --git a/net/core/filter.c b/net/core/filter.c index 3b4986e96e9c..faf29fd82276 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5016,6 +5016,40 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, return -EINVAL; } +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84ac0135d389..f9c6e47141fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3259,6 +3259,20 @@ static const struct bpf_iter_seq_info tcp_seq_info = { .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, @@ -3266,6 +3280,7 @@ static struct bpf_iter_reg tcp_reg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; From eed92afdd14c26b1e319fbe9eaa80e62ffa97a2c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Jul 2021 13:06:25 -0700 Subject: [PATCH 32/64] bpf: selftest: Test batching and bpf_(get|set)sockopt in bpf tcp iter This patch adds tests for the batching and bpf_(get|set)sockopt in bpf tcp iter. It first creates: a) 1 non SO_REUSEPORT listener in lhash2. b) 256 passive and active fds connected to the listener in (a). c) 256 SO_REUSEPORT listeners in one of the lhash2 bucket. The test sets all listeners and connections to bpf_cubic before running the bpf iter. The bpf iter then calls setsockopt(TCP_CONGESTION) to switch each listener and connection from bpf_cubic to bpf_dctcp. 
The bpf iter has a random_retry mode such that it can return EAGAIN to the usespace in the middle of a batch. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210701200625.1036874-1-kafai@fb.com --- tools/testing/selftests/bpf/network_helpers.c | 85 ++++++- tools/testing/selftests/bpf/network_helpers.h | 4 + .../bpf/prog_tests/bpf_iter_setsockopt.c | 226 ++++++++++++++++++ .../selftests/bpf/progs/bpf_iter_setsockopt.c | 72 ++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 6 + 5 files changed, 384 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 2060bc122c53..26468a8f44f3 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -66,17 +66,13 @@ int settimeo(int fd, int timeout_ms) #define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) -int start_server(int family, int type, const char *addr_str, __u16 port, - int timeout_ms) +static int __start_server(int type, const struct sockaddr *addr, + socklen_t addrlen, int timeout_ms, bool reuseport) { - struct sockaddr_storage addr = {}; - socklen_t len; + int on = 1; int fd; - if (make_sockaddr(family, addr_str, port, &addr, &len)) - return -1; - - fd = socket(family, type, 0); + fd = socket(addr->sa_family, type, 0); if (fd < 0) { log_err("Failed to create server socket"); return -1; @@ -85,7 +81,13 @@ int start_server(int family, int type, const char *addr_str, __u16 port, if (settimeo(fd, timeout_ms)) goto error_close; - if (bind(fd, (const struct sockaddr *)&addr, len) < 0) { + if (reuseport && + setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) { + log_err("Failed to set SO_REUSEPORT"); + return -1; + } + + if (bind(fd, addr, addrlen) < 0) { log_err("Failed to bind socket"); goto error_close; } @@ -104,6 +106,69 @@ int start_server(int family, int type, const char *addr_str, __u16 port, return -1; } +int start_server(int family, int type, const char *addr_str, __u16 port, + int timeout_ms) +{ + struct sockaddr_storage addr; + socklen_t addrlen; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return -1; + + return __start_server(type, (struct sockaddr *)&addr, + addrlen, timeout_ms, false); +} + +int *start_reuseport_server(int family, int type, const char *addr_str, + __u16 port, int timeout_ms, unsigned int nr_listens) +{ + struct sockaddr_storage addr; + unsigned int nr_fds = 0; + socklen_t addrlen; + int *fds; + + if (!nr_listens) + return NULL; + + if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) + return NULL; + + fds = malloc(sizeof(*fds) * nr_listens); + if (!fds) + return NULL; + + fds[0] = __start_server(type, (struct sockaddr *)&addr, addrlen, + timeout_ms, true); + if (fds[0] == -1) + goto close_fds; + nr_fds = 1; + + if (getsockname(fds[0], (struct sockaddr *)&addr, &addrlen)) + goto close_fds; + + for (; nr_fds < nr_listens; nr_fds++) { + fds[nr_fds] = __start_server(type, (struct sockaddr *)&addr, + addrlen, timeout_ms, true); + if (fds[nr_fds] == -1) + goto close_fds; + } + + return fds; + +close_fds: + free_fds(fds, nr_fds); + return NULL; +} + +void free_fds(int *fds, unsigned int nr_close_fds) +{ + if (fds) { + while (nr_close_fds) + 
close(fds[--nr_close_fds]); + free(fds); + } +} + int fastopen_connect(int server_fd, const char *data, unsigned int data_len, int timeout_ms) { @@ -217,6 +282,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, if (family == AF_INET) { struct sockaddr_in *sin = (void *)addr; + memset(addr, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_port = htons(port); if (addr_str && @@ -230,6 +296,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, } else if (family == AF_INET6) { struct sockaddr_in6 *sin6 = (void *)addr; + memset(addr, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_port = htons(port); if (addr_str && diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 5e0d51c07b63..d60bc2897770 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -36,6 +36,10 @@ extern struct ipv6_packet pkt_v6; int settimeo(int fd, int timeout_ms); int start_server(int family, int type, const char *addr, __u16 port, int timeout_ms); +int *start_reuseport_server(int family, int type, const char *addr_str, + __u16 port, int timeout_ms, + unsigned int nr_listens); +void free_fds(int *fds, unsigned int nr_close_fds); int connect_to_fd(int server_fd, int timeout_ms); int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); int fastopen_connect(int server_fd, const char *data, unsigned int data_len, diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c new file mode 100644 index 000000000000..85babb0487b3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter_setsockopt.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include +#include +#include "network_helpers.h" +#include "bpf_dctcp.skel.h" +#include "bpf_cubic.skel.h" +#include "bpf_iter_setsockopt.skel.h" + +static int create_netns(void) +{ + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) + return -1; + + if (!ASSERT_OK(system("ip link set dev lo up"), "bring up lo")) + return -1; + + return 0; +} + +static unsigned int set_bpf_cubic(int *fds, unsigned int nr_fds) +{ + unsigned int i; + + for (i = 0; i < nr_fds; i++) { + if (setsockopt(fds[i], SOL_TCP, TCP_CONGESTION, "bpf_cubic", + sizeof("bpf_cubic"))) + return i; + } + + return nr_fds; +} + +static unsigned int check_bpf_dctcp(int *fds, unsigned int nr_fds) +{ + char tcp_cc[16]; + socklen_t optlen = sizeof(tcp_cc); + unsigned int i; + + for (i = 0; i < nr_fds; i++) { + if (getsockopt(fds[i], SOL_TCP, TCP_CONGESTION, + tcp_cc, &optlen) || + strcmp(tcp_cc, "bpf_dctcp")) + return i; + } + + return nr_fds; +} + +static int *make_established(int listen_fd, unsigned int nr_est, + int **paccepted_fds) +{ + int *est_fds, *accepted_fds; + unsigned int i; + + est_fds = malloc(sizeof(*est_fds) * nr_est); + if (!est_fds) + return NULL; + + accepted_fds = malloc(sizeof(*accepted_fds) * nr_est); + if (!accepted_fds) { + free(est_fds); + return NULL; + } + + for (i = 0; i < nr_est; i++) { + est_fds[i] = connect_to_fd(listen_fd, 0); + if (est_fds[i] == -1) + break; + if (set_bpf_cubic(&est_fds[i], 1) != 1) { + close(est_fds[i]); + break; + } + + accepted_fds[i] = accept(listen_fd, NULL, 0); + if (accepted_fds[i] == -1) { + close(est_fds[i]); + break; + } + } + + if (!ASSERT_EQ(i, nr_est, "create established fds")) { + free_fds(accepted_fds, i); + free_fds(est_fds, i); + return 
NULL; + } + + *paccepted_fds = accepted_fds; + return est_fds; +} + +static unsigned short get_local_port(int fd) +{ + struct sockaddr_in6 addr; + socklen_t addrlen = sizeof(addr); + + if (!getsockname(fd, &addr, &addrlen)) + return ntohs(addr.sin6_port); + + return 0; +} + +static void do_bpf_iter_setsockopt(struct bpf_iter_setsockopt *iter_skel, + bool random_retry) +{ + int *reuse_listen_fds = NULL, *accepted_fds = NULL, *est_fds = NULL; + unsigned int nr_reuse_listens = 256, nr_est = 256; + int err, iter_fd = -1, listen_fd = -1; + char buf; + + /* Prepare non-reuseport listen_fd */ + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(listen_fd, 0, "start_server")) + return; + if (!ASSERT_EQ(set_bpf_cubic(&listen_fd, 1), 1, + "set listen_fd to cubic")) + goto done; + iter_skel->bss->listen_hport = get_local_port(listen_fd); + if (!ASSERT_NEQ(iter_skel->bss->listen_hport, 0, + "get_local_port(listen_fd)")) + goto done; + + /* Connect to non-reuseport listen_fd */ + est_fds = make_established(listen_fd, nr_est, &accepted_fds); + if (!ASSERT_OK_PTR(est_fds, "create established")) + goto done; + + /* Prepare reuseport listen fds */ + reuse_listen_fds = start_reuseport_server(AF_INET6, SOCK_STREAM, + "::1", 0, 0, + nr_reuse_listens); + if (!ASSERT_OK_PTR(reuse_listen_fds, "start_reuseport_server")) + goto done; + if (!ASSERT_EQ(set_bpf_cubic(reuse_listen_fds, nr_reuse_listens), + nr_reuse_listens, "set reuse_listen_fds to cubic")) + goto done; + iter_skel->bss->reuse_listen_hport = get_local_port(reuse_listen_fds[0]); + if (!ASSERT_NEQ(iter_skel->bss->reuse_listen_hport, 0, + "get_local_port(reuse_listen_fds[0])")) + goto done; + + /* Run bpf tcp iter to switch from bpf_cubic to bpf_dctcp */ + iter_skel->bss->random_retry = random_retry; + iter_fd = bpf_iter_create(bpf_link__fd(iter_skel->links.change_tcp_cc)); + if (!ASSERT_GE(iter_fd, 0, "create iter_fd")) + goto done; + + while ((err = read(iter_fd, &buf, sizeof(buf))) == -1 && + errno == EAGAIN) + ; + if (!ASSERT_OK(err, "read iter error")) + goto done; + + /* Check reuseport listen fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(reuse_listen_fds, nr_reuse_listens), + nr_reuse_listens, + "check reuse_listen_fds dctcp"); + + /* Check non reuseport listen fd for dctcp */ + ASSERT_EQ(check_bpf_dctcp(&listen_fd, 1), 1, + "check listen_fd dctcp"); + + /* Check established fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(est_fds, nr_est), nr_est, + "check est_fds dctcp"); + + /* Check accepted fds for dctcp */ + ASSERT_EQ(check_bpf_dctcp(accepted_fds, nr_est), nr_est, + "check accepted_fds dctcp"); + +done: + if (iter_fd != -1) + close(iter_fd); + if (listen_fd != -1) + close(listen_fd); + free_fds(reuse_listen_fds, nr_reuse_listens); + free_fds(accepted_fds, nr_est); + free_fds(est_fds, nr_est); +} + +void test_bpf_iter_setsockopt(void) +{ + struct bpf_iter_setsockopt *iter_skel = NULL; + struct bpf_cubic *cubic_skel = NULL; + struct bpf_dctcp *dctcp_skel = NULL; + struct bpf_link *cubic_link = NULL; + struct bpf_link *dctcp_link = NULL; + + if (create_netns()) + return; + + /* Load iter_skel */ + iter_skel = bpf_iter_setsockopt__open_and_load(); + if (!ASSERT_OK_PTR(iter_skel, "iter_skel")) + return; + iter_skel->links.change_tcp_cc = bpf_program__attach_iter(iter_skel->progs.change_tcp_cc, NULL); + if (!ASSERT_OK_PTR(iter_skel->links.change_tcp_cc, "attach iter")) + goto done; + + /* Load bpf_cubic */ + cubic_skel = bpf_cubic__open_and_load(); + if (!ASSERT_OK_PTR(cubic_skel, "cubic_skel")) + goto done; + cubic_link = 
bpf_map__attach_struct_ops(cubic_skel->maps.cubic); + if (!ASSERT_OK_PTR(cubic_link, "cubic_link")) + goto done; + + /* Load bpf_dctcp */ + dctcp_skel = bpf_dctcp__open_and_load(); + if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) + goto done; + dctcp_link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp); + if (!ASSERT_OK_PTR(dctcp_link, "dctcp_link")) + goto done; + + do_bpf_iter_setsockopt(iter_skel, true); + do_bpf_iter_setsockopt(iter_skel, false); + +done: + bpf_link__destroy(cubic_link); + bpf_link__destroy(dctcp_link); + bpf_cubic__destroy(cubic_skel); + bpf_dctcp__destroy(dctcp_skel); + bpf_iter_setsockopt__destroy(iter_skel); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c new file mode 100644 index 000000000000..b77adfd55d73 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_setsockopt.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include +#include + +#define bpf_tcp_sk(skc) ({ \ + struct sock_common *_skc = skc; \ + sk = NULL; \ + tp = NULL; \ + if (_skc) { \ + tp = bpf_skc_to_tcp_sock(_skc); \ + sk = (struct sock *)tp; \ + } \ + tp; \ +}) + +unsigned short reuse_listen_hport = 0; +unsigned short listen_hport = 0; +char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic"; +char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp"; +bool random_retry = false; + +static bool tcp_cc_eq(const char *a, const char *b) +{ + int i; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (a[i] != b[i]) + return false; + if (!a[i]) + break; + } + + return true; +} + +SEC("iter/tcp") +int change_tcp_cc(struct bpf_iter__tcp *ctx) +{ + char cur_cc[TCP_CA_NAME_MAX]; + struct tcp_sock *tp; + struct sock *sk; + int ret; + + if (!bpf_tcp_sk(ctx->sk_common)) + return 0; + + if (sk->sk_family != AF_INET6 || + (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_ESTABLISHED) || + (sk->sk_num != reuse_listen_hport && + sk->sk_num != listen_hport && + bpf_ntohs(sk->sk_dport) != listen_hport)) + return 0; + + if (bpf_getsockopt(tp, SOL_TCP, TCP_CONGESTION, + cur_cc, sizeof(cur_cc))) + return 0; + + if (!tcp_cc_eq(cur_cc, cubic_cc)) + return 0; + + if (random_retry && bpf_get_prandom_u32() % 4 == 1) + return 1; + + bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, dctcp_cc, sizeof(dctcp_cc)); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 01378911252b..3af0998a0623 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -5,6 +5,10 @@ #define AF_INET 2 #define AF_INET6 10 +#define SOL_TCP 6 +#define TCP_CONGESTION 13 +#define TCP_CA_NAME_MAX 16 + #define ICSK_TIME_RETRANS 1 #define ICSK_TIME_PROBE0 3 #define ICSK_TIME_LOSS_PROBE 5 @@ -32,6 +36,8 @@ #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr +#define sk_num __sk_common.skc_num +#define sk_dport __sk_common.skc_dport #define sk_family __sk_common.skc_family #define sk_rmem_alloc sk_backlog.rmem_alloc #define sk_refcnt __sk_common.skc_refcnt From e244d34d0ea1aebf60e83ee6d1701a81448f31c1 Mon Sep 17 00:00:00 2001 From: Evgeniy Litvinenko Date: Fri, 23 Jul 2021 15:15:11 -0700 Subject: [PATCH 33/64] libbpf: Add bpf_map__pin_path function Add bpf_map__pin_path, so that the inconsistently named bpf_map__get_pin_path can be deprecated 
later. This is part of the effort towards libbpf v1.0: https://github.com/libbpf/libbpf/issues/307 Also, add a selftest for the new function. Signed-off-by: Evgeniy Litvinenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210723221511.803683-1-evgeniyl@fb.com --- tools/lib/bpf/libbpf.c | 5 +++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + tools/testing/selftests/bpf/prog_tests/pinning.c | 9 +++++++++ 4 files changed, 16 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e595816b8b76..a53ca29b44ab 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8511,6 +8511,11 @@ const char *bpf_map__get_pin_path(const struct bpf_map *map) return map->pin_path; } +const char *bpf_map__pin_path(const struct bpf_map *map) +{ + return map->pin_path; +} + bool bpf_map__is_pinned(const struct bpf_map *map) { return map->pinned; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 9ec6b7244889..1271d99bb7aa 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -499,6 +499,7 @@ LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); +LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 887d372a3f27..c240d488eb5e 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -371,6 +371,7 @@ LIBBPF_0.4.0 { LIBBPF_0.5.0 { global: bpf_map__initial_value; + bpf_map__pin_path; bpf_map_lookup_and_delete_elem_flags; bpf_program__attach_kprobe_opts; bpf_object__gen_loader; diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c index fcf54b3a1dd0..d4b953ae3407 100644 --- a/tools/testing/selftests/bpf/prog_tests/pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/pinning.c @@ -125,6 +125,10 @@ void test_pinning(void) if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno)) goto out; + /* get pinning path */ + if (!ASSERT_STREQ(bpf_map__pin_path(map), pinpath, "get pin path")) + goto out; + /* set pinning path of other map and re-pin all */ map = bpf_object__find_map_by_name(obj, "nopinmap"); if (CHECK(!map, "find map", "NULL map")) @@ -134,6 +138,11 @@ void test_pinning(void) if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno)) goto out; + /* get pinning path after set */ + if (!ASSERT_STREQ(bpf_map__pin_path(map), custpinpath, + "get pin path after set")) + goto out; + /* should only pin the one unpinned map */ err = bpf_object__pin_maps(obj, NULL); if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno)) From 7a18844223d40b684e2f24a02741a1bd53048218 Mon Sep 17 00:00:00 2001 From: Evgeniy Litvinenko Date: Fri, 23 Jul 2021 15:36:45 -0700 Subject: [PATCH 34/64] selftests/bpf: Document vmtest.sh dependencies Add a list of vmtest script dependencies to make it easier for new contributors to get going. 
Signed-off-by: Evgeniy Litvinenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210723223645.907802-1-evgeniyl@fb.com --- tools/testing/selftests/bpf/README.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 8deec1ca9150..9b17f2867488 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -19,6 +19,13 @@ the CI. It builds the kernel (without overwriting your existing Kconfig), recomp bpf selftests, runs them (by default ``tools/testing/selftests/bpf/test_progs``) and saves the resulting output (by default in ``~/.bpf_selftests``). +Script dependencies: +- clang (preferably built from sources, https://github.com/llvm/llvm-project); +- pahole (preferably built from sources, https://git.kernel.org/pub/scm/devel/pahole/pahole.git/); +- qemu; +- docutils (for ``rst2man``); +- libcap-devel. + For more information on about using the script, run: .. code-block:: console From ae7f47041d928b1a2f28717d095b4153c63cbf6a Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Wed, 21 Jul 2021 12:40:58 +0200 Subject: [PATCH 35/64] bpf/tests: Fix copy-and-paste error in double word test This test now operates on DW as stated instead of W, which was already covered by another test. Signed-off-by: Johan Almbladh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721104058.3755254-1-johan.almbladh@anyfinetworks.com --- lib/test_bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index d500320778c7..1c5299cb3f19 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4286,8 +4286,8 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_LD_IMM64(R0, 0), BPF_LD_IMM64(R1, 0xffffffffffffffffLL), - BPF_STX_MEM(BPF_W, R10, R1, -40), - BPF_LDX_MEM(BPF_W, R0, R10, -40), + BPF_STX_MEM(BPF_DW, R10, R1, -40), + BPF_LDX_MEM(BPF_DW, R0, R10, -40), BPF_EXIT_INSN(), }, INTERNAL, From 2b7e9f25e590726cca76700ebdb10e92a7a72ca1 Mon Sep 17 00:00:00 2001 From: Johan Almbladh Date: Wed, 21 Jul 2021 12:38:22 +0200 Subject: [PATCH 36/64] bpf/tests: Do not PASS tests without actually testing the result Each test case can have a set of sub-tests, where each sub-test can run the cBPF/eBPF test snippet with its own data_size and expected result. Before, the end of the sub-test array was indicated by both data_size and result being zero. However, most or all of the internal eBPF tests has a data_size of zero already. When such a test also had an expected value of zero, the test was never run but reported as PASS anyway. Now the test runner always runs the first sub-test, regardless of the data_size and result values. The sub-test array zero-termination only applies for any additional sub-tests. There are other ways fix it of course, but this solution at least removes the surprise of eBPF tests with a zero result always succeeding. 
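To make the convention concrete, here is a hedged sketch of a classic BPF test case with two sub-tests; the field layout follows the existing struct bpf_test entries, and the remaining zero-initialized .test[] slots terminate the sub-test list (while the first sub-test is now always run, even if it were all-zero):

	{
		"Sketch: two sub-tests, zero-terminated",
		.u.insns = {
			BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 0),
			BPF_STMT(BPF_RET | BPF_A, 0),
		},
		CLASSIC,
		{ 0x11, 0x22, 0x33, 0x44 },
		{
			{ 1, 0x11 },	/* 1 byte of data, expect 0x11 */
			{ 4, 0x11 },	/* 4 bytes of data, expect 0x11 */
			/* remaining zero entries end the sub-test array */
		},
	},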
Signed-off-by: Johan Almbladh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721103822.3755111-1-johan.almbladh@anyfinetworks.com --- lib/test_bpf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 1c5299cb3f19..f6d5d30d01bf 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6659,7 +6659,14 @@ static int run_one(const struct bpf_prog *fp, struct bpf_test *test) u64 duration; u32 ret; - if (test->test[i].data_size == 0 && + /* + * NOTE: Several sub-tests may be present, in which case + * a zero {data_size, result} tuple indicates the end of + * the sub-test array. The first test is always run, + * even if both data_size and result happen to be zero. + */ + if (i > 0 && + test->test[i].data_size == 0 && test->test[i].result == 0) break; From 6e43b28607848eeb079c033f415b410788569b27 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jul 2021 17:08:19 -0700 Subject: [PATCH 37/64] libbpf: Cleanup the layering between CORE and bpf_program. CO-RE processing functions don't need to know 'struct bpf_program' details. Cleanup the layering to eventually be able to move CO-RE logic into a separate file. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721000822.40958-2-alexei.starovoitov@gmail.com --- tools/lib/bpf/libbpf.c | 74 ++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a53ca29b44ab..66e9c9e55444 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5611,7 +5611,7 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, return 1; } -static int bpf_core_calc_field_relo(const struct bpf_program *prog, +static int bpf_core_calc_field_relo(const char *prog_name, const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, __u32 *val, __u32 *field_sz, __u32 *type_id, @@ -5655,7 +5655,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, *val = sz; } else { pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", - prog->name, relo->kind, relo->insn_off / 8); + prog_name, relo->kind, relo->insn_off / 8); return -EINVAL; } if (validate) @@ -5677,7 +5677,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog, if (byte_sz >= 8) { /* bitfield can't be read with 64-bit read */ pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", - prog->name, relo->kind, relo->insn_off / 8); + prog_name, relo->kind, relo->insn_off / 8); return -E2BIG; } byte_sz *= 2; @@ -5827,7 +5827,7 @@ struct bpf_core_relo_res * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. * If instruction has to be poisoned, *poison will be set to true. 
*/ -static int bpf_core_calc_relo(const struct bpf_program *prog, +static int bpf_core_calc_relo(const char *prog_name, const struct bpf_core_relo *relo, int relo_idx, const struct bpf_core_spec *local_spec, @@ -5845,10 +5845,10 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, res->orig_type_id = res->new_type_id = 0; if (core_relo_is_field_based(relo->kind)) { - err = bpf_core_calc_field_relo(prog, relo, local_spec, + err = bpf_core_calc_field_relo(prog_name, relo, local_spec, &res->orig_val, &res->orig_sz, &res->orig_type_id, &res->validate); - err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, + err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, &res->new_val, &res->new_sz, &res->new_type_id, NULL); if (err) @@ -5906,7 +5906,7 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, } else if (err == -EOPNOTSUPP) { /* EOPNOTSUPP means unknown/unsupported relocation */ pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", - prog->name, relo_idx, core_relo_kind_str(relo->kind), + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind, relo->insn_off / 8); } @@ -5917,11 +5917,11 @@ static int bpf_core_calc_relo(const struct bpf_program *prog, * Turn instruction for which CO_RE relocation failed into invalid one with * distinct signature. */ -static void bpf_core_poison_insn(struct bpf_program *prog, int relo_idx, +static void bpf_core_poison_insn(const char *prog_name, int relo_idx, int insn_idx, struct bpf_insn *insn) { pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", - prog->name, relo_idx, insn_idx); + prog_name, relo_idx, insn_idx); insn->code = BPF_JMP | BPF_CALL; insn->dst_reg = 0; insn->src_reg = 0; @@ -5977,6 +5977,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog, int relo_idx, const struct bpf_core_relo_res *res) { + const char *prog_name = prog->name; __u32 orig_val, new_val; struct bpf_insn *insn; int insn_idx; @@ -5999,8 +6000,8 @@ static int bpf_core_patch_insn(struct bpf_program *prog, * verifier about "unknown opcode 00" */ if (is_ldimm64_insn(insn)) - bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1); - bpf_core_poison_insn(prog, relo_idx, insn_idx, insn); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); return 0; } @@ -6014,14 +6015,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, return -EINVAL; if (res->validate && insn->imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", - prog->name, relo_idx, + prog_name, relo_idx, insn_idx, insn->imm, orig_val, new_val); return -EINVAL; } orig_val = insn->imm; insn->imm = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", - prog->name, relo_idx, insn_idx, + prog_name, relo_idx, insn_idx, orig_val, new_val); break; case BPF_LDX: @@ -6029,25 +6030,25 @@ static int bpf_core_patch_insn(struct bpf_program *prog, case BPF_STX: if (res->validate && insn->off != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - prog->name, relo_idx, insn_idx, insn->off, orig_val, new_val); + prog_name, relo_idx, insn_idx, insn->off, orig_val, new_val); return -EINVAL; } if (new_val > SHRT_MAX) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - prog->name, relo_idx, insn_idx, new_val); + prog_name, relo_idx, insn_idx, new_val); return -ERANGE; } if 
(res->fail_memsz_adjust) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. " "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", - prog->name, relo_idx, insn_idx); + prog_name, relo_idx, insn_idx); goto poison; } orig_val = insn->off; insn->off = new_val; pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - prog->name, relo_idx, insn_idx, orig_val, new_val); + prog_name, relo_idx, insn_idx, orig_val, new_val); if (res->new_sz != res->orig_sz) { int insn_bytes_sz, insn_bpf_sz; @@ -6055,20 +6056,20 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn_bytes_sz = insn_bpf_size_to_bytes(insn); if (insn_bytes_sz != res->orig_sz) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", - prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); return -EINVAL; } insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); if (insn_bpf_sz < 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", - prog->name, relo_idx, insn_idx, res->new_sz); + prog_name, relo_idx, insn_idx, res->new_sz); return -EINVAL; } insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", - prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); } break; case BPF_LD: { @@ -6080,14 +6081,14 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", - prog->name, relo_idx, insn_idx); + prog_name, relo_idx, insn_idx); return -EINVAL; } imm = insn[0].imm + ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", - prog->name, relo_idx, + prog_name, relo_idx, insn_idx, (unsigned long long)imm, orig_val, new_val); return -EINVAL; @@ -6096,13 +6097,13 @@ static int bpf_core_patch_insn(struct bpf_program *prog, insn[0].imm = new_val; insn[1].imm = 0; /* currently only 32-bit values are supported */ pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", - prog->name, relo_idx, insn_idx, + prog_name, relo_idx, insn_idx, (unsigned long long)imm, new_val); break; } default: pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", - prog->name, relo_idx, insn_idx, insn->code, + prog_name, relo_idx, insn_idx, insn->code, insn->src_reg, insn->dst_reg, insn->off, insn->imm); return -EINVAL; } @@ -6238,6 +6239,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct btf_type *local_type; const char *local_name; struct core_cand_list *cands = NULL; + const char *prog_name = prog->name; __u32 local_id; const char *spec_str; int i, j, err; @@ -6264,13 +6266,13 @@ static int bpf_core_apply_relo(struct bpf_program *prog, err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", - prog->name, relo_idx, local_id, btf_kind_str(local_type), + prog_name, relo_idx, local_id, btf_kind_str(local_type), str_is_empty(local_name) ? 
"" : local_name, spec_str, err); return -EINVAL; } - pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog->name, + pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); @@ -6287,7 +6289,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, /* libbpf doesn't support candidate search for anonymous types */ if (str_is_empty(spec_str)) { pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", - prog->name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); return -EOPNOTSUPP; } @@ -6295,7 +6297,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, cands = bpf_core_find_cands(prog->obj, local_btf, local_id); if (IS_ERR(cands)) { pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", - prog->name, relo_idx, local_id, btf_kind_str(local_type), + prog_name, relo_idx, local_id, btf_kind_str(local_type), local_name, PTR_ERR(cands)); return PTR_ERR(cands); } @@ -6311,13 +6313,13 @@ static int bpf_core_apply_relo(struct bpf_program *prog, cands->cands[i].id, &cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", - prog->name, relo_idx, i); + prog_name, relo_idx, i); bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } - pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog->name, + pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, relo_idx, err == 0 ? "non-matching" : "matching", i); bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); @@ -6325,7 +6327,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, if (err == 0) continue; - err = bpf_core_calc_relo(prog, relo, relo_idx, &local_spec, &cand_spec, &cand_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res); if (err) return err; @@ -6337,7 +6339,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * should all resolve to the same bit offset */ pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog->name, relo_idx, cand_spec.bit_offset, + prog_name, relo_idx, cand_spec.bit_offset, targ_spec.bit_offset); return -EINVAL; } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { @@ -6346,7 +6348,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, * proceed due to ambiguity */ pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", - prog->name, relo_idx, + prog_name, relo_idx, cand_res.poison ? "failure" : "success", cand_res.new_val, targ_res.poison ? 
"failure" : "success", targ_res.new_val); return -EINVAL; @@ -6379,10 +6381,10 @@ static int bpf_core_apply_relo(struct bpf_program *prog, */ if (j == 0) { pr_debug("prog '%s': relo #%d: no matching targets found\n", - prog->name, relo_idx); + prog_name, relo_idx); /* calculate single target relo result explicitly */ - err = bpf_core_calc_relo(prog, relo, relo_idx, &local_spec, NULL, &targ_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res); if (err) return err; } @@ -6392,7 +6394,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); if (err) { pr_warn("prog '%s': relo #%d: failed to patch insn #%zu: %d\n", - prog->name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); + prog_name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); return -EINVAL; } From 3ee4f5335511b5357d3e762b3461b0d13e565ad5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jul 2021 17:08:20 -0700 Subject: [PATCH 38/64] libbpf: Split bpf_core_apply_relo() into bpf_program independent helper. bpf_core_apply_relo() doesn't need to know bpf_program internals and hashmap details. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721000822.40958-3-alexei.starovoitov@gmail.com --- tools/lib/bpf/libbpf.c | 117 +++++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 66e9c9e55444..8649f657e81c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5972,26 +5972,13 @@ static int insn_bytes_to_bpf_size(__u32 sz) * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. */ -static int bpf_core_patch_insn(struct bpf_program *prog, - const struct bpf_core_relo *relo, - int relo_idx, - const struct bpf_core_relo_res *res) +static int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) { - const char *prog_name = prog->name; __u32 orig_val, new_val; - struct bpf_insn *insn; - int insn_idx; __u8 class; - if (relo->insn_off % BPF_INSN_SZ) - return -EINVAL; - insn_idx = relo->insn_off / BPF_INSN_SZ; - /* adjust insn_idx from section frame of reference to the local - * program's frame of reference; (sub-)program code is not yet - * relocated, so it's enough to just subtract in-section offset - */ - insn_idx = insn_idx - prog->sec_insn_off; - insn = &prog->insns[insn_idx]; class = BPF_CLASS(insn->code); if (res->poison) { @@ -6077,7 +6064,6 @@ static int bpf_core_patch_insn(struct bpf_program *prog, if (!is_ldimm64_insn(insn) || insn[0].src_reg != 0 || insn[0].off != 0 || - insn_idx + 1 >= prog->insns_cnt || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", @@ -6227,19 +6213,17 @@ static void *u32_as_hash_key(__u32 x) * between multiple relocations for the same type ID and is updated as some * of the candidates are pruned due to structural incompatibility. 
*/ -static int bpf_core_apply_relo(struct bpf_program *prog, - const struct bpf_core_relo *relo, - int relo_idx, - const struct btf *local_btf, - struct hashmap *cand_cache) +static int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct core_cand_list *cands) { struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; - const void *type_key = u32_as_hash_key(relo->type_id); struct bpf_core_relo_res cand_res, targ_res; const struct btf_type *local_type; const char *local_name; - struct core_cand_list *cands = NULL; - const char *prog_name = prog->name; __u32 local_id; const char *spec_str; int i, j, err; @@ -6257,12 +6241,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, if (str_is_empty(spec_str)) return -EINVAL; - if (prog->obj->gen_loader) { - pr_warn("// TODO core_relo: prog %td insn[%d] %s %s kind %d\n", - prog - prog->obj->programs, relo->insn_off / 8, - local_name, spec_str, relo->kind); - return -ENOTSUP; - } err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", @@ -6293,20 +6271,6 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -EOPNOTSUPP; } - if (!hashmap__find(cand_cache, type_key, (void **)&cands)) { - cands = bpf_core_find_cands(prog->obj, local_btf, local_id); - if (IS_ERR(cands)) { - pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", - prog_name, relo_idx, local_id, btf_kind_str(local_type), - local_name, PTR_ERR(cands)); - return PTR_ERR(cands); - } - err = hashmap__set(cand_cache, type_key, cands, NULL, NULL); - if (err) { - bpf_core_free_cands(cands); - return err; - } - } for (i = 0, j = 0; i < cands->len; i++) { err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, @@ -6391,7 +6355,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, patch_insn: /* bpf_core_patch_insn() should know how to handle missing targ_spec */ - err = bpf_core_patch_insn(prog, relo, relo_idx, &targ_res); + err = bpf_core_patch_insn(prog_name, insn, insn_idx, relo, relo_idx, &targ_res); if (err) { pr_warn("prog '%s': relo #%d: failed to patch insn #%zu: %d\n", prog_name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); @@ -6401,6 +6365,67 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return 0; } +static int bpf_core_apply_relo(struct bpf_program *prog, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct hashmap *cand_cache) +{ + const void *type_key = u32_as_hash_key(relo->type_id); + struct core_cand_list *cands = NULL; + const char *prog_name = prog->name; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id = relo->type_id; + struct bpf_insn *insn; + int insn_idx, err; + + if (relo->insn_off % BPF_INSN_SZ) + return -EINVAL; + insn_idx = relo->insn_off / BPF_INSN_SZ; + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; + if (insn_idx > prog->insns_cnt) + return -EINVAL; + insn = &prog->insns[insn_idx]; + + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) + return -EINVAL; + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + if 
(prog->obj->gen_loader) { + pr_warn("// TODO core_relo: prog %td insn[%d] %s kind %d\n", + prog - prog->obj->programs, relo->insn_off / 8, + local_name, relo->kind); + return -ENOTSUP; + } + + if (relo->kind != BPF_TYPE_ID_LOCAL && + !hashmap__find(cand_cache, type_key, (void **)&cands)) { + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); + if (IS_ERR(cands)) { + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + local_name, PTR_ERR(cands)); + return PTR_ERR(cands); + } + err = hashmap__set(cand_cache, type_key, cands, NULL, NULL); + if (err) { + bpf_core_free_cands(cands); + return err; + } + } + + return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, relo_idx, local_btf, cands); +} + static int bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) { From 301ba4d710284e088d278adc477b7edad834577f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jul 2021 17:08:21 -0700 Subject: [PATCH 39/64] libbpf: Move CO-RE types into relo_core.h. In order to make a clean split of CO-RE logic move its types into independent header file. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721000822.40958-4-alexei.starovoitov@gmail.com --- tools/lib/bpf/libbpf.c | 32 ++++-------- tools/lib/bpf/libbpf_internal.h | 71 +------------------------ tools/lib/bpf/relo_core.h | 92 +++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 93 deletions(-) create mode 100644 tools/lib/bpf/relo_core.h diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8649f657e81c..5dcb048da8df 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5022,34 +5022,20 @@ static size_t bpf_core_essential_name_len(const char *name) return n; } -struct core_cand -{ - const struct btf *btf; - const struct btf_type *t; - const char *name; - __u32 id; -}; - -/* dynamically sized list of type IDs and its associated struct btf */ -struct core_cand_list { - struct core_cand *cands; - int len; -}; - -static void bpf_core_free_cands(struct core_cand_list *cands) +static void bpf_core_free_cands(struct bpf_core_cand_list *cands) { free(cands->cands); free(cands); } -static int bpf_core_add_cands(struct core_cand *local_cand, +static int bpf_core_add_cands(struct bpf_core_cand *local_cand, size_t local_essent_len, const struct btf *targ_btf, const char *targ_btf_name, int targ_start_id, - struct core_cand_list *cands) + struct bpf_core_cand_list *cands) { - struct core_cand *new_cands, *cand; + struct bpf_core_cand *new_cands, *cand; const struct btf_type *t; const char *targ_name; size_t targ_essent_len; @@ -5185,11 +5171,11 @@ static int load_module_btfs(struct bpf_object *obj) return 0; } -static struct core_cand_list * +static struct bpf_core_cand_list * bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) { - struct core_cand local_cand = {}; - struct core_cand_list *cands; + struct bpf_core_cand local_cand = {}; + struct bpf_core_cand_list *cands; const struct btf *main_btf; size_t local_essent_len; int err, i; @@ -6218,7 +6204,7 @@ static int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - struct core_cand_list *cands) + struct bpf_core_cand_list *cands) { struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; struct bpf_core_relo_res cand_res, targ_res; 
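
The wrapper bpf_core_apply_relo() introduced above keeps the libbpf-specific bookkeeping, including translating the relocation's byte offset into an instruction index inside the (sub-)program. A small worked sketch of that arithmetic, with hypothetical values:

#include <stdio.h>

#define BPF_INSN_SZ 8	/* sizeof(struct bpf_insn) */

/* Illustrative sketch, not part of the patch: relo->insn_off is a byte
 * offset within the ELF section, so it is divided by the instruction size
 * and shifted into the program's own frame of reference by subtracting
 * sec_insn_off.
 */
int main(void)
{
	unsigned int insn_off = 344;	/* byte offset recorded in bpf_core_relo */
	unsigned int sec_insn_off = 16;	/* program starts 16 insns into its section */
	unsigned int insn_idx;

	insn_idx = insn_off / BPF_INSN_SZ;	/* insn #43 within the section */
	insn_idx -= sec_insn_off;		/* insn #27 within this program */
	printf("patch insn #%u\n", insn_idx);
	return 0;
}
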
@@ -6372,7 +6358,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, struct hashmap *cand_cache) { const void *type_key = u32_as_hash_key(relo->type_id); - struct core_cand_list *cands = NULL; + struct bpf_core_cand_list *cands = NULL; const char *prog_name = prog->name; const struct btf_type *local_type; const char *local_name; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 016ca7cb4f8a..3178d5685dce 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -14,6 +14,7 @@ #include #include #include "libbpf_legacy.h" +#include "relo_core.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 @@ -366,76 +367,6 @@ struct bpf_line_info_min { __u32 line_col; }; -/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value - * has to be adjusted by relocations. - */ -enum bpf_core_relo_kind { - BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ - BPF_FIELD_BYTE_SIZE = 1, /* field size in bytes */ - BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ - BPF_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ - BPF_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ - BPF_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ - BPF_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ - BPF_TYPE_ID_TARGET = 7, /* type ID in target kernel */ - BPF_TYPE_EXISTS = 8, /* type existence in target kernel */ - BPF_TYPE_SIZE = 9, /* type size in bytes */ - BPF_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ - BPF_ENUMVAL_VALUE = 11, /* enum value integer value */ -}; - -/* The minimum bpf_core_relo checked by the loader - * - * CO-RE relocation captures the following data: - * - insn_off - instruction offset (in bytes) within a BPF program that needs - * its insn->imm field to be relocated with actual field info; - * - type_id - BTF type ID of the "root" (containing) entity of a relocatable - * type or field; - * - access_str_off - offset into corresponding .BTF string section. String - * interpretation depends on specific relocation kind: - * - for field-based relocations, string encodes an accessed field using - * a sequence of field and array indices, separated by colon (:). It's - * conceptually very close to LLVM's getelementptr ([0]) instruction's - * arguments for identifying offset to a field. - * - for type-based relocations, strings is expected to be just "0"; - * - for enum value-based relocations, string contains an index of enum - * value within its enum type; - * - * Example to provide a better feel. - * - * struct sample { - * int a; - * struct { - * int b[10]; - * }; - * }; - * - * struct sample *s = ...; - * int x = &s->a; // encoded as "0:0" (a is field #0) - * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, - * // b is field #0 inside anon struct, accessing elem #5) - * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) - * - * type_id for all relocs in this example will capture BTF type id of - * `struct sample`. - * - * Such relocation is emitted when using __builtin_preserve_access_index() - * Clang built-in, passing expression that captures field address, e.g.: - * - * bpf_probe_read(&dst, sizeof(dst), - * __builtin_preserve_access_index(&src->a.b.c)); - * - * In this case Clang will emit field relocation recording necessary data to - * be able to find offset of embedded `a.b.c` field within `src` struct. 
- * - * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction - */ -struct bpf_core_relo { - __u32 insn_off; - __u32 type_id; - __u32 access_str_off; - enum bpf_core_relo_kind kind; -}; typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h new file mode 100644 index 000000000000..ddf20151fe41 --- /dev/null +++ b/tools/lib/bpf/relo_core.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2019 Facebook */ + +#ifndef __RELO_CORE_H +#define __RELO_CORE_H + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. + */ +enum bpf_core_relo_kind { + BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_TYPE_SIZE = 9, /* type size in bytes */ + BPF_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_ENUMVAL_VALUE = 11, /* enum value integer value */ +}; + +/* The minimum bpf_core_relo checked by the loader + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * + * Example to provide a better feel. + * + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int x = &s->a; // encoded as "0:0" (a is field #0) + * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. 
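
As the comment above notes, these records are emitted for accesses wrapped in __builtin_preserve_access_index(). A minimal, hypothetical BPF C snippet that would produce such a field relocation when built with clang -g -target bpf; the struct and function names are illustrative only.

/* Illustrative sketch, not part of the patch: the wrapped access should make
 * clang record a bpf_core_relo with kind BPF_FIELD_BYTE_OFFSET, type_id of
 * `struct sample`, and access string "0:1" (element #0 of s, field #1).
 */
struct sample {
	int a;
	int b;
};

unsigned long field_b_offset(struct sample *s)
{
	return (unsigned long)__builtin_preserve_access_index(&s->b) -
	       (unsigned long)s;
}
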
+ * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +struct bpf_core_cand +{ + const struct btf *btf; + const struct btf_type *t; + const char *name; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct bpf_core_cand_list { + struct bpf_core_cand *cands; + int len; +}; + +#endif From b0588390dbcedcd74fab6ffb8afe8d52380fd8b6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 20 Jul 2021 17:08:22 -0700 Subject: [PATCH 40/64] libbpf: Split CO-RE logic into relo_core.c. Move CO-RE logic into separate file. The internal interface between libbpf and CO-RE is through bpf_core_apply_relo_insn() function and few structs defined in relo_core.h. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210721000822.40958-5-alexei.starovoitov@gmail.com --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/libbpf.c | 1297 +------------------------------ tools/lib/bpf/libbpf_internal.h | 10 + tools/lib/bpf/relo_core.c | 1295 ++++++++++++++++++++++++++++++ tools/lib/bpf/relo_core.h | 12 +- 5 files changed, 1319 insertions(+), 1297 deletions(-) create mode 100644 tools/lib/bpf/relo_core.c diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 430f6874fa41..94f0a146bb7b 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o strset.o linker.o gen_loader.o + btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5dcb048da8df..27b9a6bec9fb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -595,11 +595,6 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } -static bool is_ldimm64_insn(struct bpf_insn *insn) -{ - return insn->code == (BPF_LD | BPF_IMM | BPF_DW); -} - static bool is_call_insn(const struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_CALL); @@ -4725,279 +4720,6 @@ bpf_object__create_maps(struct bpf_object *obj) return err; } -#define BPF_CORE_SPEC_MAX_LEN 64 - -/* represents BPF CO-RE field or array element accessor */ -struct bpf_core_accessor { - __u32 type_id; /* struct/union type or array element type */ - __u32 idx; /* field index or array index */ - const char *name; /* field name or NULL for array accessor */ -}; - -struct bpf_core_spec { - const struct btf *btf; - /* high-level spec: named fields and array indices only */ - struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; - /* original unresolved (no skip_mods_or_typedefs) root type ID */ - __u32 root_type_id; - /* CO-RE relocation kind */ - enum bpf_core_relo_kind relo_kind; - /* high-level spec length */ - int len; - /* raw, low-level spec: 1-to-1 with accessor spec string */ - int raw_spec[BPF_CORE_SPEC_MAX_LEN]; - /* raw spec length */ - int raw_len; - /* field bit offset represented by spec */ - __u32 bit_offset; -}; - -static bool str_is_empty(const char *s) -{ - return !s || !s[0]; -} - -static bool is_flex_arr(const struct btf *btf, - const struct bpf_core_accessor *acc, - const struct btf_array *arr) -{ - const struct btf_type *t; - - /* not a flexible array, if not inside a struct or has non-zero size */ - if (!acc->name || arr->nelems > 0) - return false; - - /* has to be the last 
member of enclosing struct */ - t = btf__type_by_id(btf, acc->type_id); - return acc->idx == btf_vlen(t) - 1; -} - -static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) -{ - switch (kind) { - case BPF_FIELD_BYTE_OFFSET: return "byte_off"; - case BPF_FIELD_BYTE_SIZE: return "byte_sz"; - case BPF_FIELD_EXISTS: return "field_exists"; - case BPF_FIELD_SIGNED: return "signed"; - case BPF_FIELD_LSHIFT_U64: return "lshift_u64"; - case BPF_FIELD_RSHIFT_U64: return "rshift_u64"; - case BPF_TYPE_ID_LOCAL: return "local_type_id"; - case BPF_TYPE_ID_TARGET: return "target_type_id"; - case BPF_TYPE_EXISTS: return "type_exists"; - case BPF_TYPE_SIZE: return "type_size"; - case BPF_ENUMVAL_EXISTS: return "enumval_exists"; - case BPF_ENUMVAL_VALUE: return "enumval_value"; - default: return "unknown"; - } -} - -static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) -{ - switch (kind) { - case BPF_FIELD_BYTE_OFFSET: - case BPF_FIELD_BYTE_SIZE: - case BPF_FIELD_EXISTS: - case BPF_FIELD_SIGNED: - case BPF_FIELD_LSHIFT_U64: - case BPF_FIELD_RSHIFT_U64: - return true; - default: - return false; - } -} - -static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) -{ - switch (kind) { - case BPF_TYPE_ID_LOCAL: - case BPF_TYPE_ID_TARGET: - case BPF_TYPE_EXISTS: - case BPF_TYPE_SIZE: - return true; - default: - return false; - } -} - -static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) -{ - switch (kind) { - case BPF_ENUMVAL_EXISTS: - case BPF_ENUMVAL_VALUE: - return true; - default: - return false; - } -} - -/* - * Turn bpf_core_relo into a low- and high-level spec representation, - * validating correctness along the way, as well as calculating resulting - * field bit offset, specified by accessor string. Low-level spec captures - * every single level of nestedness, including traversing anonymous - * struct/union members. High-level one only captures semantically meaningful - * "turning points": named fields and array indicies. - * E.g., for this case: - * - * struct sample { - * int __unimportant; - * struct { - * int __1; - * int __2; - * int a[7]; - * }; - * }; - * - * struct sample *s = ...; - * - * int x = &s->a[3]; // access string = '0:1:2:3' - * - * Low-level spec has 1:1 mapping with each element of access string (it's - * just a parsed access string representation): [0, 1, 2, 3]. - * - * High-level spec will capture only 3 points: - * - intial zero-index access by pointer (&s->... is the same as &s[0]...); - * - field 'a' access (corresponds to '2' in low-level spec); - * - array element #3 access (corresponds to '3' in low-level spec). - * - * Type-based relocations (TYPE_EXISTS/TYPE_SIZE, - * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their - * spec and raw_spec are kept empty. - * - * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access - * string to specify enumerator's value index that need to be relocated. 
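
The low-level part of the spec is literally the parsed access string. A stand-alone sketch of that parsing step, mirroring the sscanf loop in bpf_core_parse_spec() shown just below but stopping at the raw index array; parse_access_str() is a hypothetical name, and the real code goes on to build the high-level spec and bit offset from BTF.

#include <stdio.h>

#define SPEC_MAX 64	/* BPF_CORE_SPEC_MAX_LEN in the real code */

/* Illustrative sketch, not part of the patch. */
static int parse_access_str(const char *s, int *raw, int *raw_len)
{
	int idx, n;

	*raw_len = 0;
	while (*s) {
		if (*s == ':')
			s++;
		if (sscanf(s, "%d%n", &idx, &n) != 1)
			return -1;
		if (*raw_len == SPEC_MAX)
			return -1;
		s += n;
		raw[(*raw_len)++] = idx;
	}
	return *raw_len ? 0 : -1;
}

int main(void)
{
	int raw[SPEC_MAX], len, i;

	if (parse_access_str("0:1:0:5", raw, &len))	/* &s->b[5] from the example above */
		return 1;
	for (i = 0; i < len; i++)
		printf("%d%s", raw[i], i + 1 < len ? ":" : "\n");
	return 0;
}
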
- */ -static int bpf_core_parse_spec(const struct btf *btf, - __u32 type_id, - const char *spec_str, - enum bpf_core_relo_kind relo_kind, - struct bpf_core_spec *spec) -{ - int access_idx, parsed_len, i; - struct bpf_core_accessor *acc; - const struct btf_type *t; - const char *name; - __u32 id; - __s64 sz; - - if (str_is_empty(spec_str) || *spec_str == ':') - return -EINVAL; - - memset(spec, 0, sizeof(*spec)); - spec->btf = btf; - spec->root_type_id = type_id; - spec->relo_kind = relo_kind; - - /* type-based relocations don't have a field access string */ - if (core_relo_is_type_based(relo_kind)) { - if (strcmp(spec_str, "0")) - return -EINVAL; - return 0; - } - - /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ - while (*spec_str) { - if (*spec_str == ':') - ++spec_str; - if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) - return -EINVAL; - if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) - return -E2BIG; - spec_str += parsed_len; - spec->raw_spec[spec->raw_len++] = access_idx; - } - - if (spec->raw_len == 0) - return -EINVAL; - - t = skip_mods_and_typedefs(btf, type_id, &id); - if (!t) - return -EINVAL; - - access_idx = spec->raw_spec[0]; - acc = &spec->spec[0]; - acc->type_id = id; - acc->idx = access_idx; - spec->len++; - - if (core_relo_is_enumval_based(relo_kind)) { - if (!btf_is_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) - return -EINVAL; - - /* record enumerator name in a first accessor */ - acc->name = btf__name_by_offset(btf, btf_enum(t)[access_idx].name_off); - return 0; - } - - if (!core_relo_is_field_based(relo_kind)) - return -EINVAL; - - sz = btf__resolve_size(btf, id); - if (sz < 0) - return sz; - spec->bit_offset = access_idx * sz * 8; - - for (i = 1; i < spec->raw_len; i++) { - t = skip_mods_and_typedefs(btf, id, &id); - if (!t) - return -EINVAL; - - access_idx = spec->raw_spec[i]; - acc = &spec->spec[spec->len]; - - if (btf_is_composite(t)) { - const struct btf_member *m; - __u32 bit_offset; - - if (access_idx >= btf_vlen(t)) - return -EINVAL; - - bit_offset = btf_member_bit_offset(t, access_idx); - spec->bit_offset += bit_offset; - - m = btf_members(t) + access_idx; - if (m->name_off) { - name = btf__name_by_offset(btf, m->name_off); - if (str_is_empty(name)) - return -EINVAL; - - acc->type_id = id; - acc->idx = access_idx; - acc->name = name; - spec->len++; - } - - id = m->type; - } else if (btf_is_array(t)) { - const struct btf_array *a = btf_array(t); - bool flex; - - t = skip_mods_and_typedefs(btf, a->type, &id); - if (!t) - return -EINVAL; - - flex = is_flex_arr(btf, acc - 1, a); - if (!flex && access_idx >= a->nelems) - return -EINVAL; - - spec->spec[spec->len].type_id = id; - spec->spec[spec->len].idx = access_idx; - spec->len++; - - sz = btf__resolve_size(btf, id); - if (sz < 0) - return sz; - spec->bit_offset += access_idx * sz * 8; - } else { - pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", - type_id, spec_str, i, id, btf_kind_str(t)); - return -EINVAL; - } - } - - return 0; -} - static bool bpf_core_is_flavor_sep(const char *s) { /* check X___Y name pattern, where X and Y are not underscores */ @@ -5010,7 +4732,7 @@ static bool bpf_core_is_flavor_sep(const char *s) * before last triple underscore. Struct name part after last triple * underscore is ignored by BPF CO-RE relocation during relocation matching. 
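
A condensed restatement of the flavor rule described above, so the name-matching behaviour is easy to check in isolation; is_flavor_sep() and essential_name_len() here are simplified stand-ins for the real helpers.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Illustrative sketch, not part of the patch: everything after the last
 * "X___Y" separator (triple underscore with non-underscores on both sides)
 * is ignored when matching candidate type names, so `sample` and
 * `sample___flavor_one` share the essential name "sample".
 */
static bool is_flavor_sep(const char *s)
{
	return s[0] != '_' && s[1] == '_' && s[2] == '_' && s[3] == '_' && s[4] != '_';
}

static size_t essential_name_len(const char *name)
{
	size_t n = strlen(name);
	int i;

	for (i = (int)n - 5; i >= 0; i--)
		if (is_flavor_sep(name + i))
			return i + 1;
	return n;
}

int main(void)
{
	printf("%zu %zu\n", essential_name_len("sample"),
	       essential_name_len("sample___flavor_one"));	/* 6 6 */
	return 0;
}
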
*/ -static size_t bpf_core_essential_name_len(const char *name) +size_t bpf_core_essential_name_len(const char *name) { size_t n = strlen(name); int i; @@ -5229,165 +4951,6 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 l return ERR_PTR(err); } -/* Check two types for compatibility for the purpose of field access - * relocation. const/volatile/restrict and typedefs are skipped to ensure we - * are relocating semantically compatible entities: - * - any two STRUCTs/UNIONs are compatible and can be mixed; - * - any two FWDs are compatible, if their names match (modulo flavor suffix); - * - any two PTRs are always compatible; - * - for ENUMs, names should be the same (ignoring flavor suffix) or at - * least one of enums should be anonymous; - * - for ENUMs, check sizes, names are ignored; - * - for INT, size and signedness are ignored; - * - any two FLOATs are always compatible; - * - for ARRAY, dimensionality is ignored, element types are checked for - * compatibility recursively; - * - everything else shouldn't be ever a target of relocation. - * These rules are not set in stone and probably will be adjusted as we get - * more experience with using BPF CO-RE relocations. - */ -static int bpf_core_fields_are_compat(const struct btf *local_btf, - __u32 local_id, - const struct btf *targ_btf, - __u32 targ_id) -{ - const struct btf_type *local_type, *targ_type; - -recur: - local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; - - if (btf_is_composite(local_type) && btf_is_composite(targ_type)) - return 1; - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_PTR: - case BTF_KIND_FLOAT: - return 1; - case BTF_KIND_FWD: - case BTF_KIND_ENUM: { - const char *local_name, *targ_name; - size_t local_len, targ_len; - - local_name = btf__name_by_offset(local_btf, - local_type->name_off); - targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); - local_len = bpf_core_essential_name_len(local_name); - targ_len = bpf_core_essential_name_len(targ_name); - /* one of them is anonymous or both w/ same flavor-less names */ - return local_len == 0 || targ_len == 0 || - (local_len == targ_len && - strncmp(local_name, targ_name, local_len) == 0); - } - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && - btf_int_offset(targ_type) == 0; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - default: - pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n", - btf_kind(local_type), local_id, targ_id); - return 0; - } -} - -/* - * Given single high-level named field accessor in local type, find - * corresponding high-level accessor for a target type. Along the way, - * maintain low-level spec for target as well. Also keep updating target - * bit offset. - * - * Searching is performed through recursive exhaustive enumeration of all - * fields of a struct/union. If there are any anonymous (embedded) - * structs/unions, they are recursively searched as well. If field with - * desired name is found, check compatibility between local and target types, - * before returning result. - * - * 1 is returned, if field is found. - * 0 is returned if no compatible field is found. 
- * <0 is returned on error. - */ -static int bpf_core_match_member(const struct btf *local_btf, - const struct bpf_core_accessor *local_acc, - const struct btf *targ_btf, - __u32 targ_id, - struct bpf_core_spec *spec, - __u32 *next_targ_id) -{ - const struct btf_type *local_type, *targ_type; - const struct btf_member *local_member, *m; - const char *local_name, *targ_name; - __u32 local_id; - int i, n, found; - - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!targ_type) - return -EINVAL; - if (!btf_is_composite(targ_type)) - return 0; - - local_id = local_acc->type_id; - local_type = btf__type_by_id(local_btf, local_id); - local_member = btf_members(local_type) + local_acc->idx; - local_name = btf__name_by_offset(local_btf, local_member->name_off); - - n = btf_vlen(targ_type); - m = btf_members(targ_type); - for (i = 0; i < n; i++, m++) { - __u32 bit_offset; - - bit_offset = btf_member_bit_offset(targ_type, i); - - /* too deep struct/union/array nesting */ - if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) - return -E2BIG; - - /* speculate this member will be the good one */ - spec->bit_offset += bit_offset; - spec->raw_spec[spec->raw_len++] = i; - - targ_name = btf__name_by_offset(targ_btf, m->name_off); - if (str_is_empty(targ_name)) { - /* embedded struct/union, we need to go deeper */ - found = bpf_core_match_member(local_btf, local_acc, - targ_btf, m->type, - spec, next_targ_id); - if (found) /* either found or error */ - return found; - } else if (strcmp(local_name, targ_name) == 0) { - /* matching named field */ - struct bpf_core_accessor *targ_acc; - - targ_acc = &spec->spec[spec->len++]; - targ_acc->type_id = targ_id; - targ_acc->idx = i; - targ_acc->name = targ_name; - - *next_targ_id = m->type; - found = bpf_core_fields_are_compat(local_btf, - local_member->type, - targ_btf, m->type); - if (!found) - spec->len--; /* pop accessor */ - return found; - } - /* member turned out not to be what we looked for */ - spec->bit_offset -= bit_offset; - spec->raw_len--; - } - - return 0; -} - /* Check local and target types for compatibility. This check is used for * type-based CO-RE relocations and follow slightly different rules than * field-based relocations. This function assumes that root types were already @@ -5407,8 +4970,8 @@ static int bpf_core_match_member(const struct btf *local_btf, * These rules are not set in stone and probably will be adjusted as we get * more experience with using BPF CO-RE relocations. */ -static int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, - const struct btf *targ_btf, __u32 targ_id) +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) { const struct btf_type *local_type, *targ_type; int depth = 32; /* max recursion depth */ @@ -5482,658 +5045,6 @@ static int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id } } -/* - * Try to match local spec to a target type and, if successful, produce full - * target spec (high-level, low-level + bit offset). 
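
For a matched field spec, the accumulated bit offset is simply eight times the ordinary layout offset of the accessed element. Reusing the '0:1:2:3' example from the earlier parse-spec comment, a quick check of the byte offset a BPF_FIELD_BYTE_OFFSET relocation would resolve to, assuming plain 4-byte ints and no extra padding:

#include <stdio.h>
#include <stddef.h>

/* Illustrative sketch, not part of the patch: layout copied from the
 * parse-spec comment above.
 */
struct sample {
	int __unimportant;
	struct {
		int __1;
		int __2;
		int a[7];
	};
};

int main(void)
{
	/* 4 (__unimportant) + 4 + 4 (__1, __2) + 3 * 4 = 24 bytes */
	printf("%zu\n", offsetof(struct sample, a[3]));
	return 0;
}
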
- */ -static int bpf_core_spec_match(struct bpf_core_spec *local_spec, - const struct btf *targ_btf, __u32 targ_id, - struct bpf_core_spec *targ_spec) -{ - const struct btf_type *targ_type; - const struct bpf_core_accessor *local_acc; - struct bpf_core_accessor *targ_acc; - int i, sz, matched; - - memset(targ_spec, 0, sizeof(*targ_spec)); - targ_spec->btf = targ_btf; - targ_spec->root_type_id = targ_id; - targ_spec->relo_kind = local_spec->relo_kind; - - if (core_relo_is_type_based(local_spec->relo_kind)) { - return bpf_core_types_are_compat(local_spec->btf, - local_spec->root_type_id, - targ_btf, targ_id); - } - - local_acc = &local_spec->spec[0]; - targ_acc = &targ_spec->spec[0]; - - if (core_relo_is_enumval_based(local_spec->relo_kind)) { - size_t local_essent_len, targ_essent_len; - const struct btf_enum *e; - const char *targ_name; - - /* has to resolve to an enum */ - targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); - if (!btf_is_enum(targ_type)) - return 0; - - local_essent_len = bpf_core_essential_name_len(local_acc->name); - - for (i = 0, e = btf_enum(targ_type); i < btf_vlen(targ_type); i++, e++) { - targ_name = btf__name_by_offset(targ_spec->btf, e->name_off); - targ_essent_len = bpf_core_essential_name_len(targ_name); - if (targ_essent_len != local_essent_len) - continue; - if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { - targ_acc->type_id = targ_id; - targ_acc->idx = i; - targ_acc->name = targ_name; - targ_spec->len++; - targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; - targ_spec->raw_len++; - return 1; - } - } - return 0; - } - - if (!core_relo_is_field_based(local_spec->relo_kind)) - return -EINVAL; - - for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { - targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, - &targ_id); - if (!targ_type) - return -EINVAL; - - if (local_acc->name) { - matched = bpf_core_match_member(local_spec->btf, - local_acc, - targ_btf, targ_id, - targ_spec, &targ_id); - if (matched <= 0) - return matched; - } else { - /* for i=0, targ_id is already treated as array element - * type (because it's the original struct), for others - * we should find array element type first - */ - if (i > 0) { - const struct btf_array *a; - bool flex; - - if (!btf_is_array(targ_type)) - return 0; - - a = btf_array(targ_type); - flex = is_flex_arr(targ_btf, targ_acc - 1, a); - if (!flex && local_acc->idx >= a->nelems) - return 0; - if (!skip_mods_and_typedefs(targ_btf, a->type, - &targ_id)) - return -EINVAL; - } - - /* too deep struct/union/array nesting */ - if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) - return -E2BIG; - - targ_acc->type_id = targ_id; - targ_acc->idx = local_acc->idx; - targ_acc->name = NULL; - targ_spec->len++; - targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; - targ_spec->raw_len++; - - sz = btf__resolve_size(targ_btf, targ_id); - if (sz < 0) - return sz; - targ_spec->bit_offset += local_acc->idx * sz * 8; - } - } - - return 1; -} - -static int bpf_core_calc_field_relo(const char *prog_name, - const struct bpf_core_relo *relo, - const struct bpf_core_spec *spec, - __u32 *val, __u32 *field_sz, __u32 *type_id, - bool *validate) -{ - const struct bpf_core_accessor *acc; - const struct btf_type *t; - __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; - const struct btf_member *m; - const struct btf_type *mt; - bool bitfield; - __s64 sz; - - *field_sz = 0; - - if (relo->kind == BPF_FIELD_EXISTS) { - *val = spec ? 
1 : 0; - return 0; - } - - if (!spec) - return -EUCLEAN; /* request instruction poisoning */ - - acc = &spec->spec[spec->len - 1]; - t = btf__type_by_id(spec->btf, acc->type_id); - - /* a[n] accessor needs special handling */ - if (!acc->name) { - if (relo->kind == BPF_FIELD_BYTE_OFFSET) { - *val = spec->bit_offset / 8; - /* remember field size for load/store mem size */ - sz = btf__resolve_size(spec->btf, acc->type_id); - if (sz < 0) - return -EINVAL; - *field_sz = sz; - *type_id = acc->type_id; - } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { - sz = btf__resolve_size(spec->btf, acc->type_id); - if (sz < 0) - return -EINVAL; - *val = sz; - } else { - pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", - prog_name, relo->kind, relo->insn_off / 8); - return -EINVAL; - } - if (validate) - *validate = true; - return 0; - } - - m = btf_members(t) + acc->idx; - mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); - bit_off = spec->bit_offset; - bit_sz = btf_member_bitfield_size(t, acc->idx); - - bitfield = bit_sz > 0; - if (bitfield) { - byte_sz = mt->size; - byte_off = bit_off / 8 / byte_sz * byte_sz; - /* figure out smallest int size necessary for bitfield load */ - while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { - if (byte_sz >= 8) { - /* bitfield can't be read with 64-bit read */ - pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", - prog_name, relo->kind, relo->insn_off / 8); - return -E2BIG; - } - byte_sz *= 2; - byte_off = bit_off / 8 / byte_sz * byte_sz; - } - } else { - sz = btf__resolve_size(spec->btf, field_type_id); - if (sz < 0) - return -EINVAL; - byte_sz = sz; - byte_off = spec->bit_offset / 8; - bit_sz = byte_sz * 8; - } - - /* for bitfields, all the relocatable aspects are ambiguous and we - * might disagree with compiler, so turn off validation of expected - * value, except for signedness - */ - if (validate) - *validate = !bitfield; - - switch (relo->kind) { - case BPF_FIELD_BYTE_OFFSET: - *val = byte_off; - if (!bitfield) { - *field_sz = byte_sz; - *type_id = field_type_id; - } - break; - case BPF_FIELD_BYTE_SIZE: - *val = byte_sz; - break; - case BPF_FIELD_SIGNED: - /* enums will be assumed unsigned */ - *val = btf_is_enum(mt) || - (btf_int_encoding(mt) & BTF_INT_SIGNED); - if (validate) - *validate = true; /* signedness is never ambiguous */ - break; - case BPF_FIELD_LSHIFT_U64: -#if __BYTE_ORDER == __LITTLE_ENDIAN - *val = 64 - (bit_off + bit_sz - byte_off * 8); -#else - *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); -#endif - break; - case BPF_FIELD_RSHIFT_U64: - *val = 64 - bit_sz; - if (validate) - *validate = true; /* right shift is never ambiguous */ - break; - case BPF_FIELD_EXISTS: - default: - return -EOPNOTSUPP; - } - - return 0; -} - -static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, - const struct bpf_core_spec *spec, - __u32 *val) -{ - __s64 sz; - - /* type-based relos return zero when target type is not found */ - if (!spec) { - *val = 0; - return 0; - } - - switch (relo->kind) { - case BPF_TYPE_ID_TARGET: - *val = spec->root_type_id; - break; - case BPF_TYPE_EXISTS: - *val = 1; - break; - case BPF_TYPE_SIZE: - sz = btf__resolve_size(spec->btf, spec->root_type_id); - if (sz < 0) - return -EINVAL; - *val = sz; - break; - case BPF_TYPE_ID_LOCAL: - /* BPF_TYPE_ID_LOCAL is handled specially and shouldn't get here */ - default: - return -EOPNOTSUPP; - } - - return 0; -} - -static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, - const struct 
bpf_core_spec *spec, - __u32 *val) -{ - const struct btf_type *t; - const struct btf_enum *e; - - switch (relo->kind) { - case BPF_ENUMVAL_EXISTS: - *val = spec ? 1 : 0; - break; - case BPF_ENUMVAL_VALUE: - if (!spec) - return -EUCLEAN; /* request instruction poisoning */ - t = btf__type_by_id(spec->btf, spec->spec[0].type_id); - e = btf_enum(t) + spec->spec[0].idx; - *val = e->val; - break; - default: - return -EOPNOTSUPP; - } - - return 0; -} - -struct bpf_core_relo_res -{ - /* expected value in the instruction, unless validate == false */ - __u32 orig_val; - /* new value that needs to be patched up to */ - __u32 new_val; - /* relocation unsuccessful, poison instruction, but don't fail load */ - bool poison; - /* some relocations can't be validated against orig_val */ - bool validate; - /* for field byte offset relocations or the forms: - * *(T *)(rX + ) = rY - * rX = *(T *)(rY + ), - * we remember original and resolved field size to adjust direct - * memory loads of pointers and integers; this is necessary for 32-bit - * host kernel architectures, but also allows to automatically - * relocate fields that were resized from, e.g., u32 to u64, etc. - */ - bool fail_memsz_adjust; - __u32 orig_sz; - __u32 orig_type_id; - __u32 new_sz; - __u32 new_type_id; -}; - -/* Calculate original and target relocation values, given local and target - * specs and relocation kind. These values are calculated for each candidate. - * If there are multiple candidates, resulting values should all be consistent - * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. - * If instruction has to be poisoned, *poison will be set to true. - */ -static int bpf_core_calc_relo(const char *prog_name, - const struct bpf_core_relo *relo, - int relo_idx, - const struct bpf_core_spec *local_spec, - const struct bpf_core_spec *targ_spec, - struct bpf_core_relo_res *res) -{ - int err = -EOPNOTSUPP; - - res->orig_val = 0; - res->new_val = 0; - res->poison = false; - res->validate = true; - res->fail_memsz_adjust = false; - res->orig_sz = res->new_sz = 0; - res->orig_type_id = res->new_type_id = 0; - - if (core_relo_is_field_based(relo->kind)) { - err = bpf_core_calc_field_relo(prog_name, relo, local_spec, - &res->orig_val, &res->orig_sz, - &res->orig_type_id, &res->validate); - err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, - &res->new_val, &res->new_sz, - &res->new_type_id, NULL); - if (err) - goto done; - /* Validate if it's safe to adjust load/store memory size. - * Adjustments are performed only if original and new memory - * sizes differ. - */ - res->fail_memsz_adjust = false; - if (res->orig_sz != res->new_sz) { - const struct btf_type *orig_t, *new_t; - - orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); - new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); - - /* There are two use cases in which it's safe to - * adjust load/store's mem size: - * - reading a 32-bit kernel pointer, while on BPF - * size pointers are always 64-bit; in this case - * it's safe to "downsize" instruction size due to - * pointer being treated as unsigned integer with - * zero-extended upper 32-bits; - * - reading unsigned integers, again due to - * zero-extension is preserving the value correctly. - * - * In all other cases it's incorrect to attempt to - * load/store field because read value will be - * incorrect, so we poison relocated instruction. 
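
The bitfield branch of bpf_core_calc_field_relo() above picks the smallest naturally aligned load that still covers the whole bitfield and then derives the shift amounts for BPF_FIELD_LSHIFT_U64/RSHIFT_U64. A worked little-endian example with hypothetical numbers: a 5-bit field starting at bit offset 36 whose member base type is a 4-byte int.

#include <stdio.h>

/* Illustrative sketch, not part of the patch. */
int main(void)
{
	unsigned int bit_off = 36, bit_sz = 5, byte_sz = 4, byte_off;

	/* smallest naturally aligned load that covers the whole bitfield */
	byte_off = bit_off / 8 / byte_sz * byte_sz;
	while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) {
		byte_sz *= 2;	/* the real code bails out past 8 bytes */
		byte_off = bit_off / 8 / byte_sz * byte_sz;
	}

	printf("load %u bytes at byte offset %u\n", byte_sz, byte_off);
	printf("lshift_u64 = %u, rshift_u64 = %u\n",
	       64 - (bit_off + bit_sz - byte_off * 8), 64 - bit_sz);
	/* load 4 bytes at byte offset 4; lshift_u64 = 55, rshift_u64 = 59 */
	return 0;
}
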
- */ - if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) - goto done; - if (btf_is_int(orig_t) && btf_is_int(new_t) && - btf_int_encoding(orig_t) != BTF_INT_SIGNED && - btf_int_encoding(new_t) != BTF_INT_SIGNED) - goto done; - - /* mark as invalid mem size adjustment, but this will - * only be checked for LDX/STX/ST insns - */ - res->fail_memsz_adjust = true; - } - } else if (core_relo_is_type_based(relo->kind)) { - err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val); - err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val); - } else if (core_relo_is_enumval_based(relo->kind)) { - err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); - err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); - } - -done: - if (err == -EUCLEAN) { - /* EUCLEAN is used to signal instruction poisoning request */ - res->poison = true; - err = 0; - } else if (err == -EOPNOTSUPP) { - /* EOPNOTSUPP means unknown/unsupported relocation */ - pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", - prog_name, relo_idx, core_relo_kind_str(relo->kind), - relo->kind, relo->insn_off / 8); - } - - return err; -} - -/* - * Turn instruction for which CO_RE relocation failed into invalid one with - * distinct signature. - */ -static void bpf_core_poison_insn(const char *prog_name, int relo_idx, - int insn_idx, struct bpf_insn *insn) -{ - pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", - prog_name, relo_idx, insn_idx); - insn->code = BPF_JMP | BPF_CALL; - insn->dst_reg = 0; - insn->src_reg = 0; - insn->off = 0; - /* if this instruction is reachable (not a dead code), - * verifier will complain with the following message: - * invalid func unknown#195896080 - */ - insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ -} - -static int insn_bpf_size_to_bytes(struct bpf_insn *insn) -{ - switch (BPF_SIZE(insn->code)) { - case BPF_DW: return 8; - case BPF_W: return 4; - case BPF_H: return 2; - case BPF_B: return 1; - default: return -1; - } -} - -static int insn_bytes_to_bpf_size(__u32 sz) -{ - switch (sz) { - case 8: return BPF_DW; - case 4: return BPF_W; - case 2: return BPF_H; - case 1: return BPF_B; - default: return -1; - } -} - -/* - * Patch relocatable BPF instruction. - * - * Patched value is determined by relocation kind and target specification. - * For existence relocations target spec will be NULL if field/type is not found. - * Expected insn->imm value is determined using relocation kind and local - * spec, and is checked before patching instruction. If actual insn->imm value - * is wrong, bail out with error. - * - * Currently supported classes of BPF instruction are: - * 1. rX = (assignment with immediate operand); - * 2. rX += (arithmetic operations with immediate operand); - * 3. rX = (load with 64-bit immediate value); - * 4. rX = *(T *)(rY + ), where T is one of {u8, u16, u32, u64}; - * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; - * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. 
- */ -static int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, - int insn_idx, const struct bpf_core_relo *relo, - int relo_idx, const struct bpf_core_relo_res *res) -{ - __u32 orig_val, new_val; - __u8 class; - - class = BPF_CLASS(insn->code); - - if (res->poison) { -poison: - /* poison second part of ldimm64 to avoid confusing error from - * verifier about "unknown opcode 00" - */ - if (is_ldimm64_insn(insn)) - bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); - bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); - return 0; - } - - orig_val = res->orig_val; - new_val = res->new_val; - - switch (class) { - case BPF_ALU: - case BPF_ALU64: - if (BPF_SRC(insn->code) != BPF_K) - return -EINVAL; - if (res->validate && insn->imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", - prog_name, relo_idx, - insn_idx, insn->imm, orig_val, new_val); - return -EINVAL; - } - orig_val = insn->imm; - insn->imm = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", - prog_name, relo_idx, insn_idx, - orig_val, new_val); - break; - case BPF_LDX: - case BPF_ST: - case BPF_STX: - if (res->validate && insn->off != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - prog_name, relo_idx, insn_idx, insn->off, orig_val, new_val); - return -EINVAL; - } - if (new_val > SHRT_MAX) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - prog_name, relo_idx, insn_idx, new_val); - return -ERANGE; - } - if (res->fail_memsz_adjust) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. " - "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", - prog_name, relo_idx, insn_idx); - goto poison; - } - - orig_val = insn->off; - insn->off = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - prog_name, relo_idx, insn_idx, orig_val, new_val); - - if (res->new_sz != res->orig_sz) { - int insn_bytes_sz, insn_bpf_sz; - - insn_bytes_sz = insn_bpf_size_to_bytes(insn); - if (insn_bytes_sz != res->orig_sz) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", - prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); - return -EINVAL; - } - - insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); - if (insn_bpf_sz < 0) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", - prog_name, relo_idx, insn_idx, res->new_sz); - return -EINVAL; - } - - insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); - pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", - prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); - } - break; - case BPF_LD: { - __u64 imm; - - if (!is_ldimm64_insn(insn) || - insn[0].src_reg != 0 || insn[0].off != 0 || - insn[1].code != 0 || insn[1].dst_reg != 0 || - insn[1].src_reg != 0 || insn[1].off != 0) { - pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", - prog_name, relo_idx, insn_idx); - return -EINVAL; - } - - imm = insn[0].imm + ((__u64)insn[1].imm << 32); - if (res->validate && imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", - prog_name, relo_idx, - insn_idx, (unsigned long long)imm, - orig_val, new_val); - return -EINVAL; - } - - insn[0].imm = new_val; - insn[1].imm = 0; /* 
currently only 32-bit values are supported */ - pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", - prog_name, relo_idx, insn_idx, - (unsigned long long)imm, new_val); - break; - } - default: - pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", - prog_name, relo_idx, insn_idx, insn->code, - insn->src_reg, insn->dst_reg, insn->off, insn->imm); - return -EINVAL; - } - - return 0; -} - -/* Output spec definition in the format: - * [] () + => @, - * where is a C-syntax view of recorded field access, e.g.: x.a[3].b - */ -static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) -{ - const struct btf_type *t; - const struct btf_enum *e; - const char *s; - __u32 type_id; - int i; - - type_id = spec->root_type_id; - t = btf__type_by_id(spec->btf, type_id); - s = btf__name_by_offset(spec->btf, t->name_off); - - libbpf_print(level, "[%u] %s %s", type_id, btf_kind_str(t), str_is_empty(s) ? "" : s); - - if (core_relo_is_type_based(spec->relo_kind)) - return; - - if (core_relo_is_enumval_based(spec->relo_kind)) { - t = skip_mods_and_typedefs(spec->btf, type_id, NULL); - e = btf_enum(t) + spec->raw_spec[0]; - s = btf__name_by_offset(spec->btf, e->name_off); - - libbpf_print(level, "::%s = %u", s, e->val); - return; - } - - if (core_relo_is_field_based(spec->relo_kind)) { - for (i = 0; i < spec->len; i++) { - if (spec->spec[i].name) - libbpf_print(level, ".%s", spec->spec[i].name); - else if (i > 0 || spec->spec[i].idx > 0) - libbpf_print(level, "[%u]", spec->spec[i].idx); - } - - libbpf_print(level, " ("); - for (i = 0; i < spec->raw_len; i++) - libbpf_print(level, "%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); - - if (spec->bit_offset % 8) - libbpf_print(level, " @ offset %u.%u)", - spec->bit_offset / 8, spec->bit_offset % 8); - else - libbpf_print(level, " @ offset %u)", spec->bit_offset / 8); - return; - } -} - static size_t bpf_core_hash_fn(const void *key, void *ctx) { return (size_t)key; @@ -6149,208 +5060,6 @@ static void *u32_as_hash_key(__u32 x) return (void *)(uintptr_t)x; } -/* - * CO-RE relocate single instruction. - * - * The outline and important points of the algorithm: - * 1. For given local type, find corresponding candidate target types. - * Candidate type is a type with the same "essential" name, ignoring - * everything after last triple underscore (___). E.g., `sample`, - * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates - * for each other. Names with triple underscore are referred to as - * "flavors" and are useful, among other things, to allow to - * specify/support incompatible variations of the same kernel struct, which - * might differ between different kernel versions and/or build - * configurations. - * - * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C - * converter, when deduplicated BTF of a kernel still contains more than - * one different types with the same name. In that case, ___2, ___3, etc - * are appended starting from second name conflict. But start flavors are - * also useful to be defined "locally", in BPF program, to extract same - * data from incompatible changes between different kernel - * versions/configurations. For instance, to handle field renames between - * kernel versions, one can use two flavors of the struct name with the - * same common name and use conditional relocations to extract that field, - * depending on target kernel version. - * 2. 
For each candidate type, try to match local specification to this - * candidate target type. Matching involves finding corresponding - * high-level spec accessors, meaning that all named fields should match, - * as well as all array accesses should be within the actual bounds. Also, - * types should be compatible (see bpf_core_fields_are_compat for details). - * 3. It is supported and expected that there might be multiple flavors - * matching the spec. As long as all the specs resolve to the same set of - * offsets across all candidates, there is no error. If there is any - * ambiguity, CO-RE relocation will fail. This is necessary to accomodate - * imprefection of BTF deduplication, which can cause slight duplication of - * the same BTF type, if some directly or indirectly referenced (by - * pointer) type gets resolved to different actual types in different - * object files. If such situation occurs, deduplicated BTF will end up - * with two (or more) structurally identical types, which differ only in - * types they refer to through pointer. This should be OK in most cases and - * is not an error. - * 4. Candidate types search is performed by linearly scanning through all - * types in target BTF. It is anticipated that this is overall more - * efficient memory-wise and not significantly worse (if not better) - * CPU-wise compared to prebuilding a map from all local type names to - * a list of candidate type names. It's also sped up by caching resolved - * list of matching candidates per each local "root" type ID, that has at - * least one bpf_core_relo associated with it. This list is shared - * between multiple relocations for the same type ID and is updated as some - * of the candidates are pruned due to structural incompatibility. - */ -static int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, - int insn_idx, - const struct bpf_core_relo *relo, - int relo_idx, - const struct btf *local_btf, - struct bpf_core_cand_list *cands) -{ - struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; - struct bpf_core_relo_res cand_res, targ_res; - const struct btf_type *local_type; - const char *local_name; - __u32 local_id; - const char *spec_str; - int i, j, err; - - local_id = relo->type_id; - local_type = btf__type_by_id(local_btf, local_id); - if (!local_type) - return -EINVAL; - - local_name = btf__name_by_offset(local_btf, local_type->name_off); - if (!local_name) - return -EINVAL; - - spec_str = btf__name_by_offset(local_btf, relo->access_str_off); - if (str_is_empty(spec_str)) - return -EINVAL; - - err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); - if (err) { - pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", - prog_name, relo_idx, local_id, btf_kind_str(local_type), - str_is_empty(local_name) ? 
"" : local_name, - spec_str, err); - return -EINVAL; - } - - pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, - relo_idx, core_relo_kind_str(relo->kind), relo->kind); - bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); - libbpf_print(LIBBPF_DEBUG, "\n"); - - /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ - if (relo->kind == BPF_TYPE_ID_LOCAL) { - targ_res.validate = true; - targ_res.poison = false; - targ_res.orig_val = local_spec.root_type_id; - targ_res.new_val = local_spec.root_type_id; - goto patch_insn; - } - - /* libbpf doesn't support candidate search for anonymous types */ - if (str_is_empty(spec_str)) { - pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", - prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); - return -EOPNOTSUPP; - } - - - for (i = 0, j = 0; i < cands->len; i++) { - err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, - cands->cands[i].id, &cand_spec); - if (err < 0) { - pr_warn("prog '%s': relo #%d: error matching candidate #%d ", - prog_name, relo_idx, i); - bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); - libbpf_print(LIBBPF_WARN, ": %d\n", err); - return err; - } - - pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, - relo_idx, err == 0 ? "non-matching" : "matching", i); - bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); - libbpf_print(LIBBPF_DEBUG, "\n"); - - if (err == 0) - continue; - - err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res); - if (err) - return err; - - if (j == 0) { - targ_res = cand_res; - targ_spec = cand_spec; - } else if (cand_spec.bit_offset != targ_spec.bit_offset) { - /* if there are many field relo candidates, they - * should all resolve to the same bit offset - */ - pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog_name, relo_idx, cand_spec.bit_offset, - targ_spec.bit_offset); - return -EINVAL; - } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { - /* all candidates should result in the same relocation - * decision and value, otherwise it's dangerous to - * proceed due to ambiguity - */ - pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", - prog_name, relo_idx, - cand_res.poison ? "failure" : "success", cand_res.new_val, - targ_res.poison ? "failure" : "success", targ_res.new_val); - return -EINVAL; - } - - cands->cands[j++] = cands->cands[i]; - } - - /* - * For BPF_FIELD_EXISTS relo or when used BPF program has field - * existence checks or kernel version/config checks, it's expected - * that we might not find any candidates. In this case, if field - * wasn't found in any candidate, the list of candidates shouldn't - * change at all, we'll just handle relocating appropriately, - * depending on relo's kind. - */ - if (j > 0) - cands->len = j; - - /* - * If no candidates were found, it might be both a programmer error, - * as well as expected case, depending whether instruction w/ - * relocation is guarded in some way that makes it unreachable (dead - * code) if relocation can't be resolved. This is handled in - * bpf_core_patch_insn() uniformly by replacing that instruction with - * BPF helper call insn (using invalid helper ID). If that instruction - * is indeed unreachable, then it will be ignored and eliminated by - * verifier. If it was an error, then verifier will complain and point - * to a specific instruction number in its log. 
- */ - if (j == 0) { - pr_debug("prog '%s': relo #%d: no matching targets found\n", - prog_name, relo_idx); - - /* calculate single target relo result explicitly */ - err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res); - if (err) - return err; - } - -patch_insn: - /* bpf_core_patch_insn() should know how to handle missing targ_spec */ - err = bpf_core_patch_insn(prog_name, insn, insn_idx, relo, relo_idx, &targ_res); - if (err) { - pr_warn("prog '%s': relo #%d: failed to patch insn #%zu: %d\n", - prog_name, relo_idx, relo->insn_off / BPF_INSN_SZ, err); - return -EINVAL; - } - - return 0; -} - static int bpf_core_apply_relo(struct bpf_program *prog, const struct bpf_core_relo *relo, int relo_idx, diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 3178d5685dce..f7b691d5f9eb 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -425,4 +425,14 @@ static inline void *libbpf_ptr(void *ret) return ret; } +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c new file mode 100644 index 000000000000..4016ed492d0c --- /dev/null +++ b/tools/lib/bpf/relo_core.c @@ -0,0 +1,1295 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" + +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf__type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + +static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_FIELD_BYTE_OFFSET: return "byte_off"; + case BPF_FIELD_BYTE_SIZE: return "byte_sz"; + case BPF_FIELD_EXISTS: return "field_exists"; + case BPF_FIELD_SIGNED: return "signed"; + case BPF_FIELD_LSHIFT_U64: return "lshift_u64"; + case BPF_FIELD_RSHIFT_U64: return "rshift_u64"; + case BPF_TYPE_ID_LOCAL: return "local_type_id"; + case BPF_TYPE_ID_TARGET: return "target_type_id"; + case BPF_TYPE_EXISTS: return "type_exists"; + case BPF_TYPE_SIZE: return "type_size"; + 
case BPF_ENUMVAL_EXISTS: return "enumval_exists"; + case BPF_ENUMVAL_VALUE: return "enumval_value"; + default: return "unknown"; + } +} + +static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_FIELD_BYTE_OFFSET: + case BPF_FIELD_BYTE_SIZE: + case BPF_FIELD_EXISTS: + case BPF_FIELD_SIGNED: + case BPF_FIELD_LSHIFT_U64: + case BPF_FIELD_RSHIFT_U64: + return true; + default: + return false; + } +} + +static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_TYPE_ID_LOCAL: + case BPF_TYPE_ID_TARGET: + case BPF_TYPE_EXISTS: + case BPF_TYPE_SIZE: + return true; + default: + return false; + } +} + +static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_ENUMVAL_EXISTS: + case BPF_ENUMVAL_VALUE: + return true; + default: + return false; + } +} + +/* + * Turn bpf_core_relo into a low- and high-level spec representation, + * validating correctness along the way, as well as calculating resulting + * field bit offset, specified by accessor string. Low-level spec captures + * every single level of nestedness, including traversing anonymous + * struct/union members. High-level one only captures semantically meaningful + * "turning points": named fields and array indicies. + * E.g., for this case: + * + * struct sample { + * int __unimportant; + * struct { + * int __1; + * int __2; + * int a[7]; + * }; + * }; + * + * struct sample *s = ...; + * + * int x = &s->a[3]; // access string = '0:1:2:3' + * + * Low-level spec has 1:1 mapping with each element of access string (it's + * just a parsed access string representation): [0, 1, 2, 3]. + * + * High-level spec will capture only 3 points: + * - intial zero-index access by pointer (&s->... is the same as &s[0]...); + * - field 'a' access (corresponds to '2' in low-level spec); + * - array element #3 access (corresponds to '3' in low-level spec). + * + * Type-based relocations (TYPE_EXISTS/TYPE_SIZE, + * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their + * spec and raw_spec are kept empty. + * + * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access + * string to specify enumerator's value index that need to be relocated. 
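To make the access string format described above concrete, here is a small standalone user-space sketch (an illustration only, not part of this patch) that parses a spec string such as "0:1:2:3" into a raw index array with the same sscanf("%d%n") idiom used by bpf_core_parse_spec():

#include <stdio.h>
#include <errno.h>

#define SPEC_MAX_LEN 64

/* Parse "0:1:2:3" into raw[]; returns the number of indices,
 * -EINVAL on malformed input or -E2BIG if the spec is too deep.
 */
static int parse_access_str(const char *s, int *raw, int max_len)
{
	int idx, n, len = 0;

	if (!s || !*s || *s == ':')
		return -EINVAL;

	while (*s) {
		if (*s == ':')
			s++;
		if (sscanf(s, "%d%n", &idx, &n) != 1)
			return -EINVAL;
		if (len == max_len)
			return -E2BIG;
		s += n;
		raw[len++] = idx;
	}
	return len;
}

int main(void)
{
	int raw[SPEC_MAX_LEN];
	int i, len = parse_access_str("0:1:2:3", raw, SPEC_MAX_LEN);

	for (i = 0; i < len; i++)
		printf("raw_spec[%d] = %d\n", i, raw[i]);
	return 0;
}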
+ */ +static int bpf_core_parse_spec(const struct btf *btf, + __u32 type_id, + const char *spec_str, + enum bpf_core_relo_kind relo_kind, + struct bpf_core_spec *spec) +{ + int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; + const struct btf_type *t; + const char *name; + __u32 id; + __s64 sz; + + if (str_is_empty(spec_str) || *spec_str == ':') + return -EINVAL; + + memset(spec, 0, sizeof(*spec)); + spec->btf = btf; + spec->root_type_id = type_id; + spec->relo_kind = relo_kind; + + /* type-based relocations don't have a field access string */ + if (core_relo_is_type_based(relo_kind)) { + if (strcmp(spec_str, "0")) + return -EINVAL; + return 0; + } + + /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ + while (*spec_str) { + if (*spec_str == ':') + ++spec_str; + if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) + return -EINVAL; + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + spec_str += parsed_len; + spec->raw_spec[spec->raw_len++] = access_idx; + } + + if (spec->raw_len == 0) + return -EINVAL; + + t = skip_mods_and_typedefs(btf, type_id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[0]; + acc = &spec->spec[0]; + acc->type_id = id; + acc->idx = access_idx; + spec->len++; + + if (core_relo_is_enumval_based(relo_kind)) { + if (!btf_is_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + return -EINVAL; + + /* record enumerator name in a first accessor */ + acc->name = btf__name_by_offset(btf, btf_enum(t)[access_idx].name_off); + return 0; + } + + if (!core_relo_is_field_based(relo_kind)) + return -EINVAL; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset = access_idx * sz * 8; + + for (i = 1; i < spec->raw_len; i++) { + t = skip_mods_and_typedefs(btf, id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[i]; + acc = &spec->spec[spec->len]; + + if (btf_is_composite(t)) { + const struct btf_member *m; + __u32 bit_offset; + + if (access_idx >= btf_vlen(t)) + return -EINVAL; + + bit_offset = btf_member_bit_offset(t, access_idx); + spec->bit_offset += bit_offset; + + m = btf_members(t) + access_idx; + if (m->name_off) { + name = btf__name_by_offset(btf, m->name_off); + if (str_is_empty(name)) + return -EINVAL; + + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; + spec->len++; + } + + id = m->type; + } else if (btf_is_array(t)) { + const struct btf_array *a = btf_array(t); + bool flex; + + t = skip_mods_and_typedefs(btf, a->type, &id); + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) + return -EINVAL; + + spec->spec[spec->len].type_id = id; + spec->spec[spec->len].idx = access_idx; + spec->len++; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset += access_idx * sz * 8; + } else { + pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + type_id, spec_str, i, id, btf_kind_str(t)); + return -EINVAL; + } + } + + return 0; +} + +/* Check two types for compatibility for the purpose of field access + * relocation. 
const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ +static int bpf_core_fields_are_compat(const struct btf *local_btf, + __u32 local_id, + const struct btf *targ_btf, + __u32 targ_id) +{ + const struct btf_type *local_type, *targ_type; + +recur: + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_is_composite(local_type) && btf_is_composite(targ_type)) + return 1; + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_PTR: + case BTF_KIND_FLOAT: + return 1; + case BTF_KIND_FWD: + case BTF_KIND_ENUM: { + const char *local_name, *targ_name; + size_t local_len, targ_len; + + local_name = btf__name_by_offset(local_btf, + local_type->name_off); + targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); + local_len = bpf_core_essential_name_len(local_name); + targ_len = bpf_core_essential_name_len(targ_name); + /* one of them is anonymous or both w/ same flavor-less names */ + return local_len == 0 || targ_len == 0 || + (local_len == targ_len && + strncmp(local_name, targ_name, local_len) == 0); + } + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && + btf_int_offset(targ_type) == 0; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + default: + pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n", + btf_kind(local_type), local_id, targ_id); + return 0; + } +} + +/* + * Given single high-level named field accessor in local type, find + * corresponding high-level accessor for a target type. Along the way, + * maintain low-level spec for target as well. Also keep updating target + * bit offset. + * + * Searching is performed through recursive exhaustive enumeration of all + * fields of a struct/union. If there are any anonymous (embedded) + * structs/unions, they are recursively searched as well. If field with + * desired name is found, check compatibility between local and target types, + * before returning result. + * + * 1 is returned, if field is found. + * 0 is returned if no compatible field is found. + * <0 is returned on error. 
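The recursive descent through anonymous members described above can be sketched in isolation. The toy_member/toy_type types below are made-up stand-ins for BTF data (names and layout are purely illustrative, not from this patch); the walk accumulates bit offsets the way bpf_core_match_member() does, returning 1 when the field is found and 0 otherwise (error handling elided):

#include <stdio.h>
#include <string.h>

struct toy_type;

/* Made-up stand-in for a BTF struct member (illustration only). */
struct toy_member {
	const char *name;		/* NULL means anonymous struct/union */
	unsigned int bit_offset;	/* offset within the enclosing type */
	const struct toy_type *type;	/* set for composite members */
};

struct toy_type {
	int nr_members;
	const struct toy_member *members;
};

/* returns 1 and fills *off when the named field is found, 0 otherwise */
static int match_member(const struct toy_type *t, const char *name,
			unsigned int *off)
{
	int i;

	for (i = 0; i < t->nr_members; i++) {
		const struct toy_member *m = &t->members[i];

		if (!m->name) {
			/* anonymous member: search one level deeper */
			if (m->type && match_member(m->type, name, off)) {
				*off += m->bit_offset;
				return 1;
			}
		} else if (strcmp(m->name, name) == 0) {
			*off = m->bit_offset;
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	static const struct toy_member inner_members[] = {
		{ "a", 64, NULL },
	};
	static const struct toy_type inner = { 1, inner_members };
	static const struct toy_member outer_members[] = {
		{ "x", 0, NULL },
		{ NULL, 32, &inner },	/* anonymous struct at bit 32 */
	};
	static const struct toy_type outer = { 2, outer_members };
	unsigned int off;

	if (match_member(&outer, "a", &off))
		printf("'a' found at bit offset %u\n", off);	/* prints 96 */
	return 0;
}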
+ */ +static int bpf_core_match_member(const struct btf *local_btf, + const struct bpf_core_accessor *local_acc, + const struct btf *targ_btf, + __u32 targ_id, + struct bpf_core_spec *spec, + __u32 *next_targ_id) +{ + const struct btf_type *local_type, *targ_type; + const struct btf_member *local_member, *m; + const char *local_name, *targ_name; + __u32 local_id; + int i, n, found; + + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!targ_type) + return -EINVAL; + if (!btf_is_composite(targ_type)) + return 0; + + local_id = local_acc->type_id; + local_type = btf__type_by_id(local_btf, local_id); + local_member = btf_members(local_type) + local_acc->idx; + local_name = btf__name_by_offset(local_btf, local_member->name_off); + + n = btf_vlen(targ_type); + m = btf_members(targ_type); + for (i = 0; i < n; i++, m++) { + __u32 bit_offset; + + bit_offset = btf_member_bit_offset(targ_type, i); + + /* too deep struct/union/array nesting */ + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + /* speculate this member will be the good one */ + spec->bit_offset += bit_offset; + spec->raw_spec[spec->raw_len++] = i; + + targ_name = btf__name_by_offset(targ_btf, m->name_off); + if (str_is_empty(targ_name)) { + /* embedded struct/union, we need to go deeper */ + found = bpf_core_match_member(local_btf, local_acc, + targ_btf, m->type, + spec, next_targ_id); + if (found) /* either found or error */ + return found; + } else if (strcmp(local_name, targ_name) == 0) { + /* matching named field */ + struct bpf_core_accessor *targ_acc; + + targ_acc = &spec->spec[spec->len++]; + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + + *next_targ_id = m->type; + found = bpf_core_fields_are_compat(local_btf, + local_member->type, + targ_btf, m->type); + if (!found) + spec->len--; /* pop accessor */ + return found; + } + /* member turned out not to be what we looked for */ + spec->bit_offset -= bit_offset; + spec->raw_len--; + } + + return 0; +} + +/* + * Try to match local spec to a target type and, if successful, produce full + * target spec (high-level, low-level + bit offset). 
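From the BPF program side, the matching machinery above is usually exercised through struct "flavors" and guarded field accesses. A hedged sketch follows; the type and field names are only an example (loosely modelled on the task_struct state to __state rename) and assume the usual <bpf/bpf_core_read.h> helpers:

#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>

/* Two local "flavors" of the same kernel struct; everything after the
 * triple underscore is dropped when candidate types are looked up in
 * kernel BTF. Field names are only an example, not something this
 * patch defines.
 */
struct task_struct___pre_514 {
	long state;
} __attribute__((preserve_access_index));

struct task_struct___post_514 {
	unsigned int __state;
} __attribute__((preserve_access_index));

static __always_inline long get_task_state(void *task)
{
	struct task_struct___post_514 *t_new = task;
	struct task_struct___pre_514 *t_old = task;

	/* The existence check leaves the losing branch as dead code, so its
	 * unresolved (poisoned) relocation is never seen by the verifier.
	 */
	if (bpf_core_field_exists(t_new->__state))
		return BPF_CORE_READ(t_new, __state);
	return BPF_CORE_READ(t_old, state);
}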
+ */ +static int bpf_core_spec_match(struct bpf_core_spec *local_spec, + const struct btf *targ_btf, __u32 targ_id, + struct bpf_core_spec *targ_spec) +{ + const struct btf_type *targ_type; + const struct bpf_core_accessor *local_acc; + struct bpf_core_accessor *targ_acc; + int i, sz, matched; + + memset(targ_spec, 0, sizeof(*targ_spec)); + targ_spec->btf = targ_btf; + targ_spec->root_type_id = targ_id; + targ_spec->relo_kind = local_spec->relo_kind; + + if (core_relo_is_type_based(local_spec->relo_kind)) { + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + } + + local_acc = &local_spec->spec[0]; + targ_acc = &targ_spec->spec[0]; + + if (core_relo_is_enumval_based(local_spec->relo_kind)) { + size_t local_essent_len, targ_essent_len; + const struct btf_enum *e; + const char *targ_name; + + /* has to resolve to an enum */ + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); + if (!btf_is_enum(targ_type)) + return 0; + + local_essent_len = bpf_core_essential_name_len(local_acc->name); + + for (i = 0, e = btf_enum(targ_type); i < btf_vlen(targ_type); i++, e++) { + targ_name = btf__name_by_offset(targ_spec->btf, e->name_off); + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + return 1; + } + } + return 0; + } + + if (!core_relo_is_field_based(local_spec->relo_kind)) + return -EINVAL; + + for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, + &targ_id); + if (!targ_type) + return -EINVAL; + + if (local_acc->name) { + matched = bpf_core_match_member(local_spec->btf, + local_acc, + targ_btf, targ_id, + targ_spec, &targ_id); + if (matched <= 0) + return matched; + } else { + /* for i=0, targ_id is already treated as array element + * type (because it's the original struct), for others + * we should find array element type first + */ + if (i > 0) { + const struct btf_array *a; + bool flex; + + if (!btf_is_array(targ_type)) + return 0; + + a = btf_array(targ_type); + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) + return 0; + if (!skip_mods_and_typedefs(targ_btf, a->type, + &targ_id)) + return -EINVAL; + } + + /* too deep struct/union/array nesting */ + if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + targ_acc->type_id = targ_id; + targ_acc->idx = local_acc->idx; + targ_acc->name = NULL; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + + sz = btf__resolve_size(targ_btf, targ_id); + if (sz < 0) + return sz; + targ_spec->bit_offset += local_acc->idx * sz * 8; + } + } + + return 1; +} + +static int bpf_core_calc_field_relo(const char *prog_name, + const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u32 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) +{ + const struct bpf_core_accessor *acc; + const struct btf_type *t; + __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id; + const struct btf_member *m; + const struct btf_type *mt; + bool bitfield; + __s64 sz; + + *field_sz = 0; + + if (relo->kind == BPF_FIELD_EXISTS) { + *val = spec ? 
1 : 0; + return 0; + } + + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + + acc = &spec->spec[spec->len - 1]; + t = btf__type_by_id(spec->btf, acc->type_id); + + /* a[n] accessor needs special handling */ + if (!acc->name) { + if (relo->kind == BPF_FIELD_BYTE_OFFSET) { + *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; + } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + } else { + pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", + prog_name, relo->kind, relo->insn_off / 8); + return -EINVAL; + } + if (validate) + *validate = true; + return 0; + } + + m = btf_members(t) + acc->idx; + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); + bit_off = spec->bit_offset; + bit_sz = btf_member_bitfield_size(t, acc->idx); + + bitfield = bit_sz > 0; + if (bitfield) { + byte_sz = mt->size; + byte_off = bit_off / 8 / byte_sz * byte_sz; + /* figure out smallest int size necessary for bitfield load */ + while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { + if (byte_sz >= 8) { + /* bitfield can't be read with 64-bit read */ + pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", + prog_name, relo->kind, relo->insn_off / 8); + return -E2BIG; + } + byte_sz *= 2; + byte_off = bit_off / 8 / byte_sz * byte_sz; + } + } else { + sz = btf__resolve_size(spec->btf, field_type_id); + if (sz < 0) + return -EINVAL; + byte_sz = sz; + byte_off = spec->bit_offset / 8; + bit_sz = byte_sz * 8; + } + + /* for bitfields, all the relocatable aspects are ambiguous and we + * might disagree with compiler, so turn off validation of expected + * value, except for signedness + */ + if (validate) + *validate = !bitfield; + + switch (relo->kind) { + case BPF_FIELD_BYTE_OFFSET: + *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } + break; + case BPF_FIELD_BYTE_SIZE: + *val = byte_sz; + break; + case BPF_FIELD_SIGNED: + /* enums will be assumed unsigned */ + *val = btf_is_enum(mt) || + (btf_int_encoding(mt) & BTF_INT_SIGNED); + if (validate) + *validate = true; /* signedness is never ambiguous */ + break; + case BPF_FIELD_LSHIFT_U64: +#if __BYTE_ORDER == __LITTLE_ENDIAN + *val = 64 - (bit_off + bit_sz - byte_off * 8); +#else + *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); +#endif + break; + case BPF_FIELD_RSHIFT_U64: + *val = 64 - bit_sz; + if (validate) + *validate = true; /* right shift is never ambiguous */ + break; + case BPF_FIELD_EXISTS: + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u32 *val) +{ + __s64 sz; + + /* type-based relos return zero when target type is not found */ + if (!spec) { + *val = 0; + return 0; + } + + switch (relo->kind) { + case BPF_TYPE_ID_TARGET: + *val = spec->root_type_id; + break; + case BPF_TYPE_EXISTS: + *val = 1; + break; + case BPF_TYPE_SIZE: + sz = btf__resolve_size(spec->btf, spec->root_type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + break; + case BPF_TYPE_ID_LOCAL: + /* BPF_TYPE_ID_LOCAL is handled specially and shouldn't get here */ + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, + const struct 
bpf_core_spec *spec, + __u32 *val) +{ + const struct btf_type *t; + const struct btf_enum *e; + + switch (relo->kind) { + case BPF_ENUMVAL_EXISTS: + *val = spec ? 1 : 0; + break; + case BPF_ENUMVAL_VALUE: + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + t = btf__type_by_id(spec->btf, spec->spec[0].type_id); + e = btf_enum(t) + spec->spec[0].idx; + *val = e->val; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +struct bpf_core_relo_res +{ + /* expected value in the instruction, unless validate == false */ + __u32 orig_val; + /* new value that needs to be patched up to */ + __u32 new_val; + /* relocation unsuccessful, poison instruction, but don't fail load */ + bool poison; + /* some relocations can't be validated against orig_val */ + bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + ) = rY + * rX = *(T *)(rY + ), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. + */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; +}; + +/* Calculate original and target relocation values, given local and target + * specs and relocation kind. These values are calculated for each candidate. + * If there are multiple candidates, resulting values should all be consistent + * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. + * If instruction has to be poisoned, *poison will be set to true. + */ +static int bpf_core_calc_relo(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct bpf_core_spec *local_spec, + const struct bpf_core_spec *targ_spec, + struct bpf_core_relo_res *res) +{ + int err = -EOPNOTSUPP; + + res->orig_val = 0; + res->new_val = 0; + res->poison = false; + res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; + + if (core_relo_is_field_based(relo->kind)) { + err = bpf_core_calc_field_relo(prog_name, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. + */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. 
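The zero-extension argument above is easy to demonstrate with plain C (a standalone illustration, not BPF code): widening an unsigned 32-bit load to 64 bits preserves the value, while doing the same to a signed field silently changes it, which is why such accesses are poisoned rather than resized:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t u = 0xffffffffu;
	int32_t  s = -1;

	/* zero-extending an unsigned 32-bit value to 64 bits is lossless */
	printf("u32 0x%x -> u64 0x%llx (same value)\n",
	       u, (unsigned long long)(uint64_t)u);

	/* zero-extending a signed value changes it: -1 becomes 4294967295,
	 * only sign extension preserves the meaning
	 */
	printf("s32 %d -> zero-extended %lld (wrong), sign-extended %lld (ok)\n",
	       s, (long long)(uint64_t)(uint32_t)s, (long long)(int64_t)s);
	return 0;
}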
+ */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } + } else if (core_relo_is_type_based(relo->kind)) { + err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val); + } else if (core_relo_is_enumval_based(relo->kind)) { + err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); + } + +done: + if (err == -EUCLEAN) { + /* EUCLEAN is used to signal instruction poisoning request */ + res->poison = true; + err = 0; + } else if (err == -EOPNOTSUPP) { + /* EOPNOTSUPP means unknown/unsupported relocation */ + pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); + } + + return err; +} + +/* + * Turn instruction for which CO_RE relocation failed into invalid one with + * distinct signature. + */ +static void bpf_core_poison_insn(const char *prog_name, int relo_idx, + int insn_idx, struct bpf_insn *insn) +{ + pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", + prog_name, relo_idx, insn_idx); + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with the following message: + * invalid func unknown#195896080 + */ + insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ +} + +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* + * Patch relocatable BPF instruction. + * + * Patched value is determined by relocation kind and target specification. + * For existence relocations target spec will be NULL if field/type is not found. + * Expected insn->imm value is determined using relocation kind and local + * spec, and is checked before patching instruction. If actual insn->imm value + * is wrong, bail out with error. + * + * Currently supported classes of BPF instruction are: + * 1. rX = (assignment with immediate operand); + * 2. rX += (arithmetic operations with immediate operand); + * 3. rX = (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + ), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. 
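For the load/store size adjustment in cases 4-6 above, only the size bits of the opcode are rewritten. A minimal sketch, assuming the uapi <linux/bpf.h> header for struct bpf_insn and the opcode macros, of turning a 4-byte load into an 8-byte one:

#include <stdio.h>
#include <linux/bpf.h>

int main(void)
{
	/* rX = *(u32 *)(rY + 16): BPF_LDX | BPF_MEM | BPF_W */
	struct bpf_insn insn = {
		.code = BPF_LDX | BPF_MEM | BPF_W,
		.dst_reg = 1,
		.src_reg = 2,
		.off = 16,
	};

	/* The field grew from 4 to 8 bytes in the target kernel: keep the
	 * mode and class bits, swap only the size bits, as
	 * bpf_core_patch_insn() does for LDX/ST/STX.
	 */
	insn.code = BPF_MODE(insn.code) | BPF_DW | BPF_CLASS(insn.code);

	printf("patched opcode: 0x%02x\n", insn.code); /* 0x79 = LDX|MEM|DW */
	return 0;
}

This is also why the adjustment is refused up front (fail_memsz_adjust) when either side is a signed integer.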
+ */ +static int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) +{ + __u32 orig_val, new_val; + __u8 class; + + class = BPF_CLASS(insn->code); + + if (res->poison) { +poison: + /* poison second part of ldimm64 to avoid confusing error from + * verifier about "unknown opcode 00" + */ + if (is_ldimm64_insn(insn)) + bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); + return 0; + } + + orig_val = res->orig_val; + new_val = res->new_val; + + switch (class) { + case BPF_ALU: + case BPF_ALU64: + if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + if (res->validate && insn->imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", + prog_name, relo_idx, + insn_idx, insn->imm, orig_val, new_val); + return -EINVAL; + } + orig_val = insn->imm; + insn->imm = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", + prog_name, relo_idx, insn_idx, + orig_val, new_val); + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (res->validate && insn->off != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", + prog_name, relo_idx, insn_idx, insn->off, orig_val, new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", + prog_name, relo_idx, insn_idx, new_val); + return -ERANGE; + } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. " + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog_name, relo_idx, insn_idx); + goto poison; + } + + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", + prog_name, relo_idx, insn_idx, orig_val, new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog_name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } + break; + case BPF_LD: { + __u64 imm; + + if (!is_ldimm64_insn(insn) || + insn[0].src_reg != 0 || insn[0].off != 0 || + insn[1].code != 0 || insn[1].dst_reg != 0 || + insn[1].src_reg != 0 || insn[1].off != 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", + prog_name, relo_idx, insn_idx); + return -EINVAL; + } + + imm = insn[0].imm + ((__u64)insn[1].imm << 32); + if (res->validate && imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", + prog_name, relo_idx, + insn_idx, (unsigned long long)imm, + orig_val, new_val); + return -EINVAL; + } + + insn[0].imm = new_val; + insn[1].imm = 0; /* 
currently only 32-bit values are supported */ + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)imm, new_val); + break; + } + default: + pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", + prog_name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); + return -EINVAL; + } + + return 0; +} + +/* Output spec definition in the format: + * [] () + => @, + * where is a C-syntax view of recorded field access, e.g.: x.a[3].b + */ +static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) +{ + const struct btf_type *t; + const struct btf_enum *e; + const char *s; + __u32 type_id; + int i; + + type_id = spec->root_type_id; + t = btf__type_by_id(spec->btf, type_id); + s = btf__name_by_offset(spec->btf, t->name_off); + + libbpf_print(level, "[%u] %s %s", type_id, btf_kind_str(t), str_is_empty(s) ? "" : s); + + if (core_relo_is_type_based(spec->relo_kind)) + return; + + if (core_relo_is_enumval_based(spec->relo_kind)) { + t = skip_mods_and_typedefs(spec->btf, type_id, NULL); + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + + libbpf_print(level, "::%s = %u", s, e->val); + return; + } + + if (core_relo_is_field_based(spec->relo_kind)) { + for (i = 0; i < spec->len; i++) { + if (spec->spec[i].name) + libbpf_print(level, ".%s", spec->spec[i].name); + else if (i > 0 || spec->spec[i].idx > 0) + libbpf_print(level, "[%u]", spec->spec[i].idx); + } + + libbpf_print(level, " ("); + for (i = 0; i < spec->raw_len; i++) + libbpf_print(level, "%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); + + if (spec->bit_offset % 8) + libbpf_print(level, " @ offset %u.%u)", + spec->bit_offset / 8, spec->bit_offset % 8); + else + libbpf_print(level, " @ offset %u)", spec->bit_offset / 8); + return; + } +} + +/* + * CO-RE relocate single instruction. + * + * The outline and important points of the algorithm: + * 1. For given local type, find corresponding candidate target types. + * Candidate type is a type with the same "essential" name, ignoring + * everything after last triple underscore (___). E.g., `sample`, + * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates + * for each other. Names with triple underscore are referred to as + * "flavors" and are useful, among other things, to allow to + * specify/support incompatible variations of the same kernel struct, which + * might differ between different kernel versions and/or build + * configurations. + * + * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C + * converter, when deduplicated BTF of a kernel still contains more than + * one different types with the same name. In that case, ___2, ___3, etc + * are appended starting from second name conflict. But start flavors are + * also useful to be defined "locally", in BPF program, to extract same + * data from incompatible changes between different kernel + * versions/configurations. For instance, to handle field renames between + * kernel versions, one can use two flavors of the struct name with the + * same common name and use conditional relocations to extract that field, + * depending on target kernel version. + * 2. For each candidate type, try to match local specification to this + * candidate target type. 
Matching involves finding corresponding + * high-level spec accessors, meaning that all named fields should match, + * as well as all array accesses should be within the actual bounds. Also, + * types should be compatible (see bpf_core_fields_are_compat for details). + * 3. It is supported and expected that there might be multiple flavors + * matching the spec. As long as all the specs resolve to the same set of + * offsets across all candidates, there is no error. If there is any + * ambiguity, CO-RE relocation will fail. This is necessary to accomodate + * imprefection of BTF deduplication, which can cause slight duplication of + * the same BTF type, if some directly or indirectly referenced (by + * pointer) type gets resolved to different actual types in different + * object files. If such situation occurs, deduplicated BTF will end up + * with two (or more) structurally identical types, which differ only in + * types they refer to through pointer. This should be OK in most cases and + * is not an error. + * 4. Candidate types search is performed by linearly scanning through all + * types in target BTF. It is anticipated that this is overall more + * efficient memory-wise and not significantly worse (if not better) + * CPU-wise compared to prebuilding a map from all local type names to + * a list of candidate type names. It's also sped up by caching resolved + * list of matching candidates per each local "root" type ID, that has at + * least one bpf_core_relo associated with it. This list is shared + * between multiple relocations for the same type ID and is updated as some + * of the candidates are pruned due to structural incompatibility. + */ +int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands) +{ + struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; + struct bpf_core_relo_res cand_res, targ_res; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id; + const char *spec_str; + int i, j, err; + + local_id = relo->type_id; + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) + return -EINVAL; + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + spec_str = btf__name_by_offset(local_btf, relo->access_str_off); + if (str_is_empty(spec_str)) + return -EINVAL; + + err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); + if (err) { + pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + str_is_empty(local_name) ? 
"" : local_name, + spec_str, err); + return -EINVAL; + } + + pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, + relo_idx, core_relo_kind_str(relo->kind), relo->kind); + bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); + libbpf_print(LIBBPF_DEBUG, "\n"); + + /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ + if (relo->kind == BPF_TYPE_ID_LOCAL) { + targ_res.validate = true; + targ_res.poison = false; + targ_res.orig_val = local_spec.root_type_id; + targ_res.new_val = local_spec.root_type_id; + goto patch_insn; + } + + /* libbpf doesn't support candidate search for anonymous types */ + if (str_is_empty(spec_str)) { + pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + return -EOPNOTSUPP; + } + + + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, + cands->cands[i].id, &cand_spec); + if (err < 0) { + pr_warn("prog '%s': relo #%d: error matching candidate #%d ", + prog_name, relo_idx, i); + bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); + libbpf_print(LIBBPF_WARN, ": %d\n", err); + return err; + } + + pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, + relo_idx, err == 0 ? "non-matching" : "matching", i); + bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); + libbpf_print(LIBBPF_DEBUG, "\n"); + + if (err == 0) + continue; + + err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res); + if (err) + return err; + + if (j == 0) { + targ_res = cand_res; + targ_spec = cand_spec; + } else if (cand_spec.bit_offset != targ_spec.bit_offset) { + /* if there are many field relo candidates, they + * should all resolve to the same bit offset + */ + pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", + prog_name, relo_idx, cand_spec.bit_offset, + targ_spec.bit_offset); + return -EINVAL; + } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { + /* all candidates should result in the same relocation + * decision and value, otherwise it's dangerous to + * proceed due to ambiguity + */ + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", + prog_name, relo_idx, + cand_res.poison ? "failure" : "success", cand_res.new_val, + targ_res.poison ? "failure" : "success", targ_res.new_val); + return -EINVAL; + } + + cands->cands[j++] = cands->cands[i]; + } + + /* + * For BPF_FIELD_EXISTS relo or when used BPF program has field + * existence checks or kernel version/config checks, it's expected + * that we might not find any candidates. In this case, if field + * wasn't found in any candidate, the list of candidates shouldn't + * change at all, we'll just handle relocating appropriately, + * depending on relo's kind. + */ + if (j > 0) + cands->len = j; + + /* + * If no candidates were found, it might be both a programmer error, + * as well as expected case, depending whether instruction w/ + * relocation is guarded in some way that makes it unreachable (dead + * code) if relocation can't be resolved. This is handled in + * bpf_core_patch_insn() uniformly by replacing that instruction with + * BPF helper call insn (using invalid helper ID). If that instruction + * is indeed unreachable, then it will be ignored and eliminated by + * verifier. If it was an error, then verifier will complain and point + * to a specific instruction number in its log. 
+ */ + if (j == 0) { + pr_debug("prog '%s': relo #%d: no matching targets found\n", + prog_name, relo_idx); + + /* calculate single target relo result explicitly */ + err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res); + if (err) + return err; + } + +patch_insn: + /* bpf_core_patch_insn() should know how to handle missing targ_spec */ + err = bpf_core_patch_insn(prog_name, insn, insn_idx, relo, relo_idx, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", + prog_name, relo_idx, relo->insn_off / 8, err); + return -EINVAL; + } + + return 0; +} diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index ddf20151fe41..3b9f8f18346c 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -75,8 +75,7 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; -struct bpf_core_cand -{ +struct bpf_core_cand { const struct btf *btf; const struct btf_type *t; const char *name; @@ -89,4 +88,13 @@ struct bpf_core_cand_list { int len; }; +int bpf_core_apply_relo_insn(const char *prog_name, + struct bpf_insn *insn, int insn_idx, + const struct bpf_core_relo *relo, int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands); +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id); + +size_t bpf_core_essential_name_len(const char *name); #endif From 4cda0c82a34bce96a4e7f229e48a0a57f39acd1b Mon Sep 17 00:00:00 2001 From: Jussi Maki Date: Thu, 8 Jul 2021 02:17:27 +0000 Subject: [PATCH 41/64] selftests/bpf: Use ping6 only if available in tc_redirect In the tc_redirect test only use ping6 if it's available and otherwise fall back to using "ping -6". Signed-off-by: Jussi Maki --- .../selftests/bpf/prog_tests/tc_redirect.c | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 5703c918812b..932e4ee3f97c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -13,15 +13,16 @@ #define _GNU_SOURCE #include +#include +#include #include #include -#include -#include #include #include #include -#include #include +#include +#include #include "test_progs.h" #include "network_helpers.h" @@ -389,11 +390,21 @@ static void test_tcp(int family, const char *addr, __u16 port) close(client_fd); } +static char *ping_command(int family) +{ + if (family == AF_INET6) { + /* On some systems 'ping' doesn't support IPv6, so use ping6 if it is present. */ + if (!system("which ping6 >/dev/null 2>&1")) + return "ping6"; + else + return "ping -6"; + } + return "ping"; +} + static int test_ping(int family, const char *addr) { - const char *ping = family == AF_INET6 ? "ping6" : "ping"; - - SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping, addr); + SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s > /dev/null", ping_command(family), addr); return 0; fail: return -1; From 7d07006f05922b95518be403f08ef8437b67aa32 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Tue, 27 Jul 2021 04:10:55 +0000 Subject: [PATCH 42/64] samples: bpf: Fix tracex7 error raised on the missing argument The current behavior of 'tracex7' doesn't consist with other bpf samples tracex{1..6}. Other samples do not require any argument to run with, but tracex7 should be run with btrfs device argument. 
(it should be executed with test_override_return.sh) Currently, tracex7 doesn't have any description about how to run this program and raises an unexpected error. And this result might be confusing since users might not have a hunch about how to run this program. // Current behavior # ./tracex7 sh: 1: Syntax error: word unexpected (expecting ")") // Fixed behavior # ./tracex7 ERROR: Run with the btrfs device argument! In order to fix this error, this commit adds logic to report a message and exit when running this program with a missing argument. Additionally in test_override_return.sh, there is a problem with multiple directory(tmpmnt) creation. So in this commit adds a line with removing the directory with every execution. Signed-off-by: Juhee Kang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210727041056.23455-1-claudiajkang@gmail.com --- samples/bpf/test_override_return.sh | 1 + samples/bpf/tracex7_user.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh index e68b9ee6814b..35db26f736b9 100755 --- a/samples/bpf/test_override_return.sh +++ b/samples/bpf/test_override_return.sh @@ -1,5 +1,6 @@ #!/bin/bash +rm -r tmpmnt rm -f testfile.img dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 DEVICE=$(losetup --show -f testfile.img) diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c index fdcd6580dd73..8be7ce18d3ba 100644 --- a/samples/bpf/tracex7_user.c +++ b/samples/bpf/tracex7_user.c @@ -14,6 +14,11 @@ int main(int argc, char **argv) int ret = 0; FILE *f; + if (!argv[1]) { + fprintf(stderr, "ERROR: Run with the btrfs device argument!\n"); + return 0; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); obj = bpf_object__open_file(filename, NULL); if (libbpf_get_error(obj)) { From 05e9b4f60d31a03815a699be24226bf26b7021f9 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Tue, 27 Jul 2021 04:10:56 +0000 Subject: [PATCH 43/64] samples: bpf: Add the omitted xdp samples to .gitignore There are recently added xdp samples (xdp_redirect_map_multi and xdpsock_ctrl_proc) which are not managed by .gitignore. This commit adds these files to .gitignore. Signed-off-by: Juhee Kang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210727041056.23455-2-claudiajkang@gmail.com --- samples/bpf/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 0b9548ea8477..fcba217f0ae2 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -45,11 +45,13 @@ xdp_monitor xdp_redirect xdp_redirect_cpu xdp_redirect_map +xdp_redirect_map_multi xdp_router_ipv4 xdp_rxq_info xdp_sample_pkts xdp_tx_iptunnel xdpsock +xdpsock_ctrl_proc xsk_fwd testfile.img hbm_out.log From c139e40a515d2d1e51f7c08bd63ed4d1c7f64163 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 27 Jul 2021 19:59:28 +0800 Subject: [PATCH 44/64] libbpf: Fix comment typo Remove the repeated word 'the' in line 48. 
Signed-off-by: Jason Wang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210727115928.74600-1-wangborong@cdjrlc.com --- tools/lib/bpf/libbpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 27b9a6bec9fb..6c6834622944 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5958,7 +5958,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj) for (i = 0; i < obj->nr_programs; i++) { struct bpf_program *p = &obj->programs[i]; - + if (!p->nr_reloc) continue; @@ -8260,7 +8260,7 @@ static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, ret = snprintf(btf_type_name, sizeof(btf_type_name), "%s%s", prefix, name); /* snprintf returns the number of characters written excluding the - * the terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it + * terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it * indicates truncation. */ if (ret < 0 || ret >= sizeof(btf_type_name)) @@ -8802,7 +8802,7 @@ struct bpf_link { int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; - + ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); return libbpf_err_errno(ret); } From 043c5bb3c4f43670ab4fea0b847373ab42d25f3e Mon Sep 17 00:00:00 2001 From: Martynas Pumputis Date: Mon, 26 Jul 2021 17:20:01 +0200 Subject: [PATCH 45/64] libbpf: Fix race when pinning maps in parallel When loading multiple programs in parallel that use the same to-be-pinned map, it is possible that two instances of the loader will call bpf_object__create_maps() at the same time. If the map doesn't exist when both instances call bpf_object__reuse_map(), then one of the instances will fail with EEXIST when calling bpf_map__pin(). Fix the race by retrying to reuse the map if bpf_map__pin() returns EEXIST. The fix is similar to the one in iproute2: e4c4685fd6e4 ("bpf: Fix race condition with map pinning"). Before retrying the pinning, we don't do any special cleaning of the internal map state. Closer code inspection revealed that it's not required: - bpf_object__create_map(): map->inner_map is destroyed after a successful call, map->fd is closed if pinning fails. - bpf_object__populate_internal_map(): created map elements are destroyed upon close(map->fd). - init_map_slots(): slots are freed after their initialization.
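To make the reuse-or-pin flow easier to picture, here is a minimal user-space sketch of the same retry-on-EEXIST idea. It uses the public bpf_obj_get()/bpf_obj_pin()/bpf_create_map() wrappers from libbpf's bpf.h rather than the internal bpf_object__create_maps() path this patch touches, and the map type, sizes and pin path are purely illustrative:

    #include <errno.h>
    #include <unistd.h>
    #include <bpf/bpf.h>

    /* Sketch only: reuse a pinned map if it exists, otherwise create and pin it,
     * falling back to reuse when a concurrent loader pins the same path first.
     */
    static int get_or_create_pinned_map(const char *pin_path)
    {
            int fd, err;

    retry:
            fd = bpf_obj_get(pin_path);        /* reuse a map somebody already pinned */
            if (fd >= 0)
                    return fd;

            fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long), 1, 0);
            if (fd < 0)
                    return -errno;

            if (bpf_obj_pin(fd, pin_path)) {
                    err = -errno;
                    close(fd);
                    if (err == -EEXIST)        /* another loader won the race: reuse its map */
                            goto retry;
                    return err;
            }
            return fd;
    }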
Signed-off-by: Martynas Pumputis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210726152001.34845-1-m@lambda.lt --- tools/lib/bpf/libbpf.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6c6834622944..a1ca6fb0c6d8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4657,10 +4657,13 @@ bpf_object__create_maps(struct bpf_object *obj) char *cp, errmsg[STRERR_BUFSIZE]; unsigned int i, j; int err; + bool retried; for (i = 0; i < obj->nr_maps; i++) { map = &obj->maps[i]; + retried = false; +retry: if (map->pin_path) { err = bpf_object__reuse_map(map); if (err) { @@ -4668,6 +4671,12 @@ bpf_object__create_maps(struct bpf_object *obj) map->name); goto err_out; } + if (retried && map->fd < 0) { + pr_warn("map '%s': cannot find pinned map\n", + map->name); + err = -ENOENT; + goto err_out; + } } if (map->fd >= 0) { @@ -4701,9 +4710,13 @@ bpf_object__create_maps(struct bpf_object *obj) if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { + zclose(map->fd); + if (!retried && err == -EEXIST) { + retried = true; + goto retry; + } pr_warn("map '%s': failed to auto-pin at '%s': %d\n", map->name, map->pin_path, err); - zclose(map->fd); goto err_out; } } From 33b57e0cc78eb82f2921eb4c6d1c8fcaa733823b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 27 Jul 2021 15:23:35 -0700 Subject: [PATCH 46/64] bpf: Increase supported cgroup storage value size Current max cgroup storage value size is 4k (PAGE_SIZE). The other local storages accept up to 64k (BPF_LOCAL_STORAGE_MAX_VALUE_SIZE). Let's align max cgroup value size with the other storages. For percpu, the max is 32k (PCPU_MIN_UNIT_SIZE) because percpu allocator is not happy about larger values. netcnt test is extended to exercise those maximum values (non-percpu max size is close to, but not real max). v4: * remove inner union (Andrii Nakryiko) * keep net_cnt on the stack (Andrii Nakryiko) v3: * refine SIZEOF_BPF_LOCAL_STORAGE_ELEM comment (Yonghong Song) * anonymous struct in percpu_net_cnt & net_cnt (Yonghong Song) * reorder free (Yonghong Song) v2: * cap max_value_size instead of BUILD_BUG_ON (Martin KaFai Lau) Signed-off-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727222335.4029096-1-sdf@google.com --- kernel/bpf/local_storage.c | 11 +++++- tools/testing/selftests/bpf/netcnt_common.h | 38 ++++++++++++++----- .../testing/selftests/bpf/progs/netcnt_prog.c | 8 ++-- tools/testing/selftests/bpf/test_netcnt.c | 4 +- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 7ed2a14dc0de..035e9e3a7132 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,6 +1,7 @@ //SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -283,9 +284,17 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { + __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu + * is the same as other local storages. 
+ */ + if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + max_value_size = min_t(__u32, max_value_size, + PCPU_MIN_UNIT_SIZE); + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); @@ -293,7 +302,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size == 0) return ERR_PTR(-EINVAL); - if (attr->value_size > PAGE_SIZE) + if (attr->value_size > max_value_size) return ERR_PTR(-E2BIG); if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h index 81084c1c2c23..0ab1c88041cd 100644 --- a/tools/testing/selftests/bpf/netcnt_common.h +++ b/tools/testing/selftests/bpf/netcnt_common.h @@ -6,19 +6,39 @@ #define MAX_PERCPU_PACKETS 32 -struct percpu_net_cnt { - __u64 packets; - __u64 bytes; +/* sizeof(struct bpf_local_storage_elem): + * + * It really is about 128 bytes on x86_64, but allocate more to account for + * possible layout changes, different architectures, etc. + * The kernel will wrap up to PAGE_SIZE internally anyway. + */ +#define SIZEOF_BPF_LOCAL_STORAGE_ELEM 256 - __u64 prev_ts; +/* Try to estimate kernel's BPF_LOCAL_STORAGE_MAX_VALUE_SIZE: */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE (0xFFFF - \ + SIZEOF_BPF_LOCAL_STORAGE_ELEM) - __u64 prev_packets; - __u64 prev_bytes; +#define PCPU_MIN_UNIT_SIZE 32768 + +union percpu_net_cnt { + struct { + __u64 packets; + __u64 bytes; + + __u64 prev_ts; + + __u64 prev_packets; + __u64 prev_bytes; + }; + __u8 data[PCPU_MIN_UNIT_SIZE]; }; -struct net_cnt { - __u64 packets; - __u64 bytes; +union net_cnt { + struct { + __u64 packets; + __u64 bytes; + }; + __u8 data[BPF_LOCAL_STORAGE_MAX_VALUE_SIZE]; }; #endif diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c index d071adf178bd..43649bce4c54 100644 --- a/tools/testing/selftests/bpf/progs/netcnt_prog.c +++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c @@ -13,21 +13,21 @@ struct { __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct percpu_net_cnt); + __type(value, union percpu_net_cnt); } percpu_netcnt SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); __type(key, struct bpf_cgroup_storage_key); - __type(value, struct net_cnt); + __type(value, union net_cnt); } netcnt SEC(".maps"); SEC("cgroup/skb") int bpf_nextcnt(struct __sk_buff *skb) { - struct percpu_net_cnt *percpu_cnt; + union percpu_net_cnt *percpu_cnt; char fmt[] = "%d %llu %llu\n"; - struct net_cnt *cnt; + union net_cnt *cnt; __u64 ts, dt; int ret; diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c index a7b9a69f4fd5..4990a99e7381 100644 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ b/tools/testing/selftests/bpf/test_netcnt.c @@ -33,14 +33,14 @@ static int bpf_find_map(const char *test, struct bpf_object *obj, int main(int argc, char **argv) { - struct percpu_net_cnt *percpu_netcnt; + union percpu_net_cnt *percpu_netcnt; struct bpf_cgroup_storage_key key; int map_fd, percpu_map_fd; int error = EXIT_FAILURE; - struct net_cnt netcnt; struct bpf_object *obj; int prog_fd, cgroup_fd; unsigned long packets; + union net_cnt netcnt; unsigned long bytes; int cpu, nproc; __u32 prog_cnt; From 5aad03685185b5133a28e1ee1d4e98d3fd3642a3 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Tue, 27 Jul 2021 21:25:31 +0800 Subject: [PATCH 47/64] 
tools/resolve_btfids: Emit warnings and patch zero id for missing symbols Kernel functions referenced by .BTF_ids may be changed from global to static and get inlined or get renamed/removed, and thus disappears from BTF. This causes kernel build failure when resolve_btfids do id patch for symbols in .BTF_ids in vmlinux. Update resolve_btfids to emit warning messages and patch zero id for missing symbols instead of aborting kernel build process. Suggested-by: Andrii Nakryiko Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210727132532.2473636-2-hengqi.chen@gmail.com --- tools/bpf/resolve_btfids/main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 3ad9301b0f00..de6365b53c9c 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -291,7 +291,7 @@ static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh) sh->sh_addralign = expected; if (gelf_update_shdr(scn, sh) == 0) { - printf("FAILED cannot update section header: %s\n", + pr_err("FAILED cannot update section header: %s\n", elf_errmsg(-1)); return -1; } @@ -317,6 +317,7 @@ static int elf_collect(struct object *obj) elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); if (!elf) { + close(fd); pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1)); return -1; @@ -484,7 +485,7 @@ static int symbols_resolve(struct object *obj) err = libbpf_get_error(btf); if (err) { pr_err("FAILED: load BTF from %s: %s\n", - obj->path, strerror(-err)); + obj->btf ?: obj->path, strerror(-err)); return -1; } @@ -555,8 +556,7 @@ static int id_patch(struct object *obj, struct btf_id *id) int i; if (!id->id) { - pr_err("FAILED unresolved symbol %s\n", id->name); - return -EINVAL; + pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name); } for (i = 0; i < id->addr_cnt; i++) { @@ -734,8 +734,9 @@ int main(int argc, const char **argv) err = 0; out: - if (obj.efile.elf) + if (obj.efile.elf) { elf_end(obj.efile.elf); - close(obj.efile.fd); + close(obj.efile.fd); + } return err; } From d36216429ff3e69db4f6ea5e0c86b80010f5f30b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Jul 2021 11:30:25 -0700 Subject: [PATCH 48/64] bpf: Emit better log message if bpf_iter ctx arg btf_id == 0 To avoid kernel build failure due to some missing .BTF-ids referenced functions/types, the patch ([1]) tries to fill btf_id 0 for these types. In bpf verifier, for percpu variable and helper returning btf_id cases, verifier already emitted proper warning with something like verbose(env, "Helper has invalid btf_id in R%d\n", regno); verbose(env, "invalid return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); But this is not the case for bpf_iter context arguments. I hacked resolve_btfids to encode btf_id 0 for struct task_struct. With `./test_progs -n 7/5`, I got, 0: (79) r2 = *(u64 *)(r1 +0) func 'bpf_iter_task' arg0 has btf_id 29739 type STRUCT 'bpf_iter_meta' ; struct seq_file *seq = ctx->meta->seq; 1: (79) r6 = *(u64 *)(r2 +0) ; struct task_struct *task = ctx->task; 2: (79) r7 = *(u64 *)(r1 +8) ; if (task == (void *)0) { 3: (55) if r7 != 0x0 goto pc+11 ... ; BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid); 26: (61) r1 = *(u32 *)(r7 +1372) Type '(anon)' is not a struct Basically, verifier will return btf_id 0 for task_struct. 
Later on, when the code tries to access task->tgid, the verifier correctly complains the type is '(anon)' and it is not a struct. Users still need to backtrace to find out what is going on. Let us catch the invalid btf_id 0 earlier and provide better message indicating btf_id is wrong. The new error message looks like below: R1 type=ctx expected=fp ; struct seq_file *seq = ctx->meta->seq; 0: (79) r2 = *(u64 *)(r1 +0) func 'bpf_iter_task' arg0 has btf_id 29739 type STRUCT 'bpf_iter_meta' ; struct seq_file *seq = ctx->meta->seq; 1: (79) r6 = *(u64 *)(r2 +0) ; struct task_struct *task = ctx->task; 2: (79) r7 = *(u64 *)(r1 +8) invalid btf_id for context argument offset 8 invalid bpf_context access off=8 size=8 [1] https://lore.kernel.org/bpf/20210727132532.2473636-1-hengqi.chen@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210728183025.1461750-1-yhs@fb.com --- kernel/bpf/btf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7780131f710e..c395024610ed 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4825,6 +4825,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; if (ctx_arg_info->offset == off) { + if (!ctx_arg_info->btf_id) { + bpf_log(log,"invalid btf_id for context argument offset %u\n", off); + return false; + } + info->reg_type = ctx_arg_info->reg_type; info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; From 6d2d73cdd673d493f9f3751188757129b1d23fb7 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:21 +0100 Subject: [PATCH 49/64] libbpf: Return non-null error on failures in libbpf_find_prog_btf_id() Variable "err" is initialised to -EINVAL so that this error code is returned when something goes wrong in libbpf_find_prog_btf_id(). However, a recent change in the function made use of the variable in such a way that it is set to 0 if retrieving linear information on the program is successful, and this 0 value remains if we error out on failures at later stages. Let's fix this by setting err to -EINVAL later in the function. 
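The bug class is easier to see in isolation. Below is a distilled sketch, not the actual libbpf code, using hypothetical helpers step_one() and step_two(): once the earlier call overwrites the -EINVAL default with 0, every later bail-out returns "success" unless the default is re-armed:

    #include <errno.h>

    static int step_one(void) { return 0; }    /* hypothetical helper: succeeds */
    static int step_two(void) { return 0; }    /* hypothetical helper: 0 means "not found" */

    static int lookup(void)
    {
            int err = -EINVAL;

            err = step_one();        /* success overwrites the -EINVAL default with 0 */
            if (err)
                    return err;

            err = -EINVAL;           /* the fix: re-arm the default error code here */
            if (!step_two())
                    return err;      /* without the reset, this path would return 0 */

            return 0;
    }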
Fixes: e9fc3ce99b34 ("libbpf: Streamline error reporting for high-level APIs") Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210729162028.29512-2-quentin@isovalent.com --- tools/lib/bpf/libbpf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a1ca6fb0c6d8..7b2b5d261a08 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8317,7 +8317,7 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) struct bpf_prog_info_linear *info_linear; struct bpf_prog_info *info; struct btf *btf = NULL; - int err = -EINVAL; + int err; info_linear = bpf_program__get_prog_info_linear(attach_prog_fd, 0); err = libbpf_get_error(info_linear); @@ -8326,6 +8326,8 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) attach_prog_fd); return err; } + + err = -EINVAL; info = &info_linear->info; if (!info->btf_id) { pr_warn("The target program doesn't have BTF\n"); From 3c7e58590600eca3402f08e7fbdf4f2d1e36c5c8 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:22 +0100 Subject: [PATCH 50/64] libbpf: Rename btf__load() as btf__load_into_kernel() As part of the effort to move towards a v1.0 for libbpf, rename btf__load() function, used to "upload" BTF information into the kernel, as btf__load_into_kernel(). This new name better reflects what the function does. References: - https://github.com/libbpf/libbpf/issues/278 - https://github.com/libbpf/libbpf/wiki/Libbpf:-the-road-to-v1.0#btfh-apis Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210729162028.29512-3-quentin@isovalent.com --- tools/lib/bpf/btf.c | 3 ++- tools/lib/bpf/btf.h | 1 + tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/libbpf.map | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b46760b93bb4..7e0de560490e 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1180,7 +1180,7 @@ int btf__finalize_data(struct bpf_object *obj, struct btf *btf) static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); -int btf__load(struct btf *btf) +int btf__load_into_kernel(struct btf *btf) { __u32 log_buf_size = 0, raw_size; char *log_buf = NULL; @@ -1228,6 +1228,7 @@ int btf__load(struct btf *btf) free(log_buf); return libbpf_err(err); } +int btf__load(struct btf *) __attribute__((alias("btf__load_into_kernel"))); int btf__fd(const struct btf *btf) { diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 374e9f15de2e..fd8a21d936ef 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -46,6 +46,7 @@ LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_b LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); LIBBPF_API int btf__load(struct btf *btf); +LIBBPF_API int btf__load_into_kernel(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7b2b5d261a08..9a657d6d7da3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2769,7 +2769,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) */ btf__set_fd(kern_btf, 0); } else { - err = btf__load(kern_btf); + err = btf__load_into_kernel(kern_btf); } if (sanitize) { if (!err) { diff --git 
a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index c240d488eb5e..4d80eb8c56b0 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -375,6 +375,7 @@ LIBBPF_0.5.0 { bpf_map_lookup_and_delete_elem_flags; bpf_program__attach_kprobe_opts; bpf_object__gen_loader; + btf__load_into_kernel; btf_dump__dump_type_data; libbpf_set_strict_mode; } LIBBPF_0.4.0; From 6cc93e2f2c1c865acadedfea174bde893a2aa376 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:23 +0100 Subject: [PATCH 51/64] libbpf: Rename btf__get_from_id() as btf__load_from_kernel_by_id() Rename function btf__get_from_id() as btf__load_from_kernel_by_id() to better indicate what the function does. Change the new function so that, instead of requiring a pointer to the pointer to update and returning with an error code, it takes a single argument (the id of the BTF object) and returns the corresponding pointer. This is more in line with the existing constructors. The other tools calling the (soon-to-be) deprecated btf__get_from_id() function will be updated in a future commit. References: - https://github.com/libbpf/libbpf/issues/278 - https://github.com/libbpf/libbpf/wiki/Libbpf:-the-road-to-v1.0#btfh-apis Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210729162028.29512-4-quentin@isovalent.com --- tools/lib/bpf/btf.c | 25 +++++++++++++++++-------- tools/lib/bpf/btf.h | 4 +++- tools/lib/bpf/libbpf.c | 5 +++-- tools/lib/bpf/libbpf.map | 1 + 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 7e0de560490e..948c29fee447 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1383,21 +1383,30 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) return btf; } +struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + struct btf *btf; + int btf_fd; + + btf_fd = bpf_btf_get_fd_by_id(id); + if (btf_fd < 0) + return libbpf_err_ptr(-errno); + + btf = btf_get_from_fd(btf_fd, NULL); + close(btf_fd); + + return libbpf_ptr(btf); +} + int btf__get_from_id(__u32 id, struct btf **btf) { struct btf *res; - int err, btf_fd; + int err; *btf = NULL; - btf_fd = bpf_btf_get_fd_by_id(id); - if (btf_fd < 0) - return libbpf_err(-errno); - - res = btf_get_from_fd(btf_fd, NULL); + res = btf__load_from_kernel_by_id(id); err = libbpf_get_error(res); - close(btf_fd); - if (err) return libbpf_err(err); diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index fd8a21d936ef..5d955329a1f4 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -44,6 +44,9 @@ LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_b LIBBPF_API struct btf *btf__parse_raw(const char *path); LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); +LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); + LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API int btf__load_into_kernel(struct btf *btf); @@ -67,7 +70,6 @@ LIBBPF_API void btf__set_fd(struct btf *btf, int fd); LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); -LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); LIBBPF_API int 
btf__get_map_kv_tids(const struct btf *btf, const char *map_name, __u32 expected_key_size, __u32 expected_value_size, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9a657d6d7da3..313883179919 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8316,7 +8316,7 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) { struct bpf_prog_info_linear *info_linear; struct bpf_prog_info *info; - struct btf *btf = NULL; + struct btf *btf; int err; info_linear = bpf_program__get_prog_info_linear(attach_prog_fd, 0); @@ -8333,7 +8333,8 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) pr_warn("The target program doesn't have BTF\n"); goto out; } - if (btf__get_from_id(info->btf_id, &btf)) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { pr_warn("Failed to get BTF of the program\n"); goto out; } diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 4d80eb8c56b0..3a9c6939301e 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -375,6 +375,7 @@ LIBBPF_0.5.0 { bpf_map_lookup_and_delete_elem_flags; bpf_program__attach_kprobe_opts; bpf_object__gen_loader; + btf__load_from_kernel_by_id; btf__load_into_kernel; btf_dump__dump_type_data; libbpf_set_strict_mode; From 369e955b3d1c12f6ec2e51a95911bb80ada55d79 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:24 +0100 Subject: [PATCH 52/64] tools: Free BTF objects at various locations Make sure to call btf__free() (and not simply free(), which does not free all pointers stored in the struct) on pointers to struct btf objects retrieved at various locations. These were found while updating the calls to btf__get_from_id(). Fixes: 999d82cbc044 ("tools/bpf: enhance test_btf file testing to test func info") Fixes: 254471e57a86 ("tools/bpf: bpftool: add support for func types") Fixes: 7b612e291a5a ("perf tools: Synthesize PERF_RECORD_* for loaded BPF programs") Fixes: d56354dc4909 ("perf tools: Save bpf_prog_info and BTF of new BPF programs") Fixes: 47c09d6a9f67 ("bpftool: Introduce "prog profile" command") Fixes: fa853c4b839e ("perf stat: Enable counting events for BPF programs") Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210729162028.29512-5-quentin@isovalent.com --- tools/bpf/bpftool/prog.c | 5 ++++- tools/perf/util/bpf-event.c | 4 ++-- tools/perf/util/bpf_counter.c | 3 ++- tools/testing/selftests/bpf/prog_tests/btf.c | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index cc48726740ad..9d709b427665 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -781,6 +781,8 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, kernel_syms_destroy(&dd); } + btf__free(btf); + return 0; } @@ -2002,8 +2004,8 @@ static char *profile_target_name(int tgt_fd) struct bpf_prog_info_linear *info_linear; struct bpf_func_info *func_info; const struct btf_type *t; + struct btf *btf = NULL; char *name = NULL; - struct btf *btf; info_linear = bpf_program__get_prog_info_linear( tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO); @@ -2027,6 +2029,7 @@ static char *profile_target_name(int tgt_fd) } name = strdup(btf__name_by_offset(btf, t->name_off)); out: + btf__free(btf); free(info_linear); return name; } diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index cdecda1ddd36..17a9844e4fbf 100644 --- a/tools/perf/util/bpf-event.c +++ 
b/tools/perf/util/bpf-event.c @@ -296,7 +296,7 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, out: free(info_linear); - free(btf); + btf__free(btf); return err ? -1 : 0; } @@ -486,7 +486,7 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) perf_env__fetch_btf(env, btf_id, btf); out: - free(btf); + btf__free(btf); close(fd); } diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 8150e03367bb..beca55129b0b 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -64,8 +64,8 @@ static char *bpf_target_prog_name(int tgt_fd) struct bpf_prog_info_linear *info_linear; struct bpf_func_info *func_info; const struct btf_type *t; + struct btf *btf = NULL; char *name = NULL; - struct btf *btf; info_linear = bpf_program__get_prog_info_linear( tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO); @@ -89,6 +89,7 @@ static char *bpf_target_prog_name(int tgt_fd) } name = strdup(btf__name_by_offset(btf, t->name_off)); out: + btf__free(btf); free(info_linear); return name; } diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 857e3f26086f..68e415f4d33c 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4386,6 +4386,7 @@ static void do_test_file(unsigned int test_num) fprintf(stderr, "OK"); done: + btf__free(btf); free(func_info); bpf_object__close(obj); } From 86f4b7f2578f69284fa782be54e700c42c757897 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:25 +0100 Subject: [PATCH 53/64] tools: Replace btf__get_from_id() with btf__load_from_kernel_by_id() Replace the calls to function btf__get_from_id(), which we plan to deprecate before the library reaches v1.0, with calls to btf__load_from_kernel_by_id() in tools/ (bpftool, perf, selftests). Update the surrounding code accordingly (instead of passing a pointer to the btf struct, get it as a return value from the function). 
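For quick reference, a minimal sketch of the calling-convention change applied throughout tools/ (the wrapper function name is illustrative; the error check via libbpf_get_error() and the btf__free() expectation match the converted call sites):

    #include <linux/types.h>
    #include <bpf/btf.h>
    #include <bpf/libbpf.h>

    static struct btf *fetch_btf(__u32 btf_id)
    {
            struct btf *btf;

            /* old, to be deprecated:
             *
             *      struct btf *btf = NULL;
             *
             *      if (btf__get_from_id(btf_id, &btf))
             *              return NULL;
             */
            btf = btf__load_from_kernel_by_id(btf_id);
            if (libbpf_get_error(btf))
                    return NULL;

            return btf;              /* caller releases it with btf__free(btf) */
    }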
Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210729162028.29512-6-quentin@isovalent.com --- tools/bpf/bpftool/btf.c | 8 ++----- tools/bpf/bpftool/btf_dumper.c | 6 +++-- tools/bpf/bpftool/map.c | 14 ++++++------ tools/bpf/bpftool/prog.c | 24 +++++++++++++------- tools/perf/util/bpf-event.c | 7 +++--- tools/perf/util/bpf_counter.c | 9 ++++++-- tools/testing/selftests/bpf/prog_tests/btf.c | 3 ++- 7 files changed, 42 insertions(+), 29 deletions(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 385d5c955cf3..9162a18e84c0 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -580,16 +580,12 @@ static int do_dump(int argc, char **argv) } if (!btf) { - err = btf__get_from_id(btf_id, &btf); + btf = btf__load_from_kernel_by_id(btf_id); + err = libbpf_get_error(btf); if (err) { p_err("get btf by id (%u): %s", btf_id, strerror(err)); goto done; } - if (!btf) { - err = -ENOENT; - p_err("can't find btf with ID (%u)", btf_id); - goto done; - } } if (dump_c) { diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 7ca54d046362..9c25286a5c73 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -64,8 +64,10 @@ static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, } info = &prog_info->info; - if (!info->btf_id || !info->nr_func_info || - btf__get_from_id(info->btf_id, &prog_btf)) + if (!info->btf_id || !info->nr_func_info) + goto print; + prog_btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(prog_btf)) goto print; finfo = u64_to_ptr(info->func_info); func_type = btf__type_by_id(prog_btf, finfo->type_id); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 09ae0381205b..7e7f748bb0be 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -807,10 +807,11 @@ static struct btf *get_map_kv_btf(const struct bpf_map_info *info) } else if (info->btf_value_type_id) { int err; - err = btf__get_from_id(info->btf_id, &btf); - if (err || !btf) { + btf = btf__load_from_kernel_by_id(info->btf_id); + err = libbpf_get_error(btf); + if (err) { p_err("failed to get btf"); - btf = err ? 
ERR_PTR(err) : ERR_PTR(-ESRCH); + btf = ERR_PTR(err); } } @@ -1039,11 +1040,10 @@ static void print_key_value(struct bpf_map_info *info, void *key, void *value) { json_writer_t *btf_wtr; - struct btf *btf = NULL; - int err; + struct btf *btf; - err = btf__get_from_id(info->btf_id, &btf); - if (err) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { p_err("failed to get btf"); return; } diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 9d709b427665..b1996b8f1d42 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -249,10 +249,10 @@ static void show_prog_metadata(int fd, __u32 num_maps) struct bpf_map_info map_info; struct btf_var_secinfo *vsi; bool printed_header = false; - struct btf *btf = NULL; unsigned int i, vlen; void *value = NULL; const char *name; + struct btf *btf; int err; if (!num_maps) @@ -263,8 +263,8 @@ static void show_prog_metadata(int fd, __u32 num_maps) if (!value) return; - err = btf__get_from_id(map_info.btf_id, &btf); - if (err || !btf) + btf = btf__load_from_kernel_by_id(map_info.btf_id); + if (libbpf_get_error(btf)) goto out_free; t_datasec = btf__type_by_id(btf, map_info.btf_value_type_id); @@ -646,9 +646,12 @@ prog_dump(struct bpf_prog_info *info, enum dump_mode mode, member_len = info->xlated_prog_len; } - if (info->btf_id && btf__get_from_id(info->btf_id, &btf)) { - p_err("failed to get btf"); - return -1; + if (info->btf_id) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { + p_err("failed to get btf"); + return -1; + } } func_info = u64_to_ptr(info->func_info); @@ -2014,12 +2017,17 @@ static char *profile_target_name(int tgt_fd) return NULL; } - if (info_linear->info.btf_id == 0 || - btf__get_from_id(info_linear->info.btf_id, &btf)) { + if (info_linear->info.btf_id == 0) { p_err("prog FD %d doesn't have valid btf", tgt_fd); goto out; } + btf = btf__load_from_kernel_by_id(info_linear->info.btf_id); + if (libbpf_get_error(btf)) { + p_err("failed to load btf for prog FD %d", tgt_fd); + goto out; + } + func_info = u64_to_ptr(info_linear->info.func_info); t = btf__type_by_id(btf, func_info[0].type_id); if (!t) { diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 17a9844e4fbf..996d025b8ed8 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -223,10 +223,10 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, free(info_linear); return -1; } - if (btf__get_from_id(info->btf_id, &btf)) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (libbpf_get_error(btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); err = -1; - btf = NULL; goto out; } perf_env__fetch_btf(env, info->btf_id, btf); @@ -478,7 +478,8 @@ static void perf_env__add_bpf_info(struct perf_env *env, u32 id) if (btf_id == 0) goto out; - if (btf__get_from_id(btf_id, &btf)) { + btf = btf__load_from_kernel_by_id(btf_id); + if (libbpf_get_error(btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, btf_id); goto out; diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index beca55129b0b..ba0f20853651 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -74,12 +74,17 @@ static char *bpf_target_prog_name(int tgt_fd) return NULL; } - if (info_linear->info.btf_id == 0 || - btf__get_from_id(info_linear->info.btf_id, &btf)) { + if (info_linear->info.btf_id == 0) { pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd); goto 
out; } + btf = btf__load_from_kernel_by_id(info_linear->info.btf_id); + if (libbpf_get_error(btf)) { + pr_debug("failed to load btf for prog FD %d\n", tgt_fd); + goto out; + } + func_info = u64_to_ptr(info_linear->info.func_info); t = btf__type_by_id(btf, func_info[0].type_id); if (!t) { diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 68e415f4d33c..649f87382c8d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4350,7 +4350,8 @@ static void do_test_file(unsigned int test_num) goto done; } - err = btf__get_from_id(info.btf_id, &btf); + btf = btf__load_from_kernel_by_id(info.btf_id); + err = libbpf_get_error(btf); if (CHECK(err, "cannot get btf from kernel, err: %d", err)) goto done; From 61fc51b1d3e5915e356f2c0b67cd3bb13b640413 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:27 +0100 Subject: [PATCH 54/64] libbpf: Add split BTF support for btf__load_from_kernel_by_id() Add a new API function btf__load_from_kernel_by_id_split(), which takes a pointer to a base BTF object in order to support split BTF objects when retrieving BTF information from the kernel. Reference: https://github.com/libbpf/libbpf/issues/314 Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210729162028.29512-8-quentin@isovalent.com --- tools/lib/bpf/btf.c | 9 +++++++-- tools/lib/bpf/btf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 948c29fee447..cafa4f6bd9b1 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1383,7 +1383,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) return btf; } -struct btf *btf__load_from_kernel_by_id(__u32 id) +struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf) { struct btf *btf; int btf_fd; @@ -1392,12 +1392,17 @@ struct btf *btf__load_from_kernel_by_id(__u32 id) if (btf_fd < 0) return libbpf_err_ptr(-errno); - btf = btf_get_from_fd(btf_fd, NULL); + btf = btf_get_from_fd(btf_fd, base_btf); close(btf_fd); return libbpf_ptr(btf); } +struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + return btf__load_from_kernel_by_id_split(id, NULL); +} + int btf__get_from_id(__u32 id, struct btf **btf) { struct btf *res; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 5d955329a1f4..596a42c8f4f5 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -45,6 +45,7 @@ LIBBPF_API struct btf *btf__parse_raw(const char *path); LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); +LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 3a9c6939301e..5aca3686ca5e 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -376,6 +376,7 @@ LIBBPF_0.5.0 { bpf_program__attach_kprobe_opts; bpf_object__gen_loader; btf__load_from_kernel_by_id; + btf__load_from_kernel_by_id_split; btf__load_into_kernel; btf_dump__dump_type_data; libbpf_set_strict_mode; From 211ab78f7658b50ea10c4569be63ca5009fd39b4 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 29 Jul 2021 17:20:28 +0100 Subject: 
[PATCH 55/64] tools: bpftool: Support dumping split BTF by id Split BTF objects are typically BTF objects for kernel modules, which are incrementally built on top of kernel BTF instead of redefining all kernel symbols they need. We can use bpftool with its -B command-line option to dump split BTF objects. It works well when the handle provided for the BTF object to dump is a "path" to the BTF object, typically under /sys/kernel/btf, because bpftool internally calls btf__parse_split() which can take a "base_btf" pointer and resolve the BTF reconstruction (although in that case, the "-B" option is unnecessary because bpftool performs autodetection). However, it did not work so far when passing the BTF object through its id, because bpftool would call btf__get_from_id() which did not provide a way to pass a "base_btf" pointer. In other words, the following works: # bpftool btf dump file /sys/kernel/btf/i2c_smbus -B /sys/kernel/btf/vmlinux But this was not possible: # bpftool btf dump id 6 -B /sys/kernel/btf/vmlinux The libbpf API has recently changed, and btf__get_from_id() has been deprecated in favour of btf__load_from_kernel_by_id() and its version with support for split BTF, btf__load_from_kernel_by_id_split(). Let's update bpftool to make it able to dump the BTF object in the second case as well. Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210729162028.29512-9-quentin@isovalent.com --- tools/bpf/bpftool/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 9162a18e84c0..0ce3643278d4 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -580,7 +580,7 @@ static int do_dump(int argc, char **argv) } if (!btf) { - btf = btf__load_from_kernel_by_id(btf_id); + btf = btf__load_from_kernel_by_id_split(btf_id, base_btf); err = libbpf_get_error(btf); if (err) { p_err("get btf by id (%u): %s", btf_id, strerror(err)); From a710eed386f182fcbfe517b659f60024fdb7c40c Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Fri, 30 Jul 2021 19:40:12 +0800 Subject: [PATCH 56/64] libbpf: Add btf__load_vmlinux_btf/btf__load_module_btf Add two new APIs: btf__load_vmlinux_btf and btf__load_module_btf. btf__load_vmlinux_btf is just an alias to the existing API named libbpf_find_kernel_btf, rename to be more precisely and consistent with existing BTF APIs. btf__load_module_btf can be used to load module BTF, add it for completeness. These two APIs are useful for implementing tracing tools and introspection tools. This is part of the effort towards libbpf 1.0 ([0]). 
[0] Closes: https://github.com/libbpf/libbpf/issues/280 Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730114012.494408-1-hengqi.chen@gmail.com --- tools/lib/bpf/btf.c | 14 ++++++++++++-- tools/lib/bpf/btf.h | 6 ++++-- tools/lib/bpf/libbpf.c | 4 ++-- tools/lib/bpf/libbpf.map | 2 ++ 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index cafa4f6bd9b1..85de4fd50699 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4036,7 +4036,7 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) */ if (d->hypot_adjust_canon) continue; - + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) d->map[t_id] = c_id; @@ -4409,7 +4409,7 @@ static int btf_dedup_remap_types(struct btf_dedup *d) * Probe few well-known locations for vmlinux kernel image and try to load BTF * data out of it to use for target BTF. */ -struct btf *libbpf_find_kernel_btf(void) +struct btf *btf__load_vmlinux_btf(void) { struct { const char *path_fmt; @@ -4455,6 +4455,16 @@ struct btf *libbpf_find_kernel_btf(void) return libbpf_err_ptr(-ESRCH); } +struct btf *libbpf_find_kernel_btf(void) __attribute__((alias("btf__load_vmlinux_btf"))); + +struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf) +{ + char path[80]; + + snprintf(path, sizeof(path), "/sys/kernel/btf/%s", module_name); + return btf__parse_split(path, vmlinux_btf); +} + int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) { int i, n, err; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 596a42c8f4f5..4a711f990904 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -44,6 +44,10 @@ LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_b LIBBPF_API struct btf *btf__parse_raw(const char *path); LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__load_vmlinux_btf(void); +LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); @@ -93,8 +97,6 @@ int btf_ext__reloc_line_info(const struct btf *btf, LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); -LIBBPF_API struct btf *libbpf_find_kernel_btf(void); - LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 313883179919..cb106e8c42cb 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2680,7 +2680,7 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) if (!force && !obj_needs_vmlinux_btf(obj)) return 0; - obj->btf_vmlinux = libbpf_find_kernel_btf(); + obj->btf_vmlinux = btf__load_vmlinux_btf(); err = libbpf_get_error(obj->btf_vmlinux); if (err) { pr_warn("Error loading vmlinux BTF: %d\n", err); @@ -8297,7 +8297,7 @@ int libbpf_find_vmlinux_btf_id(const char *name, struct btf *btf; int err; - btf = libbpf_find_kernel_btf(); + btf = btf__load_vmlinux_btf(); err = libbpf_get_error(btf); if (err) { 
pr_warn("vmlinux BTF is not found\n"); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 5aca3686ca5e..58e0fb2c482f 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -378,6 +378,8 @@ LIBBPF_0.5.0 { btf__load_from_kernel_by_id; btf__load_from_kernel_by_id_split; btf__load_into_kernel; + btf__load_module_btf; + btf__load_vmlinux_btf; btf_dump__dump_type_data; libbpf_set_strict_mode; } LIBBPF_0.4.0; From 0b846445985895e75958ecd59061fd7bf77e0c3f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 23 Jul 2021 11:36:30 -0700 Subject: [PATCH 57/64] unix_bpf: Fix a potential deadlock in unix_dgram_bpf_recvmsg() As Eric noticed, __unix_dgram_recvmsg() may acquire u->iolock too, so we have to release it before calling this function. Fixes: 9825d866ce0d ("af_unix: Implement unix_dgram_bpf_recvmsg()") Reported-by: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: John Fastabend --- net/unix/unix_bpf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index db0cda29fb2f..177e883f451e 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -44,7 +44,7 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; - int copied, ret; + int copied; psock = sk_psock_get(sk); if (unlikely(!psock)) @@ -53,8 +53,9 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { - ret = __unix_dgram_recvmsg(sk, msg, len, flags); - goto out; + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_dgram_recvmsg(sk, msg, len, flags); } msg_bytes_ready: @@ -68,16 +69,15 @@ static int unix_dgram_bpf_recvmsg(struct sock *sk, struct msghdr *msg, if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; - ret = __unix_dgram_recvmsg(sk, msg, len, flags); - goto out; + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_dgram_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } - ret = copied; -out: mutex_unlock(&u->iolock); sk_psock_put(sk, psock); - return ret; + return copied; } static struct proto *unix_prot_saved __read_mostly; From 510b4d4c5d4cbfdeaf35e4bc6483e8afa16b0e9e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:29 +0100 Subject: [PATCH 58/64] tools: bpftool: Slightly ease bash completion updates Bash completion for bpftool gets two minor improvements in this patch. Move the detection of attach types for "bpftool cgroup attach" outside of the "case/esac" bloc, where we cannot reuse our variable holding the list of supported attach types as a pattern list. After the change, we have only one list of cgroup attach types to update when new types are added, instead of the former two lists. Also rename the variables holding lists of names for program types, map types, and attach types, to make them more unique. This can make it slightly easier to point people to the relevant variables to update, but the main objective here is to help run a script to check that bash completion is up-to-date with bpftool's source code. 
Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-2-quentin@isovalent.com --- tools/bpf/bpftool/bash-completion/bpftool | 54 ++++++++++++----------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index cc33c5824a2f..a7c947e00345 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -404,8 +404,9 @@ _bpftool() return 0 ;; 5) - COMPREPLY=( $( compgen -W 'msg_verdict stream_verdict \ - stream_parser flow_dissector' -- "$cur" ) ) + local BPFTOOL_PROG_ATTACH_TYPES='msg_verdict \ + stream_verdict stream_parser flow_dissector' + COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_ATTACH_TYPES" -- "$cur" ) ) return 0 ;; 6) @@ -464,7 +465,7 @@ _bpftool() case $prev in type) - COMPREPLY=( $( compgen -W "socket kprobe \ + local BPFTOOL_PROG_LOAD_TYPES='socket kprobe \ kretprobe classifier flow_dissector \ action tracepoint raw_tracepoint \ xdp perf_event cgroup/skb cgroup/sock \ @@ -479,8 +480,8 @@ _bpftool() cgroup/post_bind4 cgroup/post_bind6 \ cgroup/sysctl cgroup/getsockopt \ cgroup/setsockopt cgroup/sock_release struct_ops \ - fentry fexit freplace sk_lookup" -- \ - "$cur" ) ) + fentry fexit freplace sk_lookup' + COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_LOAD_TYPES" -- "$cur" ) ) return 0 ;; id) @@ -698,15 +699,15 @@ _bpftool() return 0 ;; type) - COMPREPLY=( $( compgen -W 'hash array prog_array \ - perf_event_array percpu_hash percpu_array \ - stack_trace cgroup_array lru_hash \ + local BPFTOOL_MAP_CREATE_TYPES='hash array \ + prog_array perf_event_array percpu_hash \ + percpu_array stack_trace cgroup_array lru_hash \ lru_percpu_hash lpm_trie array_of_maps \ hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ percpu_cgroup_storage queue stack sk_storage \ - struct_ops inode_storage task_storage' -- \ - "$cur" ) ) + struct_ops inode_storage task_storage' + COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 ;; key|value|flags|entries) @@ -1017,34 +1018,37 @@ _bpftool() return 0 ;; attach|detach) - local ATTACH_TYPES='ingress egress sock_create sock_ops \ - device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ + local BPFTOOL_CGROUP_ATTACH_TYPES='ingress egress \ + sock_create sock_ops device \ + bind4 bind6 post_bind4 post_bind6 connect4 connect6 \ getpeername4 getpeername6 getsockname4 getsockname6 \ sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \ setsockopt sock_release' local ATTACH_FLAGS='multi override' local PROG_TYPE='id pinned tag name' - case $prev in - $command) - _filedir - return 0 - ;; - ingress|egress|sock_create|sock_ops|device|bind4|bind6|\ - post_bind4|post_bind6|connect4|connect6|getpeername4|\ - getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\ - recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt|sock_release) + # Check for $prev = $command first + if [ $prev = $command ]; then + _filedir + return 0 + # Then check for attach type. This is done outside of the + # "case $prev in" to avoid writing the whole list of attach + # types again as pattern to match (where we cannot reuse + # our variable). + elif [[ $BPFTOOL_CGROUP_ATTACH_TYPES =~ $prev ]]; then COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ "$cur" ) ) return 0 - ;; + fi + # case/esac for the other cases + case $prev in id) _bpftool_get_prog_ids return 0 ;; *) - if ! 
_bpftool_search_list "$ATTACH_TYPES"; then - COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- \ - "$cur" ) ) + if ! _bpftool_search_list "$BPFTOOL_CGROUP_ATTACH_TYPES"; then + COMPREPLY=( $( compgen -W \ + "$BPFTOOL_CGROUP_ATTACH_TYPES" -- "$cur" ) ) elif [[ "$command" == "attach" ]]; then # We have an attach type on the command line, # but it is not the previous word, or From a2b5944fb4e05d6fe032ab130bfd1738481b892a Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:30 +0100 Subject: [PATCH 59/64] selftests/bpf: Check consistency between bpftool source, doc, completion Whenever the eBPF subsystem gains new elements, such as new program or map types, it is necessary to update bpftool if we want it able to handle the new items. In addition to the main arrays containing the names of these elements in the source code, there are also multiple locations to update: - The help message in the do_help() functions in bpftool's source code. - The RST documentation files. - The bash completion file. This has led to omissions multiple times in the past. This patch attempts to address this issue by adding consistency checks for all these different locations. It also verifies that the bpf_prog_type, bpf_map_type and bpf_attach_type enums from the UAPI BPF header have all their members present in bpftool. The script requires no argument to run, it reads and parses the different files to check, and prints the mismatches, if any. It currently reports a number of missing elements, which will be fixed in a later patch: $ ./test_bpftool_synctypes.py Comparing [...]/linux/tools/bpf/bpftool/map.c (map_type_name) and [...]/linux/tools/bpf/bpftool/bash-completion/bpftool (BPFTOOL_MAP_CREATE_TYPES): {'ringbuf'} Comparing BPF header (enum bpf_attach_type) and [...]/linux/tools/bpf/bpftool/common.c (attach_type_name): {'BPF_TRACE_ITER', 'BPF_XDP_DEVMAP', 'BPF_XDP', 'BPF_SK_REUSEPORT_SELECT', 'BPF_XDP_CPUMAP', 'BPF_SK_REUSEPORT_SELECT_OR_MIGRATE'} Comparing [...]/linux/tools/bpf/bpftool/prog.c (attach_type_strings) and [...]/linux/tools/bpf/bpftool/prog.c (do_help() ATTACH_TYPE): {'skb_verdict'} Comparing [...]/linux/tools/bpf/bpftool/prog.c (attach_type_strings) and [...]/linux/tools/bpf/bpftool/Documentation/bpftool-prog.rst (ATTACH_TYPE): {'skb_verdict'} Comparing [...]/linux/tools/bpf/bpftool/prog.c (attach_type_strings) and [...]/linux/tools/bpf/bpftool/bash-completion/bpftool (BPFTOOL_PROG_ATTACH_TYPES): {'skb_verdict'} Note that the script does NOT check for consistency between the list of program types that bpftool claims it accepts and the actual list of keywords that can be used. This is because bpftool does not "see" them, they are ELF section names parsed by libbpf. It is not hard to parse the section_defs[] array in libbpf, but some section names are associated with program types that bpftool cannot load at the moment. For example, some programs require a BTF target and an attach target that bpftool cannot handle. The script may be extended to parse the array and check only relevant values in the future. The script is not added to the selftests' Makefile, because doing so would require all patches with BPF UAPI change to also update bpftool. Instead it is to be added to the CI. 
Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-3-quentin@isovalent.com --- .../selftests/bpf/test_bpftool_synctypes.py | 486 ++++++++++++++++++ 1 file changed, 486 insertions(+) create mode 100755 tools/testing/selftests/bpf/test_bpftool_synctypes.py diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py new file mode 100755 index 000000000000..b41464f46b3b --- /dev/null +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -0,0 +1,486 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# +# Copyright (C) 2021 Isovalent, Inc. + +import argparse +import re +import os, sys + +LINUX_ROOT = os.path.abspath(os.path.join(__file__, + os.pardir, os.pardir, os.pardir, os.pardir, os.pardir)) +BPFTOOL_DIR = os.path.join(LINUX_ROOT, 'tools/bpf/bpftool') +retval = 0 + +class BlockParser(object): + """ + A parser for extracting set of values from blocks such as enums. + @reader: a pointer to the open file to parse + """ + def __init__(self, reader): + self.reader = reader + + def search_block(self, start_marker): + """ + Search for a given structure in a file. + @start_marker: regex marking the beginning of a structure to parse + """ + offset = self.reader.tell() + array_start = re.search(start_marker, self.reader.read()) + if array_start is None: + raise Exception('Failed to find start of block') + self.reader.seek(offset + array_start.start()) + + def parse(self, pattern, end_marker): + """ + Parse a block and return a set of values. Values to extract must be + on separate lines in the file. + @pattern: pattern used to identify the values to extract + @end_marker: regex marking the end of the block to parse + """ + entries = set() + while True: + line = self.reader.readline() + if not line or re.match(end_marker, line): + break + capture = pattern.search(line) + if capture and pattern.groups >= 1: + entries.add(capture.group(1)) + return entries + +class ArrayParser(BlockParser): + """ + A parser for extracting dicionaries of values from some BPF-related arrays. + @reader: a pointer to the open file to parse + @array_name: name of the array to parse + """ + end_marker = re.compile('^};') + + def __init__(self, reader, array_name): + self.array_name = array_name + self.start_marker = re.compile(f'(static )?const char \* const {self.array_name}\[.*\] = {{\n') + super().__init__(reader) + + def search_block(self): + """ + Search for the given array in a file. + """ + super().search_block(self.start_marker); + + def parse(self): + """ + Parse a block and return data as a dictionary. Items to extract must be + on separate lines in the file. + """ + pattern = re.compile('\[(BPF_\w*)\]\s*= "(.*)",?$') + entries = {} + while True: + line = self.reader.readline() + if line == '' or re.match(self.end_marker, line): + break + capture = pattern.search(line) + if capture: + entries[capture.group(1)] = capture.group(2) + return entries + +class InlineListParser(BlockParser): + """ + A parser for extracting set of values from inline lists. + """ + def parse(self, pattern, end_marker): + """ + Parse a block and return a set of values. Multiple values to extract + can be on a same line in the file. 
+ @pattern: pattern used to identify the values to extract + @end_marker: regex marking the end of the block to parse + """ + entries = set() + while True: + line = self.reader.readline() + if not line: + break + entries.update(pattern.findall(line)) + if re.search(end_marker, line): + break + return entries + +class FileExtractor(object): + """ + A generic reader for extracting data from a given file. This class contains + several helper methods that wrap arround parser objects to extract values + from different structures. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def __init__(self): + self.reader = open(self.filename, 'r') + + def close(self): + """ + Close the file used by the parser. + """ + self.reader.close() + + def reset_read(self): + """ + Reset the file position indicator for this parser. This is useful when + parsing several structures in the file without respecting the order in + which those structures appear in the file. + """ + self.reader.seek(0) + + def get_types_from_array(self, array_name): + """ + Search for and parse an array associating names to BPF_* enum members, + for example: + + const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + }; + + Return a dictionary with the enum member names as keys and the + associated names as values, for example: + + {'BPF_PROG_TYPE_UNSPEC': 'unspec', + 'BPF_PROG_TYPE_SOCKET_FILTER': 'socket_filter', + 'BPF_PROG_TYPE_KPROBE': 'kprobe'} + + @array_name: name of the array to parse + """ + array_parser = ArrayParser(self.reader, array_name) + array_parser.search_block() + return array_parser.parse() + + def get_enum(self, enum_name): + """ + Search for and parse an enum containing BPF_* members, for example: + + enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + }; + + Return a set containing all member names, for example: + + {'BPF_PROG_TYPE_UNSPEC', + 'BPF_PROG_TYPE_SOCKET_FILTER', + 'BPF_PROG_TYPE_KPROBE'} + + @enum_name: name of the enum to parse + """ + start_marker = re.compile(f'enum {enum_name} {{\n') + pattern = re.compile('^\s*(BPF_\w+),?$') + end_marker = re.compile('^};') + parser = BlockParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + + def __get_description_list(self, start_marker, pattern, end_marker): + parser = InlineListParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + + def get_rst_list(self, block_name): + """ + Search for and parse a list of type names from RST documentation, for + example: + + | *TYPE* := { + | **socket** | **kprobe** | + | **kretprobe** + | } + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the blog to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'\*{block_name}\* := {{') + pattern = re.compile('\*\*([\w/]+)\*\*') + end_marker = re.compile('}\n') + return self.__get_description_list(start_marker, pattern, end_marker) + + def get_help_list(self, block_name): + """ + Search for and parse a list of type names from a help message in + bpftool, for example: + + " TYPE := { socket | kprobe |\\n" + " kretprobe }\\n" + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the blog to parse, 'TYPE' in the example + """ + 
start_marker = re.compile(f'"\s*{block_name} := {{') + pattern = re.compile('([\w/]+) [|}]') + end_marker = re.compile('}') + return self.__get_description_list(start_marker, pattern, end_marker) + + def get_bashcomp_list(self, block_name): + """ + Search for and parse a list of type names from a variable in bash + completion file, for example: + + local BPFTOOL_PROG_LOAD_TYPES='socket kprobe \\ + kretprobe' + + Return a set containing all type names, for example: + + {'socket', 'kprobe', 'kretprobe'} + + @block_name: name of the blog to parse, 'TYPE' in the example + """ + start_marker = re.compile(f'local {block_name}=\'') + pattern = re.compile('(?:.*=\')?([\w/]+)') + end_marker = re.compile('\'$') + return self.__get_description_list(start_marker, pattern, end_marker) + +class ProgFileExtractor(FileExtractor): + """ + An extractor for bpftool's prog.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'prog.c') + + def get_prog_types(self): + return self.get_types_from_array('prog_type_name') + + def get_attach_types(self): + return self.get_types_from_array('attach_type_strings') + + def get_prog_attach_help(self): + return self.get_help_list('ATTACH_TYPE') + +class MapFileExtractor(FileExtractor): + """ + An extractor for bpftool's map.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'map.c') + + def get_map_types(self): + return self.get_types_from_array('map_type_name') + + def get_map_help(self): + return self.get_help_list('TYPE') + +class CgroupFileExtractor(FileExtractor): + """ + An extractor for bpftool's cgroup.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'cgroup.c') + + def get_prog_attach_help(self): + return self.get_help_list('ATTACH_TYPE') + +class CommonFileExtractor(FileExtractor): + """ + An extractor for bpftool's common.c. + """ + filename = os.path.join(BPFTOOL_DIR, 'common.c') + + def __init__(self): + super().__init__() + self.attach_types = {} + + def get_attach_types(self): + if not self.attach_types: + self.attach_types = self.get_types_from_array('attach_type_name') + return self.attach_types + + def get_cgroup_attach_types(self): + if not self.attach_types: + self.get_attach_types() + cgroup_types = {} + for (key, value) in self.attach_types.items(): + if key.find('BPF_CGROUP') != -1: + cgroup_types[key] = value + return cgroup_types + +class BpfHeaderExtractor(FileExtractor): + """ + An extractor for the UAPI BPF header. + """ + filename = os.path.join(LINUX_ROOT, 'tools/include/uapi/linux/bpf.h') + + def get_prog_types(self): + return self.get_enum('bpf_prog_type') + + def get_map_types(self): + return self.get_enum('bpf_map_type') + + def get_attach_types(self): + return self.get_enum('bpf_attach_type') + +class ManProgExtractor(FileExtractor): + """ + An extractor for bpftool-prog.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-prog.rst') + + def get_attach_types(self): + return self.get_rst_list('ATTACH_TYPE') + +class ManMapExtractor(FileExtractor): + """ + An extractor for bpftool-map.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-map.rst') + + def get_map_types(self): + return self.get_rst_list('TYPE') + +class ManCgroupExtractor(FileExtractor): + """ + An extractor for bpftool-cgroup.rst. + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-cgroup.rst') + + def get_attach_types(self): + return self.get_rst_list('ATTACH_TYPE') + +class BashcompExtractor(FileExtractor): + """ + An extractor for bpftool's bash completion file. 
+ """ + filename = os.path.join(BPFTOOL_DIR, 'bash-completion/bpftool') + + def get_prog_attach_types(self): + return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES') + + def get_map_types(self): + return self.get_bashcomp_list('BPFTOOL_MAP_CREATE_TYPES') + + def get_cgroup_attach_types(self): + return self.get_bashcomp_list('BPFTOOL_CGROUP_ATTACH_TYPES') + +def verify(first_set, second_set, message): + """ + Print all values that differ between two sets. + @first_set: one set to compare + @second_set: another set to compare + @message: message to print for values belonging to only one of the sets + """ + global retval + diff = first_set.symmetric_difference(second_set) + if diff: + print(message, diff) + retval = 1 + +def main(): + # No arguments supported at this time, but print usage for -h|--help + argParser = argparse.ArgumentParser(description=""" + Verify that bpftool's code, help messages, documentation and bash completion + are all in sync on program types, map types and attach types. Also check that + bpftool is in sync with the UAPI BPF header. + """) + args = argParser.parse_args() + + # Map types (enum) + + bpf_info = BpfHeaderExtractor() + ref = bpf_info.get_map_types() + + map_info = MapFileExtractor() + source_map_items = map_info.get_map_types() + map_types_enum = set(source_map_items.keys()) + + verify(ref, map_types_enum, + f'Comparing BPF header (enum bpf_map_type) and {MapFileExtractor.filename} (map_type_name):') + + # Map types (names) + + source_map_types = set(source_map_items.values()) + source_map_types.discard('unspec') + + help_map_types = map_info.get_map_help() + map_info.close() + + man_map_info = ManMapExtractor() + man_map_types = man_map_info.get_map_types() + man_map_info.close() + + bashcomp_info = BashcompExtractor() + bashcomp_map_types = bashcomp_info.get_map_types() + + verify(source_map_types, help_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {MapFileExtractor.filename} (do_help() TYPE):') + verify(source_map_types, man_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {ManMapExtractor.filename} (TYPE):') + verify(source_map_types, bashcomp_map_types, + f'Comparing {MapFileExtractor.filename} (map_type_name) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') + + # Program types (enum) + + ref = bpf_info.get_prog_types() + + prog_info = ProgFileExtractor() + prog_types = set(prog_info.get_prog_types().keys()) + + verify(ref, prog_types, + f'Comparing BPF header (enum bpf_prog_type) and {ProgFileExtractor.filename} (prog_type_name):') + + # Attach types (enum) + + ref = bpf_info.get_attach_types() + bpf_info.close() + + common_info = CommonFileExtractor() + attach_types = common_info.get_attach_types() + + verify(ref, attach_types, + f'Comparing BPF header (enum bpf_attach_type) and {CommonFileExtractor.filename} (attach_type_name):') + + # Attach types (names) + + source_prog_attach_types = set(prog_info.get_attach_types().values()) + + help_prog_attach_types = prog_info.get_prog_attach_help() + prog_info.close() + + man_prog_info = ManProgExtractor() + man_prog_attach_types = man_prog_info.get_attach_types() + man_prog_info.close() + + bashcomp_info.reset_read() # We stopped at map types, rewind + bashcomp_prog_attach_types = bashcomp_info.get_prog_attach_types() + + verify(source_prog_attach_types, help_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') + verify(source_prog_attach_types, 
man_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ManProgExtractor.filename} (ATTACH_TYPE):') + verify(source_prog_attach_types, bashcomp_prog_attach_types, + f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_PROG_ATTACH_TYPES):') + + # Cgroup attach types + + source_cgroup_attach_types = set(common_info.get_cgroup_attach_types().values()) + common_info.close() + + cgroup_info = CgroupFileExtractor() + help_cgroup_attach_types = cgroup_info.get_prog_attach_help() + cgroup_info.close() + + man_cgroup_info = ManCgroupExtractor() + man_cgroup_attach_types = man_cgroup_info.get_attach_types() + man_cgroup_info.close() + + bashcomp_cgroup_attach_types = bashcomp_info.get_cgroup_attach_types() + bashcomp_info.close() + + verify(source_cgroup_attach_types, help_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') + verify(source_cgroup_attach_types, man_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') + verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, + f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') + + sys.exit(retval) + +if __name__ == "__main__": + main() From b544342e52fc3dfcc1d430fff8d1a7a3be920ffd Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:31 +0100 Subject: [PATCH 60/64] tools: bpftool: Complete and synchronise attach or map types Update bpftool's list of attach type names to tell it about the latest attach types, or the "ringbuf" map. Also update the documentation, help messages, and bash completion when relevant. These missing items were reported by the newly added Python script used to help maintain consistency in bpftool. 
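For context, the checks performed by that script essentially reduce to symmetric set differences between the names found in the different sources. The snippet below is a simplified, self-contained illustration with hard-coded sets; it is not the script's actual code or output, and the "missing" entries are only examples:

    # Simplified illustration of the consistency check: compare the attach
    # type names declared in the UAPI header with those known to bpftool.
    # Both sets are hard-coded here purely for demonstration.
    uapi_attach_types = {'BPF_SK_LOOKUP', 'BPF_XDP', 'BPF_TRACE_ITER'}
    bpftool_attach_types = {'BPF_SK_LOOKUP'}

    missing = uapi_attach_types.symmetric_difference(bpftool_attach_types)
    if missing:
        # Entries present in only one of the two sets get reported,
        # e.g. {'BPF_XDP', 'BPF_TRACE_ITER'} in this made-up case.
        print('Comparing BPF header and bpftool:', missing)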
Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-4-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 5 +++-- tools/bpf/bpftool/common.c | 6 ++++++ tools/bpf/bpftool/prog.c | 4 ++-- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index fe1b38e7e887..abf5f4cd7d3e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -48,7 +48,7 @@ PROG COMMANDS | **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** | } | *ATTACH_TYPE* := { -| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** +| **msg_verdict** | **skb_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } | *METRICs* := { | **cycles** | **instructions** | **l1d_loads** | **llc_misses** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index a7c947e00345..1521a725f07c 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -405,7 +405,8 @@ _bpftool() ;; 5) local BPFTOOL_PROG_ATTACH_TYPES='msg_verdict \ - stream_verdict stream_parser flow_dissector' + skb_verdict stream_verdict stream_parser \ + flow_dissector' COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_ATTACH_TYPES" -- "$cur" ) ) return 0 ;; @@ -706,7 +707,7 @@ _bpftool() hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ percpu_cgroup_storage queue stack sk_storage \ - struct_ops inode_storage task_storage' + struct_ops inode_storage task_storage ringbuf' COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) return 0 ;; diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 1828bba19020..c5e57cce887a 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -67,6 +67,12 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_MODIFY_RETURN] = "mod_ret", [BPF_LSM_MAC] = "lsm_mac", [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_skb_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_skb_reuseport_select_or_migrate", }; void p_err(const char *fmt, ...) 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b1996b8f1d42..d98cfc973a1d 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2256,8 +2256,8 @@ static int do_help(int argc, char **argv) " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" " cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n" " struct_ops | fentry | fexit | freplace | sk_lookup }\n" - " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" - " flow_dissector }\n" + " ATTACH_TYPE := { msg_verdict | skb_verdict | stream_verdict |\n" + " stream_parser | flow_dissector }\n" " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS "\n" "", From c07ba629df97b796ca7bbdfbf4748266ead27745 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:32 +0100 Subject: [PATCH 61/64] tools: bpftool: Update and synchronise option list in doc and help msg All bpftool commands support the options for JSON output and debug from libbpf. In addition, some commands support additional options corresponding to specific use cases. The lists of options described in the man pages for the different commands are not always accurate. The messages for interactive help are mostly limited to HELP_SPEC_OPTIONS, and are even less representative of the actual set of options supported for the commands. Let's update the lists: - HELP_SPEC_OPTIONS is modified to contain the "default" options (JSON and debug), and to be extensible (no ending curly bracket). - All commands use HELP_SPEC_OPTIONS in their help message, and then complete the list with their specific options. - The lists of options in the man pages are updated. - The formatting of the list for bpftool.rst is adjusted to match formatting for the other man pages. This is for consistency, and also because it will be helpful in a future patch to automatically check that the files are synchronised.
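To make the new scheme concrete, here is a small Python sketch of how a command's OPTIONS line is now assembled: a shared, unterminated prefix (playing the role of the new HELP_SPEC_OPTIONS macro) carries only the default options, and each command appends its specific options plus the closing brace. The helper name and exact spacing below are illustrative, not bpftool code:

    # Shared prefix, equivalent in spirit to the new HELP_SPEC_OPTIONS:
    # default options only, no closing brace, so it stays extensible.
    HELP_SPEC_OPTIONS = "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug}"

    def prog_options_line():
        # "bpftool prog" completes the list with its specific options
        # before closing it, as its do_help() now does in C.
        return (HELP_SPEC_OPTIONS + " |\n"
                "           {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} }")

    print(prog_options_line())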
Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-5-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 3 ++- tools/bpf/bpftool/Documentation/bpftool-feature.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-gen.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-iter.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-link.rst | 3 ++- tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++- tools/bpf/bpftool/Documentation/bpftool-net.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-perf.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 3 ++- .../bpf/bpftool/Documentation/bpftool-struct_ops.rst | 2 +- tools/bpf/bpftool/Documentation/bpftool.rst | 12 ++++++------ tools/bpf/bpftool/btf.c | 2 +- tools/bpf/bpftool/cgroup.c | 3 ++- tools/bpf/bpftool/feature.c | 1 + tools/bpf/bpftool/gen.c | 2 +- tools/bpf/bpftool/iter.c | 2 ++ tools/bpf/bpftool/link.c | 3 ++- tools/bpf/bpftool/main.c | 3 ++- tools/bpf/bpftool/main.h | 3 +-- tools/bpf/bpftool/map.c | 5 +++-- tools/bpf/bpftool/net.c | 1 + tools/bpf/bpftool/perf.c | 5 ++++- tools/bpf/bpftool/prog.c | 3 ++- tools/bpf/bpftool/struct_ops.c | 2 +- 25 files changed, 45 insertions(+), 28 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index ff4d327a582e..1d37f3809842 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | {**-d** | **--debug** } } *COMMANDS* := { **dump** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index baee8591ac76..3e4395eede4f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } } *COMMANDS* := { **show** | **list** | **tree** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index dd3771bdbc57..ab9f57ee4c3a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **probe** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 7cd6681137f3..709b93fe1da3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } 
*COMMAND* := { **object** | **skeleton** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 51f49bead619..471f363a725a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -12,6 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **iter** *COMMAND* + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *COMMANDS* := { **pin** | **help** } ITER COMMANDS diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 5f7db2a837cc..0de90f086238 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **pin** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 3d52256ba75f..d0c4abe08aba 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index d8165d530937..1ae0375e8fea 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index e958ce91de72..ce52798a917d 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **show** | **list** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index abf5f4cd7d3e..4b8412fe2c60 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-f** | **--bpffs** } | { 
**-m** | **--mapcompat** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 506e70ee78e9..02afc0fc14cb 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -12,7 +12,7 @@ SYNOPSIS **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index e7d949334961..bb23f55bb05a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -18,15 +18,15 @@ SYNOPSIS *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** } - *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } - | { **-j** | **--json** } [{ **-p** | **--pretty** }] } + *OPTIONS* := { { **-V** | **--version** } | + { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } *MAP-COMMANDS* := - { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** - | **delete** | **pin** | **event_pipe** | **help** } + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | + **delete** | **pin** | **event_pipe** | **help** } - *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** - | **load** | **attach** | **detach** | **help** } + *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | + **load** | **attach** | **detach** | **help** } *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 0ce3643278d4..3c5fc9b25c30 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -981,7 +981,7 @@ static int do_help(int argc, char **argv) " FORMAT := { raw | c }\n" " " HELP_SPEC_MAP "\n" " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, "btf"); diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index 6e53b1d393f4..c42f437a1015 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -501,7 +501,8 @@ static int do_help(int argc, char **argv) HELP_SPEC_ATTACH_TYPES "\n" " " HELP_SPEC_ATTACH_FLAGS "\n" " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} } "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 40a88df275f9..7f36385aa9e2 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -1005,6 +1005,7 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " COMPONENT := { kernel | dev NAME }\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 1d71ff8c52fa..d4225f7fbcee 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1026,7 +1026,7 @@ static int do_help(int argc, char **argv) " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s help\n" "\n" - " " HELP_SPEC_OPTIONS 
"\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, "gen"); diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 3b1aad7535dd..84a9b01d956d 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -97,7 +97,9 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s pin OBJ PATH [map MAP]\n" " %1$s %2$s help\n" + "\n" " " HELP_SPEC_MAP "\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, "iter"); diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index e77e1525d20a..8cc3e36f8cc6 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -401,7 +401,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " " HELP_SPEC_LINK "\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-n|--nomount} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 3ddfd4843738..02eaaf065f65 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -64,7 +64,8 @@ static int do_help(int argc, char **argv) " %s version\n" "\n" " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-V|--version} }\n" "", bin_name, bin_name, bin_name); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index c1cf29798b99..90caa42aac4c 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -57,8 +57,7 @@ static inline void *u64_to_ptr(__u64 ptr) #define HELP_SPEC_PROGRAM \ "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG | name PROG_NAME }" #define HELP_SPEC_OPTIONS \ - "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-f|--bpffs} |\n" \ - "\t {-m|--mapcompat} | {-n|--nomount} }" + "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug}" #define HELP_SPEC_MAP \ "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }" #define HELP_SPEC_LINK \ diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 7e7f748bb0be..407071d54ab1 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1466,8 +1466,9 @@ static int do_help(int argc, char **argv) " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n" - " task_storage }\n" - " " HELP_SPEC_OPTIONS "\n" + " task_storage }\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-n|--nomount} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index f836d115d7d6..649053704bd7 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -729,6 +729,7 @@ static int do_help(int argc, char **argv) "\n" " " HELP_SPEC_PROGRAM "\n" " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" + " " HELP_SPEC_OPTIONS " }\n" "\n" "Note: Only xdp and tc attachments are supported now.\n" " For progs attached to cgroups, use \"bpftool cgroup\"\n" diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c index ad23934819c7..50de087b0db7 100644 --- a/tools/bpf/bpftool/perf.c +++ b/tools/bpf/bpftool/perf.c @@ -231,7 +231,10 @@ static int do_show(int argc, char **argv) static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %1$s %2$s { show | list | help }\n" + "Usage: %1$s %2$s { show | list }\n" + " %1$s %2$s help }\n" + "\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 
d98cfc973a1d..a205f7124b38 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2259,7 +2259,8 @@ static int do_help(int argc, char **argv) " ATTACH_TYPE := { msg_verdict | skb_verdict | stream_verdict |\n" " stream_parser | flow_dissector }\n" " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" - " " HELP_SPEC_OPTIONS "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index b58b91f62ffb..ab2d2290569a 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -572,8 +572,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s unregister STRUCT_OPS_MAP\n" " %1$s %2$s help\n" "\n" - " OPTIONS := { {-j|--json} [{-p|--pretty}] }\n" " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n" + " " HELP_SPEC_OPTIONS " }\n" "", bin_name, argv[-2]); From da87772f086fc40e0ced9deb939400b09a182194 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:33 +0100 Subject: [PATCH 62/64] selftests/bpf: Update bpftool's consistency script for checking options Update the script responsible for checking that the different types used at various places in bpftool are synchronised, and extend it to check the consistency of options between the help messages in the source code and the manual pages. Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-6-quentin@isovalent.com --- .../selftests/bpf/test_bpftool_synctypes.py | 122 ++++++++++++++++-- 1 file changed, 111 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index b41464f46b3b..be54b7335a76 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -200,7 +200,7 @@ class FileExtractor(object): @block_name: name of the blog to parse, 'TYPE' in the example """ start_marker = re.compile(f'\*{block_name}\* := {{') - pattern = re.compile('\*\*([\w/]+)\*\*') + pattern = re.compile('\*\*([\w/-]+)\*\*') end_marker = re.compile('}\n') return self.__get_description_list(start_marker, pattern, end_marker) @@ -223,6 +223,31 @@ class FileExtractor(object): end_marker = re.compile('}') return self.__get_description_list(start_marker, pattern, end_marker) + def get_help_list_macro(self, macro): + """ + Search for and parse a list of values from a help message starting with + a macro in bpftool, for example: + + " " HELP_SPEC_OPTIONS " |\\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} }\\n" + + Return a set containing all item names, for example: + + {'-f', '--bpffs', '-m', '--mapcompat', '-n', '--nomount'} + + @macro: macro starting the block, 'HELP_SPEC_OPTIONS' in the example + """ + start_marker = re.compile(f'"\s*{macro}\s*" [|}}]') + pattern = re.compile('([\w-]+) ?(?:\||}[ }\]])') + end_marker = re.compile('}\\\\n') + return self.__get_description_list(start_marker, pattern, end_marker) + + def default_options(self): + """ + Return the default options contained in HELP_SPEC_OPTIONS + """ + return { '-j', '--json', '-p', '--pretty', '-d', '--debug' } + def get_bashcomp_list(self, block_name): """ Search for and parse a list of type names from a variable in bash @@ -242,7 +267,16 @@ class FileExtractor(object): end_marker = re.compile('\'$') return 
self.__get_description_list(start_marker, pattern, end_marker) -class ProgFileExtractor(FileExtractor): +class SourceFileExtractor(FileExtractor): + """ + An abstract extractor for a source file with usage message. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def get_options(self): + return self.default_options().union(self.get_help_list_macro('HELP_SPEC_OPTIONS')) + +class ProgFileExtractor(SourceFileExtractor): """ An extractor for bpftool's prog.c. """ @@ -257,7 +291,7 @@ class ProgFileExtractor(FileExtractor): def get_prog_attach_help(self): return self.get_help_list('ATTACH_TYPE') -class MapFileExtractor(FileExtractor): +class MapFileExtractor(SourceFileExtractor): """ An extractor for bpftool's map.c. """ @@ -269,7 +303,7 @@ class MapFileExtractor(FileExtractor): def get_map_help(self): return self.get_help_list('TYPE') -class CgroupFileExtractor(FileExtractor): +class CgroupFileExtractor(SourceFileExtractor): """ An extractor for bpftool's cgroup.c. """ @@ -278,7 +312,7 @@ class CgroupFileExtractor(FileExtractor): def get_prog_attach_help(self): return self.get_help_list('ATTACH_TYPE') -class CommonFileExtractor(FileExtractor): +class CommonFileExtractor(SourceFileExtractor): """ An extractor for bpftool's common.c. """ @@ -302,6 +336,16 @@ class CommonFileExtractor(FileExtractor): cgroup_types[key] = value return cgroup_types +class GenericSourceExtractor(SourceFileExtractor): + """ + An extractor for generic source code files. + """ + filename = "" + + def __init__(self, filename): + self.filename = os.path.join(BPFTOOL_DIR, filename) + super().__init__() + class BpfHeaderExtractor(FileExtractor): """ An extractor for the UAPI BPF header. @@ -317,7 +361,16 @@ class BpfHeaderExtractor(FileExtractor): def get_attach_types(self): return self.get_enum('bpf_attach_type') -class ManProgExtractor(FileExtractor): +class ManPageExtractor(FileExtractor): + """ + An abstract extractor for an RST documentation page. + This class does not offer a way to set a filename, which is expected to be + defined in children classes. + """ + def get_options(self): + return self.get_rst_list('OPTIONS') + +class ManProgExtractor(ManPageExtractor): """ An extractor for bpftool-prog.rst. """ @@ -326,7 +379,7 @@ class ManProgExtractor(FileExtractor): def get_attach_types(self): return self.get_rst_list('ATTACH_TYPE') -class ManMapExtractor(FileExtractor): +class ManMapExtractor(ManPageExtractor): """ An extractor for bpftool-map.rst. """ @@ -335,7 +388,7 @@ class ManMapExtractor(FileExtractor): def get_map_types(self): return self.get_rst_list('TYPE') -class ManCgroupExtractor(FileExtractor): +class ManCgroupExtractor(ManPageExtractor): """ An extractor for bpftool-cgroup.rst. """ @@ -344,6 +397,16 @@ class ManCgroupExtractor(FileExtractor): def get_attach_types(self): return self.get_rst_list('ATTACH_TYPE') +class ManGenericExtractor(ManPageExtractor): + """ + An extractor for generic RST documentation pages. + """ + filename = "" + + def __init__(self, filename): + self.filename = os.path.join(BPFTOOL_DIR, filename) + super().__init__() + class BashcompExtractor(FileExtractor): """ An extractor for bpftool's bash completion file. 
@@ -375,9 +438,9 @@ def verify(first_set, second_set, message): def main(): # No arguments supported at this time, but print usage for -h|--help argParser = argparse.ArgumentParser(description=""" - Verify that bpftool's code, help messages, documentation and bash completion - are all in sync on program types, map types and attach types. Also check that - bpftool is in sync with the UAPI BPF header. + Verify that bpftool's code, help messages, documentation and bash + completion are all in sync on program types, map types, attach types, and + options. Also check that bpftool is in sync with the UAPI BPF header. """) args = argParser.parse_args() @@ -399,9 +462,11 @@ def main(): source_map_types.discard('unspec') help_map_types = map_info.get_map_help() + help_map_options = map_info.get_options() map_info.close() man_map_info = ManMapExtractor() + man_map_options = man_map_info.get_options() man_map_types = man_map_info.get_map_types() man_map_info.close() @@ -412,6 +477,8 @@ def main(): f'Comparing {MapFileExtractor.filename} (map_type_name) and {MapFileExtractor.filename} (do_help() TYPE):') verify(source_map_types, man_map_types, f'Comparing {MapFileExtractor.filename} (map_type_name) and {ManMapExtractor.filename} (TYPE):') + verify(help_map_options, man_map_options, + f'Comparing {MapFileExtractor.filename} (do_help() OPTIONS) and {ManMapExtractor.filename} (OPTIONS):') verify(source_map_types, bashcomp_map_types, f'Comparing {MapFileExtractor.filename} (map_type_name) and {BashcompExtractor.filename} (BPFTOOL_MAP_CREATE_TYPES):') @@ -441,9 +508,11 @@ def main(): source_prog_attach_types = set(prog_info.get_attach_types().values()) help_prog_attach_types = prog_info.get_prog_attach_help() + help_prog_options = prog_info.get_options() prog_info.close() man_prog_info = ManProgExtractor() + man_prog_options = man_prog_info.get_options() man_prog_attach_types = man_prog_info.get_attach_types() man_prog_info.close() @@ -454,6 +523,8 @@ def main(): f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ProgFileExtractor.filename} (do_help() ATTACH_TYPE):') verify(source_prog_attach_types, man_prog_attach_types, f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {ManProgExtractor.filename} (ATTACH_TYPE):') + verify(help_prog_options, man_prog_options, + f'Comparing {ProgFileExtractor.filename} (do_help() OPTIONS) and {ManProgExtractor.filename} (OPTIONS):') verify(source_prog_attach_types, bashcomp_prog_attach_types, f'Comparing {ProgFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_PROG_ATTACH_TYPES):') @@ -464,9 +535,11 @@ def main(): cgroup_info = CgroupFileExtractor() help_cgroup_attach_types = cgroup_info.get_prog_attach_help() + help_cgroup_options = cgroup_info.get_options() cgroup_info.close() man_cgroup_info = ManCgroupExtractor() + man_cgroup_options = man_cgroup_info.get_options() man_cgroup_attach_types = man_cgroup_info.get_attach_types() man_cgroup_info.close() @@ -477,9 +550,36 @@ def main(): f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {CgroupFileExtractor.filename} (do_help() ATTACH_TYPE):') verify(source_cgroup_attach_types, man_cgroup_attach_types, f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {ManCgroupExtractor.filename} (ATTACH_TYPE):') + verify(help_cgroup_options, man_cgroup_options, + f'Comparing {CgroupFileExtractor.filename} (do_help() OPTIONS) and {ManCgroupExtractor.filename} (OPTIONS):') verify(source_cgroup_attach_types, bashcomp_cgroup_attach_types, 
f'Comparing {CommonFileExtractor.filename} (attach_type_strings) and {BashcompExtractor.filename} (BPFTOOL_CGROUP_ATTACH_TYPES):') + # Options for remaining commands + + for cmd in [ 'btf', 'feature', 'gen', 'iter', 'link', 'net', 'perf', 'struct_ops', ]: + source_info = GenericSourceExtractor(cmd + '.c') + help_cmd_options = source_info.get_options() + source_info.close() + + man_cmd_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool-' + cmd + '.rst')) + man_cmd_options = man_cmd_info.get_options() + man_cmd_info.close() + + verify(help_cmd_options, man_cmd_options, + f'Comparing {source_info.filename} (do_help() OPTIONS) and {man_cmd_info.filename} (OPTIONS):') + + source_main_info = GenericSourceExtractor('main.c') + help_main_options = source_main_info.get_options() + source_main_info.close() + + man_main_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool.rst')) + man_main_options = man_main_info.get_options() + man_main_info.close() + + verify(help_main_options, man_main_options, + f'Comparing {source_main_info.filename} (do_help() OPTIONS) and {man_main_info.filename} (OPTIONS):') + sys.exit(retval) if __name__ == "__main__": From 8cc8c6357c8fa763c650f1bddb69871a254f427c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:34 +0100 Subject: [PATCH 63/64] tools: bpftool: Document and add bash completion for -L, -B options The -L|--use-loader option for using loader programs when loading, or when generating a skeleton, did not have any documentation or bash completion. Same thing goes for -B|--base-btf, used to pass a path to a base BTF object for split BTF such as BTF for kernel modules. This patch documents and adds bash completion for those options. Fixes: 75fa1777694c ("tools/bpftool: Add bpftool support for split BTF") Fixes: d510296d331a ("bpftool: Use syscall/loader program in "prog load" and "gen skeleton" command.") Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-7-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-btf.rst | 48 ++++++++++++++++++- .../bpf/bpftool/Documentation/bpftool-gen.rst | 9 +++- .../bpftool/Documentation/bpftool-prog.rst | 30 +++++++++++- tools/bpf/bpftool/bash-completion/bpftool | 8 ++-- tools/bpf/bpftool/btf.c | 3 +- tools/bpf/bpftool/cgroup.c | 2 +- tools/bpf/bpftool/gen.c | 3 +- tools/bpf/bpftool/prog.c | 3 +- 8 files changed, 96 insertions(+), 10 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 1d37f3809842..88b28aa7431f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | {**-d** | **--debug** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | {**-d** | **--debug** } | + { **-B** | **--base-btf** } } *COMMANDS* := { **dump** | **help** } @@ -73,6 +74,20 @@ OPTIONS ======= .. include:: common_options.rst + -B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used + with BTF objects for kernel modules. To avoid duplicating + all kernel symbols required by modules, BTF objects for + modules are "split", they are built incrementally on top of + the kernel (vmlinux) BTF object. So the base BTF reference + should usually point to the kernel BTF. 
+ + When the main BTF object to process (for example, the + module BTF to dump) is passed as a *FILE*, bpftool attempts + to autodetect the path for the base object, and passing + this option is optional. When the main BTF object is passed + through other handles, this option becomes necessary. + EXAMPLES ======== **# bpftool btf dump id 1226** @@ -217,3 +232,34 @@ All the standard ways to specify map or program are supported: **# bpftool btf dump prog tag b88e0a09b1d9759d** **# bpftool btf dump prog pinned /sys/fs/bpf/prog_name** + +| +| **# bpftool btf dump file /sys/kernel/btf/i2c_smbus** +| (or) +| **# I2C_SMBUS_ID=$(bpftool btf show -p | jq '.[] | select(.name=="i2c_smbus").id')** +| **# bpftool btf dump id ${I2C_SMBUS_ID} -B /sys/kernel/btf/vmlinux** + +:: + + [104848] STRUCT 'i2c_smbus_alert' size=40 vlen=2 + 'alert' type_id=393 bits_offset=0 + 'ara' type_id=56050 bits_offset=256 + [104849] STRUCT 'alert_data' size=12 vlen=3 + 'addr' type_id=16 bits_offset=0 + 'type' type_id=56053 bits_offset=32 + 'data' type_id=7 bits_offset=64 + [104850] PTR '(anon)' type_id=104848 + [104851] PTR '(anon)' type_id=104849 + [104852] FUNC 'i2c_register_spd' type_id=84745 linkage=static + [104853] FUNC 'smbalert_driver_init' type_id=1213 linkage=static + [104854] FUNC_PROTO '(anon)' ret_type_id=18 vlen=1 + 'ara' type_id=56050 + [104855] FUNC 'i2c_handle_smbus_alert' type_id=104854 linkage=static + [104856] FUNC 'smbalert_remove' type_id=104854 linkage=static + [104857] FUNC_PROTO '(anon)' ret_type_id=18 vlen=2 + 'ara' type_id=56050 + 'id' type_id=56056 + [104858] FUNC 'smbalert_probe' type_id=104857 linkage=static + [104859] FUNC 'smbalert_work' type_id=9695 linkage=static + [104860] FUNC 'smbus_alert' type_id=71367 linkage=static + [104861] FUNC 'smbus_do_alert' type_id=84827 linkage=static diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 709b93fe1da3..2ef2f2df0279 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -12,7 +12,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + { **-L** | **--use-loader** } } *COMMAND* := { **object** | **skeleton** | **help** } @@ -152,6 +153,12 @@ OPTIONS ======= .. include:: common_options.rst + -L, --use-loader + For skeletons, generate a "light" skeleton (also known as "loader" + skeleton). A light skeleton contains a loader eBPF program. It does + not use the majority of the libbpf infrastructure, and does not need + libelf. 
+ EXAMPLES ======== **$ cat example1.bpf.c** diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 4b8412fe2c60..2ea5df30ff21 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -13,7 +13,8 @@ SYNOPSIS **bpftool** [*OPTIONS*] **prog** *COMMAND* *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | - { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } } + { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | + { **-L** | **--use-loader** } } *COMMANDS* := { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** @@ -224,6 +225,20 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. + -L, --use-loader + Load program as a "loader" program. This is useful to debug + the generation of such programs. When this option is in + use, bpftool attempts to load the programs from the object + file into the kernel, but does not pin them (therefore, the + *PATH* must not be provided). + + When combined with the **-d**\ \|\ **--debug** option, + additional debug messages are generated, and the execution + of the loader program will use the **bpf_trace_printk**\ () + helper to log each step of loading BTF, creating the maps, + and loading the programs (see **bpftool prog tracelog** as + a way to dump those messages). + EXAMPLES ======== **# bpftool prog show** @@ -327,3 +342,16 @@ EXAMPLES 40176203 cycles (83.05%) 42518139 instructions # 1.06 insns per cycle (83.39%) 123 llc_misses # 2.89 LLC misses per million insns (83.15%) + +| +| Output below is for the trace logs. +| Run in separate terminals: +| **# bpftool prog tracelog** +| **# bpftool prog load -L -d file.o** + +:: + + bpftool-620059 [004] d... 2634685.517903: bpf_trace_printk: btf_load size 665 r=5 + bpftool-620059 [004] d... 2634685.517912: bpf_trace_printk: map_create sample_map idx 0 type 2 value_size 4 value_btf_id 0 r=6 + bpftool-620059 [004] d... 2634685.517997: bpf_trace_printk: prog_load sample insn_cnt 13 r=7 + bpftool-620059 [004] d... 2634685.517999: bpf_trace_printk: close(5) = 0 diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 1521a725f07c..134135424e7f 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -260,7 +260,8 @@ _bpftool() # Deal with options if [[ ${words[cword]} == -* ]]; then - local c='--version --json --pretty --bpffs --mapcompat --debug' + local c='--version --json --pretty --bpffs --mapcompat --debug \ + --use-loader --base-btf' COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) return 0 fi @@ -278,7 +279,7 @@ _bpftool() _sysfs_get_netdevs return 0 ;; - file|pinned) + file|pinned|-B|--base-btf) _filedir return 0 ;; @@ -291,7 +292,8 @@ _bpftool() # Remove all options so completions don't have to deal with them. 
local i for (( i=1; i < ${#words[@]}; )); do - if [[ ${words[i]::1} == - ]]; then + if [[ ${words[i]::1} == - ]] && + [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then words=( "${words[@]:0:i}" "${words[@]:i+1}" ) [[ $i -le $cword ]] && cword=$(( cword - 1 )) else diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 3c5fc9b25c30..f7e5ff3586c9 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -981,7 +981,8 @@ static int do_help(int argc, char **argv) " FORMAT := { raw | c }\n" " " HELP_SPEC_MAP "\n" " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS " }\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-B|--base-btf} }\n" "", bin_name, "btf"); diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c index c42f437a1015..3571a281c43f 100644 --- a/tools/bpf/bpftool/cgroup.c +++ b/tools/bpf/bpftool/cgroup.c @@ -502,7 +502,7 @@ static int do_help(int argc, char **argv) " " HELP_SPEC_ATTACH_FLAGS "\n" " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS " |\n" - " {-f|--bpffs} } + " {-f|--bpffs} }\n" "", bin_name, argv[-2]); diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index d4225f7fbcee..d40d92bbf0e4 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -1026,7 +1026,8 @@ static int do_help(int argc, char **argv) " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s help\n" "\n" - " " HELP_SPEC_OPTIONS " }\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-L|--use-loader} }\n" "", bin_name, "gen"); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index a205f7124b38..9c3e343b7d87 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -2260,7 +2260,8 @@ static int do_help(int argc, char **argv) " stream_parser | flow_dissector }\n" " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" " " HELP_SPEC_OPTIONS " |\n" - " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} }\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" + " {-L|--use-loader} }\n" "", bin_name, argv[-2]); From 475a23c2c15fb2af76a27f89f632a48c03166f45 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 30 Jul 2021 22:54:35 +0100 Subject: [PATCH 64/64] tools: bpftool: Complete metrics list in "bpftool prog profile" doc Profiling programs with bpftool was extended some time ago to support two new metrics, namely itlb_misses and dtlb_misses (misses for the instruction/data translation lookaside buffer). Update the manual page and bash completion accordingly. 
Fixes: 450d060e8f75 ("bpftool: Add {i,d}tlb_misses support for bpftool profile") Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210730215435.7095-8-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 3 ++- tools/bpf/bpftool/bash-completion/bpftool | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 2ea5df30ff21..91608cb7e44a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -53,7 +53,8 @@ PROG COMMANDS | **msg_verdict** | **skb_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } | *METRICs* := { -| **cycles** | **instructions** | **l1d_loads** | **llc_misses** +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** | } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 134135424e7f..88e2bcf16cca 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -345,7 +345,8 @@ _bpftool() local PROG_TYPE='id pinned tag name' local MAP_TYPE='id pinned name' - local METRIC_TYPE='cycles instructions l1d_loads llc_misses' + local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ + itlb_misses dtlb_misses' case $command in show|list) [[ $prev != "$command" ]] && return 0