From e47d0bf800e8d7f4de501987b2788c7f2ce22cd1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Nov 2021 16:04:48 -0800 Subject: [PATCH 001/115] bpftool: Add current libbpf_strict mode to version output + bpftool --legacy --version bpftool v5.15.0 features: libbfd, skeletons + bpftool --version bpftool v5.15.0 features: libbfd, libbpf_strict, skeletons + bpftool --legacy --help Usage: bpftool [OPTIONS] OBJECT { COMMAND | help } bpftool batch file FILE bpftool version OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter } OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} | {-V|--version} } + bpftool --help Usage: bpftool [OPTIONS] OBJECT { COMMAND | help } bpftool batch file FILE bpftool version OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter } OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} | {-V|--version} } + bpftool --legacy Usage: bpftool [OPTIONS] OBJECT { COMMAND | help } bpftool batch file FILE bpftool version OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter } OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} | {-V|--version} } + bpftool Usage: bpftool [OPTIONS] OBJECT { COMMAND | help } bpftool batch file FILE bpftool version OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter } OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy} | {-V|--version} } + bpftool --legacy version bpftool v5.15.0 features: libbfd, skeletons + bpftool version bpftool v5.15.0 features: libbfd, libbpf_strict, skeletons + bpftool --json --legacy version {"version":"5.15.0","features":{"libbfd":true,"libbpf_strict":false,"skeletons":true}} + bpftool --json version {"version":"5.15.0","features":{"libbfd":true,"libbpf_strict":true,"skeletons":true}} Suggested-by: Quentin Monnet Signed-off-by: Stanislav Fomichev 
Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20211116000448.2918854-1-sdf@google.com --- tools/bpf/bpftool/main.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index 473791e87f7d..8b71500e7cb2 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -93,6 +93,7 @@ static int do_version(int argc, char **argv) jsonw_name(json_wtr, "features"); jsonw_start_object(json_wtr); /* features */ jsonw_bool_field(json_wtr, "libbfd", has_libbfd); + jsonw_bool_field(json_wtr, "libbpf_strict", !legacy_libbpf); jsonw_bool_field(json_wtr, "skeletons", has_skeletons); jsonw_end_object(json_wtr); /* features */ @@ -106,6 +107,10 @@ static int do_version(int argc, char **argv) printf(" libbfd"); nb_features++; } + if (!legacy_libbpf) { + printf("%s libbpf_strict", nb_features++ ? "," : ""); + nb_features++; + } if (has_skeletons) printf("%s skeletons", nb_features++ ? "," : ""); printf("\n"); @@ -400,6 +405,7 @@ int main(int argc, char **argv) { "legacy", no_argument, NULL, 'l' }, { 0 } }; + bool version_requested = false; int opt, ret; last_do_help = do_help; @@ -414,7 +420,8 @@ int main(int argc, char **argv) options, NULL)) >= 0) { switch (opt) { case 'V': - return do_version(argc, argv); + version_requested = true; + break; case 'h': return do_help(argc, argv); case 'p': @@ -479,6 +486,9 @@ int main(int argc, char **argv) if (argc < 0) usage(); + if (version_requested) + return do_version(argc, argv); + ret = cmd_select(cmds, argc, argv, do_help); if (json_output) From 69a055d546156adc6f7727ec981f721d5ba9231a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 15 Nov 2021 08:39:37 -0800 Subject: [PATCH 002/115] libbpf: Fix a couple of missed btf_type_tag handling in btf.c Commit 2dc1e488e5cd ("libbpf: Support BTF_KIND_TYPE_TAG") added the BTF_KIND_TYPE_TAG support. But to test vmlinux build with ... 
#define __user __attribute__((btf_type_tag("user"))) ... I needed to sync libbpf repo and manually copy libbpf sources to pahole. To simplify process, I used BTF_KIND_RESTRICT to simulate the BTF_KIND_TYPE_TAG with vmlinux build as "restrict" modifier is barely used in kernel. But this approach missed one case in dedup with structures where BTF_KIND_RESTRICT is handled and BTF_KIND_TYPE_TAG is not handled in btf_dedup_is_equiv(), and this will result in a pahole dedup failure. This patch fixed this issue and a selftest is added in the subsequent patch to test this scenario. The other missed handling is in btf__resolve_size(). Currently the compiler always emit like PTR->TYPE_TAG->... so in practice we don't hit the missing BTF_KIND_TYPE_TAG handling issue with compiler generated code. But lets add case BTF_KIND_TYPE_TAG in the switch statement to be future proof. Fixes: 2dc1e488e5cd ("libbpf: Support BTF_KIND_TYPE_TAG") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211115163937.3922235-1-yhs@fb.com --- tools/lib/bpf/btf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index fadf089ae8fe..b6be579e0dc6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -610,6 +610,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_RESTRICT: case BTF_KIND_VAR: case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: type_id = t->type; break; case BTF_KIND_ARRAY: @@ -4023,6 +4024,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: if (cand_type->info != canon_type->info) return 0; return btf_dedup_is_equiv(d, cand_type->type, canon_type->type); From 4746158305e98c91c479539d53ef9bf8c520dd66 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 15 Nov 2021 08:39:43 -0800 Subject: [PATCH 003/115] selftests/bpf: Add a dedup selftest with equivalent 
structure types Without previous libbpf patch, the following error will occur: $ ./test_progs -t btf ... do_test_dedup:FAIL:check btf_dedup failed errno:-22#13/205 btf/dedup: btf_type_tag #5, struct:FAIL And the previous libbpf patch fixed the issue. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211115163943.3922547-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/btf.c | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 4aa6343dc4c8..f9326a13badb 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -7352,6 +7352,32 @@ static struct btf_dedup_test dedup_tests[] = { BTF_STR_SEC("\0tag1"), }, }, +{ + .descr = "dedup: btf_type_tag #5, struct", + .input = { + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_TAG_ENC(NAME_NTH(1), 1), /* [2] */ + BTF_TYPE_ENC(NAME_NTH(2), BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 4), /* [3] */ + BTF_MEMBER_ENC(NAME_NTH(3), 2, BTF_MEMBER_OFFSET(0, 0)), + BTF_TYPE_TAG_ENC(NAME_NTH(1), 1), /* [4] */ + BTF_TYPE_ENC(NAME_NTH(2), BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 4), /* [5] */ + BTF_MEMBER_ENC(NAME_NTH(3), 4, BTF_MEMBER_OFFSET(0, 0)), + BTF_END_RAW, + }, + BTF_STR_SEC("\0tag1\0t\0m"), + }, + .expect = { + .raw_types = { + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_TYPE_TAG_ENC(NAME_NTH(1), 1), /* [2] */ + BTF_TYPE_ENC(NAME_NTH(2), BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 4), /* [3] */ + BTF_MEMBER_ENC(NAME_NTH(3), 2, BTF_MEMBER_OFFSET(0, 0)), + BTF_END_RAW, + }, + BTF_STR_SEC("\0tag1\0t\0m"), + }, +}, }; From 4344842836e9b9a7b695dc84956cdecd83ac02e9 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 15 Nov 2021 22:58:42 +0000 Subject: [PATCH 004/115] bpftool: Add SPDX tags to RST documentation files Most files in the kernel repository have a SPDX tags. 
The files that don't have such a tag (or another license boilerplate) tend to fall under the GPL-2.0 license. In the past, bpftool's Makefile (for example) has been marked as GPL-2.0 for that reason, when in fact all bpftool is dual-licensed. To prevent a similar confusion from happening with the RST documentation files for bpftool, let's explicitly mark all files as dual-licensed. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211115225844.33943-2-quentin@isovalent.com --- tools/bpf/bpftool/Documentation/Makefile | 2 +- tools/bpf/bpftool/Documentation/bpftool-btf.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-feature.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-gen.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-iter.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-link.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-net.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-perf.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst | 2 ++ tools/bpf/bpftool/Documentation/bpftool.rst | 2 ++ tools/bpf/bpftool/Documentation/common_options.rst | 2 ++ 14 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 692e1b947490..ac8487dcff1d 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -24,7 +24,7 @@ man: man8 man8: $(DOC_MAN8) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) -RST2MAN_OPTS += --verbose +RST2MAN_OPTS += --verbose --strip-comments list_pages = $(sort $(basename $(filter-out $(1),$(MAN8_RST)))) see_also = $(subst " ",, \ diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 4425d942dd39..2d2ceb7163f6 
100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-btf ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index 8069d37dd991..b954faeb0f07 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-cgroup ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index ab9f57ee4c3a..b1471788a15f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + =============== bpftool-feature =============== diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 2a137f8a4cea..51e2e8de5208 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-gen ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 471f363a725a..51914c9e8a54 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -1,3 +1,5 @@ +.. 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ============ bpftool-iter ============ diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 9434349636a5..31371bcf605a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-link ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 991d18fd84f2..e22c918c069c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-map ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 7ec57535a7c1..6d1aa374529f 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-net ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index ce52798a917d..ad554806faa2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-perf ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index f27265bd589b..d31148571403 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -1,3 +1,5 @@ +.. 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ bpftool-prog ================ diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 02afc0fc14cb..77b845b5ac61 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================== bpftool-struct_ops ================== diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 8ac86565c501..1248b35e67ae 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + ================ BPFTOOL ================ diff --git a/tools/bpf/bpftool/Documentation/common_options.rst b/tools/bpf/bpftool/Documentation/common_options.rst index 75adf23202d8..908487b9c2ad 100644 --- a/tools/bpf/bpftool/Documentation/common_options.rst +++ b/tools/bpf/bpftool/Documentation/common_options.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + -h, --help Print short help message (similar to **bpftool help**). From b623181520404ef48f7421333561bd294c6c6b11 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 15 Nov 2021 22:58:43 +0000 Subject: [PATCH 005/115] bpftool: Update doc (use susbtitutions) and test_bpftool_synctypes.py test_bpftool_synctypes.py helps detecting inconsistencies in bpftool between the different list of types and options scattered in the sources, the documentation, and the bash completion. For options that apply to all bpftool commands, the script had a hardcoded list of values, and would use them to check whether the man pages are up-to-date. 
When writing the script, it felt acceptable to have this list in order to avoid opening and parsing bpftool's main.h every time, and because the list of global options in bpftool doesn't change so often. However, this is prone to omissions, and we recently added a new -l|--legacy option which was described in common_options.rst, but not listed in the options summary of each manual page. The script did not complain, because it kept comparing the hardcoded list to the (now) outdated list in the header file. To address the issue, this commit brings the following changes: - Options that are common to all bpftool commands (--json, --pretty, and --debug) are moved to a dedicated file, and used in the definition of an RST substitution. This substitution is used in the sources of all the man pages. - This list of common options is updated, with the addition of the new -l|--legacy option. - The script test_bpftool_synctypes.py is updated to compare: - Options specific to a command, found in C files, for the interactive help messages, with the same specific options from the relevant man page for that command. - Common options, checked just once: the list in main.h is compared with the new list in substitutions.rst. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211115225844.33943-3-quentin@isovalent.com --- .../bpf/bpftool/Documentation/bpftool-btf.rst | 5 +- .../bpftool/Documentation/bpftool-cgroup.rst | 5 +- .../bpftool/Documentation/bpftool-feature.rst | 4 +- .../bpf/bpftool/Documentation/bpftool-gen.rst | 5 +- .../bpftool/Documentation/bpftool-iter.rst | 4 +- .../bpftool/Documentation/bpftool-link.rst | 5 +- .../bpf/bpftool/Documentation/bpftool-map.rst | 5 +- .../bpf/bpftool/Documentation/bpftool-net.rst | 4 +- .../bpftool/Documentation/bpftool-perf.rst | 4 +- .../bpftool/Documentation/bpftool-prog.rst | 4 +- .../Documentation/bpftool-struct_ops.rst | 4 +- tools/bpf/bpftool/Documentation/bpftool.rst | 5 +- .../bpftool/Documentation/substitutions.rst | 3 + .../selftests/bpf/test_bpftool_synctypes.py | 70 +++++++++++++++++-- 14 files changed, 102 insertions(+), 25 deletions(-) create mode 100644 tools/bpf/bpftool/Documentation/substitutions.rst diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 2d2ceb7163f6..342716f74ec4 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -9,13 +9,14 @@ tool for inspection of BTF data :Manual section: 8 +.. 
include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **btf** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | {**-d** | **--debug** } | - { **-B** | **--base-btf** } } + *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } *COMMANDS* := { **dump** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index b954faeb0f07..a17e9aa314fd 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -9,13 +9,14 @@ tool for inspection and simple manipulation of eBPF progs :Manual section: 8 +.. include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **cgroup** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | - { **-f** | **--bpffs** } } + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } *COMMANDS* := { **show** | **list** | **tree** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index b1471788a15f..4ce9a77bc1e0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -9,12 +9,14 @@ tool for inspection of eBPF-related parameters for Linux kernel or net device :Manual section: 8 +.. 
include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **feature** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { |COMMON_OPTIONS| } *COMMANDS* := { **probe** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 51e2e8de5208..bc276388f432 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -9,13 +9,14 @@ tool for BPF code-generation :Manual section: 8 +.. include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **gen** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | - { **-L** | **--use-loader** } } + *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } *COMMAND* := { **object** | **skeleton** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 51914c9e8a54..84839d488621 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -9,12 +9,14 @@ tool to create BPF iterators :Manual section: 8 +.. include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **iter** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { |COMMON_OPTIONS| } *COMMANDS* := { **pin** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-link.rst b/tools/bpf/bpftool/Documentation/bpftool-link.rst index 31371bcf605a..52a4eee4af54 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-link.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-link.rst @@ -9,13 +9,14 @@ tool for inspection and simple manipulation of eBPF links :Manual section: 8 +.. 
include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **link** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | - { **-f** | **--bpffs** } | { **-n** | **--nomount** } } + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **pin** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index e22c918c069c..7c188a598444 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -9,13 +9,14 @@ tool for inspection and simple manipulation of eBPF maps :Manual section: 8 +.. include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **map** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | - { **-f** | **--bpffs** } | { **-n** | **--nomount** } } + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } *COMMANDS* := { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 6d1aa374529f..f4e0a516335a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -9,12 +9,14 @@ tool for inspection of netdev/tc related bpf prog attachments :Manual section: 8 +.. 
include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **net** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { |COMMON_OPTIONS| } *COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index ad554806faa2..5fea633a82f1 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -9,12 +9,14 @@ tool for inspection of perf related bpf prog attachments :Manual section: 8 +.. include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **perf** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { |COMMON_OPTIONS| } *COMMANDS* := { **show** | **list** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index d31148571403..a2e9359e554c 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -9,12 +9,14 @@ tool for inspection and simple manipulation of eBPF progs :Manual section: 8 +.. 
include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **prog** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | { **-L** | **--use-loader** } } diff --git a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst index 77b845b5ac61..ee53a122c0c7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-struct_ops.rst @@ -9,12 +9,14 @@ tool to register/unregister/introspect BPF struct_ops :Manual section: 8 +.. include:: substitutions.rst + SYNOPSIS ======== **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* - *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { |COMMON_OPTIONS| } *COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 1248b35e67ae..7084dd9fa2f8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -9,6 +9,8 @@ tool for inspection and simple manipulation of eBPF programs and maps :Manual section: 8 +.. 
include:: substitutions.rst + SYNOPSIS ======== @@ -20,8 +22,7 @@ SYNOPSIS *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** } - *OPTIONS* := { { **-V** | **--version** } | - { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } } + *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } *MAP-COMMANDS* := { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | diff --git a/tools/bpf/bpftool/Documentation/substitutions.rst b/tools/bpf/bpftool/Documentation/substitutions.rst new file mode 100644 index 000000000000..ccf1ffa0686c --- /dev/null +++ b/tools/bpf/bpftool/Documentation/substitutions.rst @@ -0,0 +1,3 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +.. |COMMON_OPTIONS| replace:: { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } | { **-l** | **--legacy** } diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index be54b7335a76..3f6e562565ec 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -242,12 +242,6 @@ class FileExtractor(object): end_marker = re.compile('}\\\\n') return self.__get_description_list(start_marker, pattern, end_marker) - def default_options(self): - """ - Return the default options contained in HELP_SPEC_OPTIONS - """ - return { '-j', '--json', '-p', '--pretty', '-d', '--debug' } - def get_bashcomp_list(self, block_name): """ Search for and parse a list of type names from a variable in bash @@ -274,7 +268,56 @@ class SourceFileExtractor(FileExtractor): defined in children classes. 
""" def get_options(self): - return self.default_options().union(self.get_help_list_macro('HELP_SPEC_OPTIONS')) + return self.get_help_list_macro('HELP_SPEC_OPTIONS') + +class MainHeaderFileExtractor(SourceFileExtractor): + """ + An extractor for bpftool's main.h + """ + filename = os.path.join(BPFTOOL_DIR, 'main.h') + + def get_common_options(self): + """ + Parse the list of common options in main.h (options that apply to all + commands), which looks to the lists of options in other source files + but has different start and end markers: + + "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug} | {-l|--legacy}" + + Return a set containing all options, such as: + + {'-p', '-d', '--legacy', '--pretty', '--debug', '--json', '-l', '-j'} + """ + start_marker = re.compile(f'"OPTIONS :=') + pattern = re.compile('([\w-]+) ?(?:\||}[ }\]"])') + end_marker = re.compile('#define') + + parser = InlineListParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) + +class ManSubstitutionsExtractor(SourceFileExtractor): + """ + An extractor for substitutions.rst + """ + filename = os.path.join(BPFTOOL_DIR, 'Documentation/substitutions.rst') + + def get_common_options(self): + """ + Parse the list of common options in substitutions.rst (options that + apply to all commands). 
+ + Return a set containing all options, such as: + + {'-p', '-d', '--legacy', '--pretty', '--debug', '--json', '-l', '-j'} + """ + start_marker = re.compile('\|COMMON_OPTIONS\| replace:: {') + pattern = re.compile('\*\*([\w/-]+)\*\*') + end_marker = re.compile('}$') + + parser = InlineListParser(self.reader) + parser.search_block(start_marker) + return parser.parse(pattern, end_marker) class ProgFileExtractor(SourceFileExtractor): """ @@ -580,6 +623,19 @@ def main(): verify(help_main_options, man_main_options, f'Comparing {source_main_info.filename} (do_help() OPTIONS) and {man_main_info.filename} (OPTIONS):') + # Compare common options (options that apply to all commands) + + main_hdr_info = MainHeaderFileExtractor() + source_common_options = main_hdr_info.get_common_options() + main_hdr_info.close() + + man_substitutions = ManSubstitutionsExtractor() + man_common_options = man_substitutions.get_common_options() + man_substitutions.close() + + verify(source_common_options, man_common_options, + f'Comparing common options from {main_hdr_info.filename} (HELP_SPEC_OPTIONS) and {man_substitutions.filename}:') + sys.exit(retval) if __name__ == "__main__": From e12cd158c8a45b3926cc2f42151384a2d7fdcec3 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Mon, 15 Nov 2021 22:58:44 +0000 Subject: [PATCH 006/115] selftests/bpf: Configure dir paths via env in test_bpftool_synctypes.py Script test_bpftool_synctypes.py parses a number of files in the bpftool directory (or even elsewhere in the repo) to make sure that the list of types or options in those different files are consistent. Instead of having fixed paths, let's make the directories configurable through environment variable. This should make easier in the future to run the script in a different setup, for example on an out-of-tree bpftool mirror with a different layout. 
Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211115225844.33943-4-quentin@isovalent.com --- .../selftests/bpf/test_bpftool_synctypes.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index 3f6e562565ec..6bf21e47882a 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -9,7 +9,15 @@ import os, sys LINUX_ROOT = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir, os.pardir, os.pardir, os.pardir)) -BPFTOOL_DIR = os.path.join(LINUX_ROOT, 'tools/bpf/bpftool') +BPFTOOL_DIR = os.getenv('BPFTOOL_DIR', + os.path.join(LINUX_ROOT, 'tools/bpf/bpftool')) +BPFTOOL_BASHCOMP_DIR = os.getenv('BPFTOOL_BASHCOMP_DIR', + os.path.join(BPFTOOL_DIR, 'bash-completion')) +BPFTOOL_DOC_DIR = os.getenv('BPFTOOL_DOC_DIR', + os.path.join(BPFTOOL_DIR, 'Documentation')) +INCLUDE_DIR = os.getenv('INCLUDE_DIR', + os.path.join(LINUX_ROOT, 'tools/include')) + retval = 0 class BlockParser(object): @@ -300,7 +308,7 @@ class ManSubstitutionsExtractor(SourceFileExtractor): """ An extractor for substitutions.rst """ - filename = os.path.join(BPFTOOL_DIR, 'Documentation/substitutions.rst') + filename = os.path.join(BPFTOOL_DOC_DIR, 'substitutions.rst') def get_common_options(self): """ @@ -393,7 +401,7 @@ class BpfHeaderExtractor(FileExtractor): """ An extractor for the UAPI BPF header. """ - filename = os.path.join(LINUX_ROOT, 'tools/include/uapi/linux/bpf.h') + filename = os.path.join(INCLUDE_DIR, 'uapi/linux/bpf.h') def get_prog_types(self): return self.get_enum('bpf_prog_type') @@ -417,7 +425,7 @@ class ManProgExtractor(ManPageExtractor): """ An extractor for bpftool-prog.rst. 
""" - filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-prog.rst') + filename = os.path.join(BPFTOOL_DOC_DIR, 'bpftool-prog.rst') def get_attach_types(self): return self.get_rst_list('ATTACH_TYPE') @@ -426,7 +434,7 @@ class ManMapExtractor(ManPageExtractor): """ An extractor for bpftool-map.rst. """ - filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-map.rst') + filename = os.path.join(BPFTOOL_DOC_DIR, 'bpftool-map.rst') def get_map_types(self): return self.get_rst_list('TYPE') @@ -435,7 +443,7 @@ class ManCgroupExtractor(ManPageExtractor): """ An extractor for bpftool-cgroup.rst. """ - filename = os.path.join(BPFTOOL_DIR, 'Documentation/bpftool-cgroup.rst') + filename = os.path.join(BPFTOOL_DOC_DIR, 'bpftool-cgroup.rst') def get_attach_types(self): return self.get_rst_list('ATTACH_TYPE') @@ -454,7 +462,7 @@ class BashcompExtractor(FileExtractor): """ An extractor for bpftool's bash completion file. """ - filename = os.path.join(BPFTOOL_DIR, 'bash-completion/bpftool') + filename = os.path.join(BPFTOOL_BASHCOMP_DIR, 'bpftool') def get_prog_attach_types(self): return self.get_bashcomp_list('BPFTOOL_PROG_ATTACH_TYPES') @@ -605,7 +613,7 @@ def main(): help_cmd_options = source_info.get_options() source_info.close() - man_cmd_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool-' + cmd + '.rst')) + man_cmd_info = ManGenericExtractor(os.path.join(BPFTOOL_DOC_DIR, 'bpftool-' + cmd + '.rst')) man_cmd_options = man_cmd_info.get_options() man_cmd_info.close() @@ -616,7 +624,7 @@ def main(): help_main_options = source_main_info.get_options() source_main_info.close() - man_main_info = ManGenericExtractor(os.path.join('Documentation', 'bpftool.rst')) + man_main_info = ManGenericExtractor(os.path.join(BPFTOOL_DOC_DIR, 'bpftool.rst')) man_main_options = man_main_info.get_options() man_main_info.close() From ebf7f6f0a6cdcc17a3da52b81e4b3a98c4005028 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 5 Nov 2021 09:30:00 +0800 Subject: [PATCH 
007/115] bpf: Change value of MAX_TAIL_CALL_CNT from 32 to 33 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current code, the actual max tail call count is 33, which is greater than MAX_TAIL_CALL_CNT (defined as 32). The actual limit is not consistent with the meaning of MAX_TAIL_CALL_CNT and thus confusing at first glance. We can see the historical evolution from commit 04fd61ab36ec ("bpf: allow bpf programs to tail-call other bpf programs") and commit f9dabe016b63 ("bpf: Undo off-by-one in interpreter tail call count limit"). In order to avoid changing existing behavior, the actual limit remains 33, which is reasonable. After commit 874be05f525e ("bpf, tests: Add tail call test suite"), we can see that there is a failed testcase. On all archs when CONFIG_BPF_JIT_ALWAYS_ON is not set: # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:0 ret 34 != 33 FAIL On some archs: # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:1 ret 34 != 33 FAIL Although the above failed testcase has been fixed in commit 18935a72eb25 ("bpf/tests: Fix error in tail call limit tests"), it would still be good to change the value of MAX_TAIL_CALL_CNT from 32 to 33 to make the code more readable. The 32-bit x86 JIT was using a limit of 32; just fix the wrong comments, and the limit becomes 33 tail calls now that the constant MAX_TAIL_CALL_CNT is updated. For the mips64 JIT, use "ori" instead of "addiu" as suggested by Johan Almbladh. For the riscv JIT, use RV_REG_TCC directly to save one register move as suggested by Björn Töpel.
For the other implementations, no function changes, it does not change the current limit 33, the new value of MAX_TAIL_CALL_CNT can reflect the actual max tail call count, the related tail call testcases in test_bpf module and selftests can work well for the interpreter and the JIT. Here are the test results on x86_64: # uname -m x86_64 # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [0/8 JIT'ed] # rmmod test_bpf # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed] # rmmod test_bpf # ./test_progs -t tailcalls #142 tailcalls:OK Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Tested-by: Johan Almbladh Tested-by: Ilya Leoshkevich Acked-by: Björn Töpel Acked-by: Johan Almbladh Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/1636075800-3264-1-git-send-email-yangtiezhu@loongson.cn --- arch/arm/net/bpf_jit_32.c | 5 +++-- arch/arm64/net/bpf_jit_comp.c | 5 +++-- arch/mips/net/bpf_jit_comp32.c | 3 +-- arch/mips/net/bpf_jit_comp64.c | 2 +- arch/powerpc/net/bpf_jit_comp32.c | 4 ++-- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/riscv/net/bpf_jit_comp32.c | 6 ++---- arch/riscv/net/bpf_jit_comp64.c | 7 +++---- arch/s390/net/bpf_jit_comp.c | 6 +++--- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 10 +++++----- arch/x86/net/bpf_jit_comp32.c | 4 ++-- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 2 +- kernel/bpf/core.c | 3 ++- lib/test_bpf.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 +- 17 files changed, 35 insertions(+), 36 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index eeb6dc0ecf46..e59b41e9ab0c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1199,7 +1199,8 @@ static int 
emit_bpf_tail_call(struct jit_ctx *ctx) /* tmp2[0] = array, tmp2[1] = index */ - /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + /* + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; * tail_call_cnt++; */ @@ -1208,7 +1209,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) tc = arm_bpf_get_reg64(tcc, tmp, ctx); emit(ARM_CMP_I(tc[0], hi), ctx); _emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx); - _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); + _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx); emit(ARM_ADC_I(tc[0], tc[0], 0), ctx); arm_bpf_put_reg64(tcc, tmp, ctx); diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 86c9dc0681cc..07c12c42b751 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -287,13 +287,14 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_CMP(0, r3, tmp), ctx); emit(A64_B_(A64_COND_CS, jmp_offset), ctx); - /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + /* + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; * tail_call_cnt++; */ emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx); emit(A64_CMP(1, tcc, tmp), ctx); - emit(A64_B_(A64_COND_HI, jmp_offset), ctx); + emit(A64_B_(A64_COND_CS, jmp_offset), ctx); emit(A64_ADD_I(1, tcc, tcc, 1), ctx); /* prog = array->ptrs[index]; diff --git a/arch/mips/net/bpf_jit_comp32.c b/arch/mips/net/bpf_jit_comp32.c index bd996ede12f8..044b11b65bca 100644 --- a/arch/mips/net/bpf_jit_comp32.c +++ b/arch/mips/net/bpf_jit_comp32.c @@ -1381,8 +1381,7 @@ void build_prologue(struct jit_context *ctx) * 16-byte area in the parent's stack frame. On a tail call, the * calling function jumps into the prologue after these instructions. 
*/ - emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, - min(MAX_TAIL_CALL_CNT + 1, 0xffff)); + emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff)); emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP); /* diff --git a/arch/mips/net/bpf_jit_comp64.c b/arch/mips/net/bpf_jit_comp64.c index 815ade724227..6475828ffb36 100644 --- a/arch/mips/net/bpf_jit_comp64.c +++ b/arch/mips/net/bpf_jit_comp64.c @@ -552,7 +552,7 @@ void build_prologue(struct jit_context *ctx) * On a tail call, the calling function jumps into the prologue * after this instruction. */ - emit(ctx, addiu, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT + 1, 0xffff)); + emit(ctx, ori, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff)); /* === Entry-point for tail calls === */ diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 0da31d41d413..8a4faa05f9e4 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -221,13 +221,13 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o PPC_BCC(COND_GE, out); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT(PPC_RAW_CMPLWI(_R0, MAX_TAIL_CALL_CNT)); /* tail_call_cnt++; */ EMIT(PPC_RAW_ADDIC(_R0, _R0, 1)); - PPC_BCC(COND_GT, out); + PPC_BCC(COND_GE, out); /* prog = array->ptrs[index]; */ EMIT(PPC_RAW_RLWINM(_R3, b2p_index, 2, 0, 29)); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 8b5157ccfeba..8571aafcc9e1 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -228,12 +228,12 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o PPC_BCC(COND_GE, out); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; */ PPC_BPF_LL(b2p[TMP_REG_1], 1, bpf_jit_stack_tailcallcnt(ctx)); EMIT(PPC_RAW_CMPLWI(b2p[TMP_REG_1], MAX_TAIL_CALL_CNT)); - PPC_BCC(COND_GT, out); + PPC_BCC(COND_GE, 
out); /* * tail_call_cnt++; diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index e6497424cbf6..529a83b85c1c 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -799,11 +799,10 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx); /* - * temp_tcc = tcc - 1; - * if (tcc < 0) + * if (--tcc < 0) * goto out; */ - emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx); + emit(rv_addi(RV_REG_TCC, RV_REG_TCC, -1), ctx); off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn)); emit_bcc(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx); @@ -829,7 +828,6 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) if (is_12b_check(off, insn)) return -1; emit(rv_lw(RV_REG_T0, off, RV_REG_T0), ctx); - emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx); /* Epilogue jumps to *(t0 + 4). */ __build_epilogue(true, ctx); return 0; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index f2a779c7e225..603630b6f3c5 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -327,12 +327,12 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn)); emit_branch(BPF_JGE, RV_REG_A2, RV_REG_T1, off, ctx); - /* if (TCC-- < 0) + /* if (--TCC < 0) * goto out; */ - emit_addi(RV_REG_T1, tcc, -1, ctx); + emit_addi(RV_REG_TCC, tcc, -1, ctx); off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn)); - emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx); + emit_branch(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx); /* prog = array->ptrs[index]; * if (!prog) @@ -352,7 +352,6 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) if (is_12b_check(off, insn)) return -1; emit_ld(RV_REG_T3, off, RV_REG_T2, ctx); - emit_mv(RV_REG_TCC, RV_REG_T1, ctx); __build_epilogue(true, ctx); return 0; } diff --git a/arch/s390/net/bpf_jit_comp.c 
b/arch/s390/net/bpf_jit_comp.c index 233cc9bcd652..9ff2bd83aad7 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1369,7 +1369,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->prg); /* - * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ @@ -1381,9 +1381,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,out */ + /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ patch_2_clij = jit->prg; - EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT, + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, 2, jit->prg); /* diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 9a2f20cbd48b..0bfe1c72a0c9 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -867,7 +867,7 @@ static void emit_tail_call(struct jit_ctx *ctx) emit(LD32 | IMMED | RS1(SP) | S13(off) | RD(tmp), ctx); emit_cmpi(tmp, MAX_TAIL_CALL_CNT, ctx); #define OFFSET2 13 - emit_branch(BGU, ctx->idx, ctx->idx + OFFSET2, ctx); + emit_branch(BGEU, ctx->idx, ctx->idx + OFFSET2, ctx); emit_nop(ctx); emit_alu_K(ADD, tmp, 1, ctx); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 726700fabca6..631847907786 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -412,7 +412,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... 
* if (index >= array->map.max_entries) * goto out; - * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; * prog = array->ptrs[index]; * if (prog == NULL) @@ -446,14 +446,14 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, EMIT2(X86_JBE, offset); /* jbe out */ /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ offset = ctx->tail_call_indirect_label - (prog + 2 - start); - EMIT2(X86_JA, offset); /* ja out */ + EMIT2(X86_JAE, offset); /* jae out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ @@ -504,14 +504,14 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, int offset; /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ offset = ctx->tail_call_direct_label - (prog + 2 - start); - EMIT2(X86_JA, offset); /* ja out */ + EMIT2(X86_JAE, offset); /* jae out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index da9b7cfa4632..429a89c5468b 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1323,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip) EMIT2(IA32_JBE, jmp_label(jmp_label1, 2)); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ lo = (u32)MAX_TAIL_CALL_CNT; @@ -1337,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip) /* cmp ecx,lo */ 
EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo); - /* ja out */ + /* jae out */ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); /* add eax,0x1 */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 56098c866704..cc7a0c36e7df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,7 +1081,7 @@ struct bpf_array { }; #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ -#define MAX_TAIL_CALL_CNT 32 +#define MAX_TAIL_CALL_CNT 33 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6297eafdc40f..a69e4b04ffeb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1744,7 +1744,7 @@ union bpf_attr { * if the maximum number of tail calls has been reached for this * chain of programs. This limit is defined in the kernel by the * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), - * which is currently set to 32. + * which is currently set to 33. * Return * 0 on success, or a negative error in case of failure. 
* diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2405e39d800f..b52dc845ecea 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1574,7 +1574,8 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + + if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index adae39567264..0c5cb2d6436a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14683,7 +14683,7 @@ static struct tail_call_test tail_call_tests[] = { BPF_EXIT_INSN(), }, .flags = FLAG_NEED_STATE | FLAG_RESULT_IN_STATE, - .result = (MAX_TAIL_CALL_CNT + 1 + 1) * MAX_TESTRUNS, + .result = (MAX_TAIL_CALL_CNT + 1) * MAX_TESTRUNS, }, { "Tail call count preserved across function calls", @@ -14705,7 +14705,7 @@ static struct tail_call_test tail_call_tests[] = { }, .stack_depth = 8, .flags = FLAG_NEED_STATE | FLAG_RESULT_IN_STATE, - .result = (MAX_TAIL_CALL_CNT + 1 + 1) * MAX_TESTRUNS, + .result = (MAX_TAIL_CALL_CNT + 1) * MAX_TESTRUNS, }, { "Tail call error path, NULL target", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6297eafdc40f..a69e4b04ffeb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1744,7 +1744,7 @@ union bpf_attr { * if the maximum number of tail calls has been reached for this * chain of programs. This limit is defined in the kernel by the * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), - * which is currently set to 32. + * which is currently set to 33. * Return * 0 on success, or a negative error in case of failure. 
* From d41bc48bfab2076f7db88d079a3a3203dd9c4a54 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 Nov 2021 17:30:41 -0800 Subject: [PATCH 008/115] selftests/bpf: Add uprobe triggering overhead benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add benchmark to measure overhead of uprobes and uretprobes. Also have a baseline (no uprobe attached) benchmark. On my dev machine, the baseline benchmark can trigger 130M user_target() invocations per second. When uprobe is attached, this falls to just 700K. With uretprobe, we get down to 520K: $ sudo ./bench trig-uprobe-base -a Summary: hits 131.289 ± 2.872M/s # UPROBE $ sudo ./bench -a trig-uprobe-without-nop Summary: hits 0.729 ± 0.007M/s $ sudo ./bench -a trig-uprobe-with-nop Summary: hits 1.798 ± 0.017M/s # URETPROBE $ sudo ./bench -a trig-uretprobe-without-nop Summary: hits 0.508 ± 0.012M/s $ sudo ./bench -a trig-uretprobe-with-nop Summary: hits 0.883 ± 0.008M/s So there is an almost 2.5x performance difference between probing a nop vs a non-nop instruction for entry uprobe. And a 1.7x difference for uretprobe. This means that the non-nop overhead is around 1.4 microseconds for uprobe and 2 microseconds for uretprobe. For nop variants, uprobe and uretprobe overhead is down to 0.556 and 1.13 microseconds, respectively. For comparison, just doing a very low-overhead syscall (with no BPF programs attached anywhere) gives: $ sudo ./bench trig-base -a Summary: hits 4.830 ± 0.036M/s So uprobes are about 2.67x slower than a pure context switch.
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211116013041.4072571-1-andrii@kernel.org --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 10 ++ .../selftests/bpf/benchs/bench_trigger.c | 146 ++++++++++++++++++ .../selftests/bpf/progs/trigger_bench.c | 7 + 4 files changed, 166 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 0470802c907c..35684d61aaeb 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -533,7 +533,9 @@ $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm -$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ +$(OUTPUT)/bench: $(OUTPUT)/bench.o \ + $(OUTPUT)/testing_helpers.o \ + $(OUTPUT)/trace_helpers.o \ $(OUTPUT)/bench_count.o \ $(OUTPUT)/bench_rename.o \ $(OUTPUT)/bench_trigger.o \ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index cc4722f693e9..c75e7ee28746 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -359,6 +359,11 @@ extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fentry_sleep; extern const struct bench bench_trig_fmodret; +extern const struct bench bench_trig_uprobe_base; +extern const struct bench bench_trig_uprobe_with_nop; +extern const struct bench bench_trig_uretprobe_with_nop; +extern const struct bench bench_trig_uprobe_without_nop; +extern const struct bench bench_trig_uretprobe_without_nop; extern const struct bench bench_rb_libbpf; extern const struct bench bench_rb_custom; extern const struct bench bench_pb_libbpf; @@ -385,6 +390,11 @@ static const struct bench *benchs[] = { 
&bench_trig_fentry, &bench_trig_fentry_sleep, &bench_trig_fmodret, + &bench_trig_uprobe_base, + &bench_trig_uprobe_with_nop, + &bench_trig_uretprobe_with_nop, + &bench_trig_uprobe_without_nop, + &bench_trig_uretprobe_without_nop, &bench_rb_libbpf, &bench_rb_custom, &bench_pb_libbpf, diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index f41a491a8cc0..049a5ad56f65 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include "bench.h" #include "trigger_bench.skel.h" +#include "trace_helpers.h" /* BPF triggering benchmarks */ static struct trigger_ctx { @@ -107,6 +108,101 @@ static void *trigger_consumer(void *input) return NULL; } +/* make sure call is not inlined and not avoided by compiler, so __weak and + * inline asm volatile in the body of the function + * + * There is a performance difference between uprobing at nop location vs other + * instructions. So use two different targets, one of which starts with nop + * and another doesn't. + * + * GCC doesn't generate stack setup preamble for these functions due to them + * having no input arguments and doing nothing in the body.
+ */ +__weak void uprobe_target_with_nop(void) +{ + asm volatile ("nop"); +} + +__weak void uprobe_target_without_nop(void) +{ + asm volatile (""); +} + +static void *uprobe_base_producer(void *input) +{ + while (true) { + uprobe_target_with_nop(); + atomic_inc(&base_hits.value); + } + return NULL; +} + +static void *uprobe_producer_with_nop(void *input) +{ + while (true) + uprobe_target_with_nop(); + return NULL; +} + +static void *uprobe_producer_without_nop(void *input) +{ + while (true) + uprobe_target_without_nop(); + return NULL; +} + +static void usetup(bool use_retprobe, bool use_nop) +{ + size_t uprobe_offset; + ssize_t base_addr; + struct bpf_link *link; + + setup_libbpf(); + + ctx.skel = trigger_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + base_addr = get_base_addr(); + if (use_nop) + uprobe_offset = get_uprobe_offset(&uprobe_target_with_nop, base_addr); + else + uprobe_offset = get_uprobe_offset(&uprobe_target_without_nop, base_addr); + + link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe, + use_retprobe, + -1 /* all PIDs */, + "/proc/self/exe", + uprobe_offset); + if (!link) { + fprintf(stderr, "failed to attach uprobe!\n"); + exit(1); + } + ctx.skel->links.bench_trigger_uprobe = link; +} + +static void uprobe_setup_with_nop() +{ + usetup(false, true); +} + +static void uretprobe_setup_with_nop() +{ + usetup(true, true); +} + +static void uprobe_setup_without_nop() +{ + usetup(false, false); +} + +static void uretprobe_setup_without_nop() +{ + usetup(true, false); +} + const struct bench bench_trig_base = { .name = "trig-base", .validate = trigger_validate, @@ -182,3 +278,53 @@ const struct bench bench_trig_fmodret = { .report_progress = hits_drops_report_progress, .report_final = hits_drops_report_final, }; + +const struct bench bench_trig_uprobe_base = { + .name = "trig-uprobe-base", + .setup = NULL, /* no uprobe/uretprobe is attached */ + .producer_thread = 
uprobe_base_producer, + .consumer_thread = trigger_consumer, + .measure = trigger_base_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_uprobe_with_nop = { + .name = "trig-uprobe-with-nop", + .setup = uprobe_setup_with_nop, + .producer_thread = uprobe_producer_with_nop, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_uretprobe_with_nop = { + .name = "trig-uretprobe-with-nop", + .setup = uretprobe_setup_with_nop, + .producer_thread = uprobe_producer_with_nop, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_uprobe_without_nop = { + .name = "trig-uprobe-without-nop", + .setup = uprobe_setup_without_nop, + .producer_thread = uprobe_producer_without_nop, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; + +const struct bench bench_trig_uretprobe_without_nop = { + .name = "trig-uretprobe-without-nop", + .setup = uretprobe_setup_without_nop, + .producer_thread = uprobe_producer_without_nop, + .consumer_thread = trigger_consumer, + .measure = trigger_measure, + .report_progress = hits_drops_report_progress, + .report_final = hits_drops_report_final, +}; diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 9a4d09590b3d..2098f3f27f18 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -52,3 +52,10 @@ int bench_trigger_fmodret(void *ctx) __sync_add_and_fetch(&hits, 1); return -22; } + +SEC("uprobe/self/uprobe_target") +int 
bench_trigger_uprobe(void *ctx) +{ + __sync_add_and_fetch(&hits, 1); + return 0; +} From ea78548e0f98951fa7641037ad98a750137d6b6a Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Fri, 12 Nov 2021 11:25:32 -0800 Subject: [PATCH 009/115] selftests/bpf: Move summary line after the error logs Makes it easier to find the summary line when there is a lot of logs to scroll back. Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211112192535.898352-2-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index c65986bd9d07..d129ea5c9a48 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1198,11 +1198,11 @@ static int server_main(void) env.sub_succ_cnt += result->sub_succ_cnt; } + print_all_error_logs(); + fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); - print_all_error_logs(); - /* reap all workers */ for (i = 0; i < env.workers; i++) { int wstatus, pid; @@ -1484,11 +1484,11 @@ int main(int argc, char **argv) if (env.list_test_names) goto out; + print_all_error_logs(); + fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt); - print_all_error_logs(); - close(env.saved_netns_fd); out: if (!env.list_test_names && env.has_testmod) From 67d61d30b8a8f33d2a4f269f3a548409827d7b01 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Fri, 12 Nov 2021 11:25:33 -0800 Subject: [PATCH 010/115] selftests/bpf: Variable naming fix Change log_fd to log_fp to reflect its type correctly. 
Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211112192535.898352-3-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index d129ea5c9a48..926475aa10bb 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -939,7 +939,7 @@ static void *dispatch_thread(void *ctx) { struct dispatch_data *data = ctx; int sock_fd; - FILE *log_fd = NULL; + FILE *log_fp = NULL; sock_fd = data->sock_fd; @@ -1002,8 +1002,8 @@ static void *dispatch_thread(void *ctx) /* collect all logs */ if (msg_test_done.test_done.have_log) { - log_fd = open_memstream(&result->log_buf, &result->log_cnt); - if (!log_fd) + log_fp = open_memstream(&result->log_buf, &result->log_cnt); + if (!log_fp) goto error; while (true) { @@ -1014,12 +1014,12 @@ static void *dispatch_thread(void *ctx) if (msg_log.type != MSG_TEST_LOG) goto error; - fprintf(log_fd, "%s", msg_log.test_log.log_buf); + fprintf(log_fp, "%s", msg_log.test_log.log_buf); if (msg_log.test_log.is_last) break; } - fclose(log_fd); - log_fd = NULL; + fclose(log_fp); + log_fp = NULL; } /* output log */ { @@ -1045,8 +1045,8 @@ static void *dispatch_thread(void *ctx) if (env.debug) fprintf(stderr, "[%d]: Protocol/IO error: %s.\n", data->worker_id, strerror(errno)); - if (log_fd) - fclose(log_fd); + if (log_fp) + fclose(log_fp); done: { struct msg msg_exit; From db813d7bd919c521b869d657dc4a2a2335974cc4 Mon Sep 17 00:00:00 2001 From: Yucong Sun Date: Fri, 12 Nov 2021 11:25:34 -0800 Subject: [PATCH 011/115] selftests/bpf: Mark variable as static Fix warnings from checkpatch.pl Signed-off-by: Yucong Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211112192535.898352-4-fallentree@fb.com --- tools/testing/selftests/bpf/test_progs.c | 4 ++-- 1 file changed, 2
insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 926475aa10bb..296928948bb9 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -473,11 +473,11 @@ static struct prog_test_def prog_test_defs[] = { #include #undef DEFINE_TEST }; -const int prog_test_cnt = ARRAY_SIZE(prog_test_defs); +static const int prog_test_cnt = ARRAY_SIZE(prog_test_defs); const char *argp_program_version = "test_progs 0.1"; const char *argp_program_bug_address = ""; -const char argp_program_doc[] = "BPF selftests test runner"; +static const char argp_program_doc[] = "BPF selftests test runner"; enum ARG_KEYS { ARG_TEST_NUM = 'n', From 3ff36bffaf3545d46e7dedcd8b89e62591de246d Mon Sep 17 00:00:00 2001 From: Dave Tucker Date: Fri, 12 Nov 2021 21:17:22 +0000 Subject: [PATCH 012/115] bpf, docs: Change underline in btf to match style guide This changes the type of underline used to follow the guidelines in Documentation/doc-guide/sphinx.rst which also ensures that the headings are rendered at the correct level in the HTML sidebar Signed-off-by: Dave Tucker Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/981b27485cc294206480df36fca46817e2553e39.1636749493.git.dave@dtucker.co.uk --- Documentation/bpf/btf.rst | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index d0ec40d00c28..1ebf4c5c7ddc 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -3,7 +3,7 @@ BPF Type Format (BTF) ===================== 1. Introduction -*************** +=============== BTF (BPF Type Format) is the metadata format which encodes the debug info related to BPF program/map. The name BTF was used initially to describe data @@ -30,7 +30,7 @@ sections are discussed in details in :ref:`BTF_Type_String`. .. _BTF_Type_String: 2. 
BTF Type and String Encoding -******************************* +=============================== The file ``include/uapi/linux/btf.h`` provides high-level definition of how types/strings are encoded. @@ -57,13 +57,13 @@ little-endian target. The ``btf_header`` is designed to be extensible with generated. 2.1 String Encoding -=================== +------------------- The first string in the string section must be a null string. The rest of string table is a concatenation of other null-terminated strings. 2.2 Type Encoding -================= +----------------- The type id ``0`` is reserved for ``void`` type. The type section is parsed sequentially and type id is assigned to each recognized type starting from id @@ -504,7 +504,7 @@ valid index (starting from 0) pointing to a member or an argument. * ``type``: the type with ``btf_type_tag`` attribute 3. BTF Kernel API -***************** +================= The following bpf syscall command involves BTF: * BPF_BTF_LOAD: load a blob of BTF data into kernel @@ -547,14 +547,14 @@ The workflow typically looks like: 3.1 BPF_BTF_LOAD -================ +---------------- Load a blob of BTF data into kernel. A blob of data, described in :ref:`BTF_Type_String`, can be directly loaded into the kernel. A ``btf_fd`` is returned to a userspace. 3.2 BPF_MAP_CREATE -================== +------------------ A map can be created with ``btf_fd`` and specified key/value type id.:: @@ -581,7 +581,7 @@ automatically. .. _BPF_Prog_Load: 3.3 BPF_PROG_LOAD -================= +----------------- During prog_load, func_info and line_info can be passed to kernel with proper values for the following attributes: @@ -631,7 +631,7 @@ For line_info, the line number and column number are defined as below: #define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) 3.4 BPF_{PROG,MAP}_GET_NEXT_ID -============================== +------------------------------ In kernel, every loaded program, map or btf has a unique id. 
The id won't change during the lifetime of a program, map, or btf. @@ -641,13 +641,13 @@ each command, to user space, for bpf program or maps, respectively, so an inspection tool can inspect all programs and maps. 3.5 BPF_{PROG,MAP}_GET_FD_BY_ID -=============================== +------------------------------- An introspection tool cannot use id to get details about program or maps. A file descriptor needs to be obtained first for reference-counting purpose. 3.6 BPF_OBJ_GET_INFO_BY_FD -========================== +-------------------------- Once a program/map fd is acquired, an introspection tool can get the detailed information from kernel about this fd, some of which are BTF-related. For @@ -656,7 +656,7 @@ example, ``bpf_map_info`` returns ``btf_id`` and key/value type ids. bpf byte codes, and jited_line_info. 3.7 BPF_BTF_GET_FD_BY_ID -======================== +------------------------ With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, bpf syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. Then, with @@ -668,10 +668,10 @@ tool has full btf knowledge and is able to pretty print map key/values, dump func signatures and line info, along with byte/jit codes. 4. ELF File Format Interface -**************************** +============================ 4.1 .BTF section -================ +---------------- The .BTF section contains type and string data. The format of this section is same as the one describe in :ref:`BTF_Type_String`. @@ -679,7 +679,7 @@ same as the one describe in :ref:`BTF_Type_String`. .. _BTF_Ext_Section: 4.2 .BTF.ext section -==================== +-------------------- The .BTF.ext section encodes func_info and line_info which needs loader manipulation before loading into the kernel. @@ -743,7 +743,7 @@ bpf_insn``. For ELF API, the ``insn_off`` is the byte offset from the beginning of section (``btf_ext_info_sec->sec_name_off``). 
4.2 .BTF_ids section -==================== +-------------------- The .BTF_ids section encodes BTF ID values that are used within the kernel. @@ -804,10 +804,10 @@ All the BTF ID lists and sets are compiled in the .BTF_ids section and resolved during the linking phase of kernel build by ``resolve_btfids`` tool. 5. Using BTF -************ +============ 5.1 bpftool map pretty print -============================ +---------------------------- With BTF, the map key/value can be printed based on fields rather than simply raw bytes. This is especially valuable for large structure or if your data @@ -849,7 +849,7 @@ bpftool is able to pretty print like below: ] 5.2 bpftool prog dump -===================== +--------------------- The following is an example showing how func_info and line_info can help prog dump with better kernel symbol names, function prototypes and line @@ -883,7 +883,7 @@ information.:: [...] 5.3 Verifier Log -================ +---------------- The following is an example of how line_info can help debugging verification failure.:: @@ -909,7 +909,7 @@ failure.:: R2 offset is outside of the packet 6. BTF Generation -***************** +================= You need latest pahole @@ -1016,6 +1016,6 @@ format.:: .long 8206 # Line 8 Col 14 7. Testing -********** +========== Kernel bpf selftest `test_btf.c` provides extensive set of BTF-related tests. 
From f5b1c2ef43d79e054f471dc96996ac40bb262d8d Mon Sep 17 00:00:00 2001 From: Dave Tucker Date: Fri, 12 Nov 2021 21:17:23 +0000 Subject: [PATCH 013/115] bpf, docs: Rename bpf_lsm.rst to prog_lsm.rst This allows for documentation relating to BPF Program Types to be matched by the glob pattern prog_* for inclusion in a sphinx toctree Signed-off-by: Dave Tucker Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/49fe0f370a2b28500c1b60f1fdb6fb7ec90de28a.1636749493.git.dave@dtucker.co.uk --- Documentation/bpf/{bpf_lsm.rst => prog_lsm.rst} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/bpf/{bpf_lsm.rst => prog_lsm.rst} (100%) diff --git a/Documentation/bpf/bpf_lsm.rst b/Documentation/bpf/prog_lsm.rst similarity index 100% rename from Documentation/bpf/bpf_lsm.rst rename to Documentation/bpf/prog_lsm.rst diff --git a/MAINTAINERS b/MAINTAINERS index 4c74516e4353..25a59950042d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3559,7 +3559,7 @@ R: Florent Revest R: Brendan Jackman L: bpf@vger.kernel.org S: Maintained -F: Documentation/bpf/bpf_lsm.rst +F: Documentation/bpf/prog_lsm.rst F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c F: security/bpf/ From 5931d9a3d0529dc803c792a10e52f0de1d0b9991 Mon Sep 17 00:00:00 2001 From: Dave Tucker Date: Fri, 12 Nov 2021 21:17:24 +0000 Subject: [PATCH 014/115] bpf, docs: Fix ordering of bpf documentation This commit fixes the display of the BPF documentation in the sidebar when rendered as HTML. Before this patch, the sidebar would render as follows for some sections: | BPF Documentation |- BPF Type Format (BTF) |- BPF Type Format (BTF) This was due to creating a heading in index.rst followed by a sphinx toctree, where the file referenced carries the same title as the section heading. To fix this I applied a pattern that has been established in other subfolders of Documentation: 1. Re-wrote index.rst to have a single toctree 2. 
Split the sections out in to their own files Additionally maps.rst and programs.rst make use of a glob pattern to include map_* or prog_* rst files in their toctree, meaning future map or program type documentation will be automatically included. Signed-off-by: Dave Tucker Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1a1eed800e7b9dc13b458de113a489641519b0cc.1636749493.git.dave@dtucker.co.uk --- Documentation/bpf/faq.rst | 11 ++++ Documentation/bpf/helpers.rst | 7 +++ Documentation/bpf/index.rst | 97 ++++-------------------------- Documentation/bpf/libbpf/index.rst | 4 +- Documentation/bpf/maps.rst | 9 +++ Documentation/bpf/other.rst | 9 +++ Documentation/bpf/programs.rst | 9 +++ Documentation/bpf/syscall_api.rst | 11 ++++ Documentation/bpf/test_debug.rst | 9 +++ 9 files changed, 80 insertions(+), 86 deletions(-) create mode 100644 Documentation/bpf/faq.rst create mode 100644 Documentation/bpf/helpers.rst create mode 100644 Documentation/bpf/maps.rst create mode 100644 Documentation/bpf/other.rst create mode 100644 Documentation/bpf/programs.rst create mode 100644 Documentation/bpf/syscall_api.rst create mode 100644 Documentation/bpf/test_debug.rst diff --git a/Documentation/bpf/faq.rst b/Documentation/bpf/faq.rst new file mode 100644 index 000000000000..a622602ce9ad --- /dev/null +++ b/Documentation/bpf/faq.rst @@ -0,0 +1,11 @@ +================================ +Frequently asked questions (FAQ) +================================ + +Two sets of Questions and Answers (Q&A) are maintained. + +.. toctree:: + :maxdepth: 1 + + bpf_design_QA + bpf_devel_QA diff --git a/Documentation/bpf/helpers.rst b/Documentation/bpf/helpers.rst new file mode 100644 index 000000000000..c4ee0cc20dec --- /dev/null +++ b/Documentation/bpf/helpers.rst @@ -0,0 +1,7 @@ +Helper functions +================ + +* `bpf-helpers(7)`_ maintains a list of helpers available to eBPF programs. + +.. Links +.. 
_bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html \ No newline at end of file diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 37f273a7e8b6..413f50101eca 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -12,97 +12,26 @@ BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. -libbpf -====== - -Documentation/bpf/libbpf/libbpf.rst is a userspace library for loading and interacting with bpf programs. - -BPF Type Format (BTF) -===================== - .. toctree:: :maxdepth: 1 + libbpf/index btf - - -Frequently asked questions (FAQ) -================================ - -Two sets of Questions and Answers (Q&A) are maintained. - -.. toctree:: - :maxdepth: 1 - - bpf_design_QA - bpf_devel_QA - -Syscall API -=========== - -The primary info for the bpf syscall is available in the `man-pages`_ -for `bpf(2)`_. For more information about the userspace API, see -Documentation/userspace-api/ebpf/index.rst. - -Helper functions -================ - -* `bpf-helpers(7)`_ maintains a list of helpers available to eBPF programs. - - -Program types -============= - -.. toctree:: - :maxdepth: 1 - - prog_cgroup_sockopt - prog_cgroup_sysctl - prog_flow_dissector - bpf_lsm - prog_sk_lookup - - -Map types -========= - -.. toctree:: - :maxdepth: 1 - - map_cgroup_storage - - -Testing and debugging BPF -========================= - -.. toctree:: - :maxdepth: 1 - - drgn - s390 - - -Licensing -========= - -.. toctree:: - :maxdepth: 1 - + faq + syscall_api + helpers + programs + maps bpf_licensing + test_debug + other +.. only:: subproject and html -Other -===== + Indices + ======= -.. toctree:: - :maxdepth: 1 - - ringbuf - llvm_reloc + * :ref:`genindex` .. Links: -.. _networking-filter: ../networking/filter.rst -.. _man-pages: https://www.kernel.org/doc/man-pages/ -.. 
_bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html -.. _bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html -.. _BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/ +.. _BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/ \ No newline at end of file diff --git a/Documentation/bpf/libbpf/index.rst b/Documentation/bpf/libbpf/index.rst index 4f8adfc3ab83..4e8c656b539a 100644 --- a/Documentation/bpf/libbpf/index.rst +++ b/Documentation/bpf/libbpf/index.rst @@ -3,8 +3,6 @@ libbpf ====== -For API documentation see the `versioned API documentation site `_. - .. toctree:: :maxdepth: 1 @@ -14,6 +12,8 @@ For API documentation see the `versioned API documentation site `_. + All general BPF questions, including kernel functionality, libbpf APIs and their application, should be sent to bpf@vger.kernel.org mailing list. You can `subscribe `_ to the diff --git a/Documentation/bpf/maps.rst b/Documentation/bpf/maps.rst new file mode 100644 index 000000000000..2084b0e7cde8 --- /dev/null +++ b/Documentation/bpf/maps.rst @@ -0,0 +1,9 @@ +========= +Map Types +========= + +.. toctree:: + :maxdepth: 1 + :glob: + + map_* \ No newline at end of file diff --git a/Documentation/bpf/other.rst b/Documentation/bpf/other.rst new file mode 100644 index 000000000000..3d61963403b4 --- /dev/null +++ b/Documentation/bpf/other.rst @@ -0,0 +1,9 @@ +===== +Other +===== + +.. toctree:: + :maxdepth: 1 + + ringbuf + llvm_reloc \ No newline at end of file diff --git a/Documentation/bpf/programs.rst b/Documentation/bpf/programs.rst new file mode 100644 index 000000000000..620eb667ac7a --- /dev/null +++ b/Documentation/bpf/programs.rst @@ -0,0 +1,9 @@ +============= +Program Types +============= + +.. 
toctree:: + :maxdepth: 1 + :glob: + + prog_* diff --git a/Documentation/bpf/syscall_api.rst b/Documentation/bpf/syscall_api.rst new file mode 100644 index 000000000000..f0a1dff087ad --- /dev/null +++ b/Documentation/bpf/syscall_api.rst @@ -0,0 +1,11 @@ +=========== +Syscall API +=========== + +The primary info for the bpf syscall is available in the `man-pages`_ +for `bpf(2)`_. For more information about the userspace API, see +Documentation/userspace-api/ebpf/index.rst. + +.. Links: +.. _man-pages: https://www.kernel.org/doc/man-pages/ +.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html \ No newline at end of file diff --git a/Documentation/bpf/test_debug.rst b/Documentation/bpf/test_debug.rst new file mode 100644 index 000000000000..ebf0caceb6a6 --- /dev/null +++ b/Documentation/bpf/test_debug.rst @@ -0,0 +1,9 @@ +========================= +Testing and debugging BPF +========================= + +.. toctree:: + :maxdepth: 1 + + drgn + s390 From dd7f091fd22b1dce6c20e8f7769aa068ed88ac6d Mon Sep 17 00:00:00 2001 From: Tirthendu Sarkar Date: Wed, 17 Nov 2021 18:06:13 +0530 Subject: [PATCH 015/115] selftests/bpf: Fix xdpxceiver failures for no hugepages xsk_configure_umem() needs hugepages to work in unaligned mode. So when hugepages are not configured, 'unaligned' tests should be skipped which is determined by the helper function hugepages_present(). This function erroneously returns true with MAP_NORESERVE flag even when no hugepages are configured. The removal of this flag fixes the issue. The test TEST_TYPE_UNALIGNED_INV_DESC also needs to be skipped when there are no hugepages. However, this was not skipped as there was no check for presence of hugepages and hence was failing. The check to skip the test has now been added. 
Fixes: a4ba98dd0c69 (selftests: xsk: Add test for unaligned mode) Signed-off-by: Tirthendu Sarkar Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211117123613.22288-1-tirthendu.sarkar@intel.com --- tools/testing/selftests/bpf/xdpxceiver.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index fe7f423b8c3f..040164c7efc1 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -1217,7 +1217,7 @@ static bool hugepages_present(struct ifobject *ifobject) void *bufs; bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE | MAP_HUGETLB, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (bufs == MAP_FAILED) return false; @@ -1364,6 +1364,10 @@ static void run_pkt_test(struct test_spec *test, enum test_mode mode, enum test_ testapp_invalid_desc(test); break; case TEST_TYPE_UNALIGNED_INV_DESC: + if (!hugepages_present(test->ifobj_tx)) { + ksft_test_result_skip("No 2M huge pages present.\n"); + return; + } test_spec_set_name(test, "UNALIGNED_INV_DESC"); test->ifobj_tx->umem->unaligned_mode = true; test->ifobj_rx->umem->unaligned_mode = true; From 29ad850a5cae84757bcd4c60e0d74232ef8c5157 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 18 Nov 2021 12:52:25 +0100 Subject: [PATCH 016/115] selfetests/bpf: Adapt vmtest.sh to s390 libbpf CI changes [1] added s390 support to libbpf CI and added an ${ARCH} prefix to a number of paths and identifiers in libbpf GitHub repo, which vmtest.sh relies upon. Update these and make use of the new s390 support. 
[1] https://github.com/libbpf/libbpf/pull/204 Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Ilya Leoshkevich Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211118115225.1349726-1-iii@linux.ibm.com --- tools/testing/selftests/bpf/vmtest.sh | 46 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 027198768fad..5e43c79ddc6e 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -4,17 +4,34 @@ set -u set -e -# This script currently only works for x86_64, as -# it is based on the VM image used by the BPF CI which is -# x86_64. -QEMU_BINARY="${QEMU_BINARY:="qemu-system-x86_64"}" -X86_BZIMAGE="arch/x86/boot/bzImage" +# This script currently only works for x86_64 and s390x, as +# it is based on the VM image used by the BPF CI, which is +# available only for these architectures. +ARCH="$(uname -m)" +case "${ARCH}" in +s390x) + QEMU_BINARY=qemu-system-s390x + QEMU_CONSOLE="ttyS1" + QEMU_FLAGS=(-smp 2) + BZIMAGE="arch/s390/boot/compressed/vmlinux" + ;; +x86_64) + QEMU_BINARY=qemu-system-x86_64 + QEMU_CONSOLE="ttyS0,115200" + QEMU_FLAGS=(-cpu host -smp 8) + BZIMAGE="arch/x86/boot/bzImage" + ;; +*) + echo "Unsupported architecture" + exit 1 + ;; +esac DEFAULT_COMMAND="./test_progs" MOUNT_DIR="mnt" ROOTFS_IMAGE="root.img" OUTPUT_DIR="$HOME/.bpf_selftests" -KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/latest.config" -KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/latest.config" +KCONFIG_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/config-latest.${ARCH}" +KCONFIG_API_URL="https://api.github.com/repos/libbpf/libbpf/contents/travis-ci/vmtest/configs/config-latest.${ARCH}" 
INDEX_URL="https://raw.githubusercontent.com/libbpf/libbpf/master/travis-ci/vmtest/configs/INDEX" NUM_COMPILE_JOBS="$(nproc)" LOG_FILE_BASE="$(date +"bpf_selftests.%Y-%m-%d_%H-%M-%S")" @@ -85,7 +102,7 @@ newest_rootfs_version() { { for file in "${!URLS[@]}"; do - if [[ $file =~ ^libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then + if [[ $file =~ ^"${ARCH}"/libbpf-vmtest-rootfs-(.*)\.tar\.zst$ ]]; then echo "${BASH_REMATCH[1]}" fi done @@ -102,7 +119,7 @@ download_rootfs() exit 1 fi - download "libbpf-vmtest-rootfs-$rootfsversion.tar.zst" | + download "${ARCH}/libbpf-vmtest-rootfs-$rootfsversion.tar.zst" | zstd -d | sudo tar -C "$dir" -x } @@ -224,13 +241,12 @@ EOF -nodefaults \ -display none \ -serial mon:stdio \ - -cpu host \ + "${qemu_flags[@]}" \ -enable-kvm \ - -smp 8 \ -m 4G \ -drive file="${rootfs_img}",format=raw,index=1,media=disk,if=virtio,cache=none \ -kernel "${kernel_bzimage}" \ - -append "root=/dev/vda rw console=ttyS0,115200" + -append "root=/dev/vda rw console=${QEMU_CONSOLE}" } copy_logs() @@ -282,7 +298,7 @@ main() local kernel_checkout=$(realpath "${script_dir}"/../../../../) # By default the script searches for the kernel in the checkout directory but # it also obeys environment variables O= and KBUILD_OUTPUT= - local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" + local kernel_bzimage="${kernel_checkout}/${BZIMAGE}" local command="${DEFAULT_COMMAND}" local update_image="no" local exit_command="poweroff -f" @@ -337,13 +353,13 @@ main() if is_rel_path "${O}"; then O="$(realpath "${PWD}/${O}")" fi - kernel_bzimage="${O}/${X86_BZIMAGE}" + kernel_bzimage="${O}/${BZIMAGE}" make_command="${make_command} O=${O}" elif [[ "${KBUILD_OUTPUT:=""}" != "" ]]; then if is_rel_path "${KBUILD_OUTPUT}"; then KBUILD_OUTPUT="$(realpath "${PWD}/${KBUILD_OUTPUT}")" fi - kernel_bzimage="${KBUILD_OUTPUT}/${X86_BZIMAGE}" + kernel_bzimage="${KBUILD_OUTPUT}/${BZIMAGE}" make_command="${make_command} KBUILD_OUTPUT=${KBUILD_OUTPUT}" fi From 
7615209f42a1976894cd0df97a380a034911656a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 18 Nov 2021 09:40:54 -0800 Subject: [PATCH 017/115] libbpf: Add runtime APIs to query libbpf version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Libbpf provided LIBBPF_MAJOR_VERSION and LIBBPF_MINOR_VERSION macros to check libbpf version at compilation time. This doesn't cover all the needs, though, because version of libbpf that application is compiled against doesn't necessarily match the version of libbpf at runtime, especially if libbpf is used as a shared library. Add libbpf_major_version() and libbpf_minor_version() returning major and minor versions, respectively, as integers. Also add a convenience libbpf_version_string() for various tooling using libbpf to print out libbpf version in a human-readable form. Currently it will return "v0.6", but in the future it can contain some extra information, so the format itself is not part of a stable API and shouldn't be relied upon. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20211118174054.2699477-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 19 +++++++++++++++++++ tools/lib/bpf/libbpf.h | 4 ++++ tools/lib/bpf/libbpf.map | 3 +++ 3 files changed, 26 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index de7e09a6b5ec..78de238f975a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -168,6 +168,25 @@ int libbpf_set_strict_mode(enum libbpf_strict_mode mode) return 0; } +__u32 libbpf_major_version(void) +{ + return LIBBPF_MAJOR_VERSION; +} + +__u32 libbpf_minor_version(void) +{ + return LIBBPF_MINOR_VERSION; +} + +const char *libbpf_version_string(void) +{ +#define __S(X) #X +#define _S(X) __S(X) + return "v" _S(LIBBPF_MAJOR_VERSION) "."
_S(LIBBPF_MINOR_VERSION); +#undef _S +#undef __S +} + enum kern_feature_id { /* v4.14: kernel support for program & map names. */ FEAT_PROG_NAME, diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 4ec69f224342..003fdc5cf3a8 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -24,6 +24,10 @@ extern "C" { #endif +LIBBPF_API __u32 libbpf_major_version(void); +LIBBPF_API __u32 libbpf_minor_version(void); +LIBBPF_API const char *libbpf_version_string(void); + enum libbpf_errno { __LIBBPF_ERRNO__START = 4000, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 6a59514a48cf..bea6791272e5 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -410,6 +410,9 @@ LIBBPF_0.6.0 { btf__type_cnt; btf_dump__new; btf_dump__new_deprecated; + libbpf_major_version; + libbpf_minor_version; + libbpf_version_string; perf_buffer__new; perf_buffer__new_deprecated; perf_buffer__new_raw; From efdd3eb8015e7447095f02a26eaabd164cd18004 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Nov 2021 11:41:13 -0800 Subject: [PATCH 018/115] libbpf: Accommodate DWARF/compiler bug with duplicated structs According to [0], compilers sometimes might produce duplicate DWARF definitions for exactly the same struct/union within the same compilation unit (CU). We've had similar issues with identical arrays and handled them with a similar workaround in 6b6e6b1d09aa ("libbpf: Accomodate DWARF/compiler bug with duplicated identical arrays"). Do the same for struct/union by ensuring that two structs/unions are exactly the same, down to the integer values of field referenced type IDs. Solving this more generically (allowing referenced types to be equivalent, but using different type IDs, all within a single CU) requires a huge complexity increase to handle many-to-many mappings between canonical and candidate type graphs. Before we invest in that, let's see if this approach handles all the instances of this issue in practice.
Thankfully it's pretty rare, it seems. [0] https://lore.kernel.org/bpf/YXr2NFlJTAhHdZqq@krava/ Reported-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211117194114.347675-1-andrii@kernel.org --- tools/lib/bpf/btf.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b6be579e0dc6..e97217a77196 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -3477,8 +3477,8 @@ static long btf_hash_struct(struct btf_type *t) } /* - * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type - * IDs. This check is performed during type graph equivalence check and + * Check structural compatibility of two STRUCTs/UNIONs, ignoring referenced + * type IDs. This check is performed during type graph equivalence check and * referenced types equivalence is checked separately. */ static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) @@ -3851,6 +3851,31 @@ static int btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) return btf_equal_array(t1, t2); } +/* Check if given two types are identical STRUCT/UNION definitions */ +static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + const struct btf_member *m1, *m2; + struct btf_type *t1, *t2; + int n, i; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + + if (!btf_is_composite(t1) || btf_kind(t1) != btf_kind(t2)) + return false; + + if (!btf_shallow_equal_struct(t1, t2)) + return false; + + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { + if (m1->type != m2->type) + return false; + } + return true; +} + /* * Check equivalence of BTF type graph formed by candidate struct/union (we'll * call it "candidate graph" in this description for brevity) to a type graph @@ -3962,6 +3987,8 @@ static int 
btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, hypot_type_id = d->hypot_map[canon_id]; if (hypot_type_id <= BTF_MAX_NR_TYPES) { + if (hypot_type_id == cand_id) + return 1; /* In some cases compiler will generate different DWARF types * for *identical* array type definitions and use them for * different fields within the *same* struct. This breaks type @@ -3970,8 +3997,18 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, * types within a single CU. So work around that by explicitly * allowing identical array types here. */ - return hypot_type_id == cand_id || - btf_dedup_identical_arrays(d, hypot_type_id, cand_id); + if (btf_dedup_identical_arrays(d, hypot_type_id, cand_id)) + return 1; + /* It turns out that similar situation can happen with + * struct/union sometimes, sigh... Handle the case where + * structs/unions are exactly the same, down to the referenced + * type IDs. Anything more complicated (e.g., if referenced + * types are different, but equivalent) is *way more* + * complicated and requires a many-to-many equivalence mapping. + */ + if (btf_dedup_identical_structs(d, hypot_type_id, cand_id)) + return 1; + return 0; } if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) From 9a49afe6f5a516eb33bec24be0f81cb35ca79445 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 17 Nov 2021 11:41:14 -0800 Subject: [PATCH 019/115] selftests/bpf: Add btf_dedup case with duplicated structs within CU Add an artificial minimal example simulating compilers producing two different types within a single CU that correspond to identical struct definitions. 
Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211117194114.347675-2-andrii@kernel.org --- .../bpf/prog_tests/btf_dedup_split.c | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c index 9d3b8d7a1537..94ff9757557a 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -314,6 +314,117 @@ static void test_split_struct_duped() { btf__free(btf1); } +static void btf_add_dup_struct_in_cu(struct btf *btf, int start_id) +{ +#define ID(n) (start_id + n) + btf__set_pointer_size(btf, 8); /* enforce 64-bit arch */ + + btf__add_int(btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + + btf__add_struct(btf, "s", 8); /* [2] struct s { */ + btf__add_field(btf, "a", ID(3), 0, 0); /* struct anon a; */ + btf__add_field(btf, "b", ID(4), 0, 0); /* struct anon b; */ + /* } */ + + btf__add_struct(btf, "(anon)", 8); /* [3] struct anon { */ + btf__add_field(btf, "f1", ID(1), 0, 0); /* int f1; */ + btf__add_field(btf, "f2", ID(1), 32, 0); /* int f2; */ + /* } */ + + btf__add_struct(btf, "(anon)", 8); /* [4] struct anon { */ + btf__add_field(btf, "f1", ID(1), 0, 0); /* int f1; */ + btf__add_field(btf, "f2", ID(1), 32, 0); /* int f2; */ + /* } */ +#undef ID +} + +static void test_split_dup_struct_in_cu() +{ + struct btf *btf1, *btf2; + int err; + + /* generate the base data.. 
*/ + btf1 = btf__new_empty(); + if (!ASSERT_OK_PTR(btf1, "empty_main_btf")) + return; + + btf_add_dup_struct_in_cu(btf1, 0); + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] STRUCT 's' size=8 vlen=2\n" + "\t'a' type_id=3 bits_offset=0\n" + "\t'b' type_id=4 bits_offset=0", + "[3] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32", + "[4] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32"); + + /* ..dedup them... */ + err = btf__dedup(btf1, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + VALIDATE_RAW_BTF( + btf1, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] STRUCT 's' size=8 vlen=2\n" + "\t'a' type_id=3 bits_offset=0\n" + "\t'b' type_id=3 bits_offset=0", + "[3] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32"); + + /* and add the same data on top of it */ + btf2 = btf__new_empty_split(btf1); + if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) + goto cleanup; + + btf_add_dup_struct_in_cu(btf2, 3); + + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] STRUCT 's' size=8 vlen=2\n" + "\t'a' type_id=3 bits_offset=0\n" + "\t'b' type_id=3 bits_offset=0", + "[3] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32", + "[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[5] STRUCT 's' size=8 vlen=2\n" + "\t'a' type_id=6 bits_offset=0\n" + "\t'b' type_id=7 bits_offset=0", + "[6] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=4 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=32", + "[7] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=4 bits_offset=0\n" + "\t'f2' type_id=4 bits_offset=32"); + + err = btf__dedup(btf2, NULL, NULL); + if (!ASSERT_OK(err, "btf_dedup")) + goto cleanup; + + /* after 
dedup it should match the original data */ + VALIDATE_RAW_BTF( + btf2, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] STRUCT 's' size=8 vlen=2\n" + "\t'a' type_id=3 bits_offset=0\n" + "\t'b' type_id=3 bits_offset=0", + "[3] STRUCT '(anon)' size=8 vlen=2\n" + "\t'f1' type_id=1 bits_offset=0\n" + "\t'f2' type_id=1 bits_offset=32"); + +cleanup: + btf__free(btf2); + btf__free(btf1); +} + void test_btf_dedup_split() { if (test__start_subtest("split_simple")) @@ -322,4 +433,6 @@ void test_btf_dedup_split() test_split_struct_duped(); if (test__start_subtest("split_fwd_resolve")) test_split_fwd_resolve(); + if (test__start_subtest("split_dup_struct_in_cu")) + test_split_dup_struct_in_cu(); } From 8cccee9e91e19207671b94af40bacf7c1d2e74ef Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Fri, 19 Nov 2021 19:00:35 +0100 Subject: [PATCH 020/115] libbpf: Change bpf_program__set_extra_flags to bpf_program__set_flags bpf_program__set_extra_flags has just been introduced so we can still change it without breaking users. This new interface is a bit more flexible (for example if someone wants to clear a flag). 
Signed-off-by: Florent Revest Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211119180035.1396139-1-revest@chromium.org --- tools/lib/bpf/libbpf.c | 4 ++-- tools/lib/bpf/libbpf.h | 2 +- tools/lib/bpf/libbpf.map | 2 +- tools/testing/selftests/bpf/testing_helpers.c | 4 +++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 78de238f975a..af405c38aadc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8315,12 +8315,12 @@ __u32 bpf_program__flags(const struct bpf_program *prog) return prog->prog_flags; } -int bpf_program__set_extra_flags(struct bpf_program *prog, __u32 extra_flags) +int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) { if (prog->obj->loaded) return libbpf_err(-EBUSY); - prog->prog_flags |= extra_flags; + prog->prog_flags = flags; return 0; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 003fdc5cf3a8..d02139fec4ac 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -498,7 +498,7 @@ bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type); LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog); -LIBBPF_API int bpf_program__set_extra_flags(struct bpf_program *prog, __u32 extra_flags); +LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags); LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index bea6791272e5..69bc069f0a68 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -400,7 +400,7 @@ LIBBPF_0.6.0 { bpf_program__flags; bpf_program__insn_cnt; bpf_program__insns; - bpf_program__set_extra_flags; + bpf_program__set_flags; btf__add_btf; btf__add_decl_tag; btf__add_type_tag; diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 52c2f24e0898..0f1c37ac6f2c 
100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -91,6 +91,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type, struct bpf_object_load_attr attr = {}; struct bpf_object *obj; struct bpf_program *prog; + __u32 flags; int err; obj = bpf_object__open(file); @@ -106,7 +107,8 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type, if (type != BPF_PROG_TYPE_UNSPEC) bpf_program__set_type(prog, type); - bpf_program__set_extra_flags(prog, BPF_F_TEST_RND_HI32); + flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32; + bpf_program__set_flags(prog, flags); attr.obj = obj; attr.log_level = extra_prog_load_log_flags; From fa721d4f0b91f525339996f4faef7bb072d70162 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Sun, 21 Nov 2021 23:05:30 -0800 Subject: [PATCH 021/115] selftests/bpf: Fix trivial typo Fix trivial typo in comment from 'oveflow' to 'overflow'. Reported-by: Gustavo A. R. Silva Signed-off-by: Drew Fustini Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211122070528.837806-1-dfustini@baylibre.com --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index d6272013a5a3..af47aeb211e7 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -756,7 +756,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d, /* overflow bpf_sock_ops struct with final element nonzero/zero. * Regardless of the value of the final field, we don't have all the * data we need to display it, so we should trigger an overflow. - * In other words oveflow checking should trump "is field zero?" + * In other words overflow checking should trump "is field zero?" 
* checks because if we've overflowed, it shouldn't matter what the * field is - we can't trust its value so shouldn't display it. */ From 16e0c35c6f7a2e90d52f3035ecf942af21417b7b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 12:01:04 -0800 Subject: [PATCH 022/115] libbpf: Load global data maps lazily on legacy kernels Load global data maps lazily if the kernel is too old to support global data. Make sure that programs are still correct by detecting if any of the to-be-loaded programs has a relocation against any such map. This makes it possible to solve the issue ([0]) with bpf_printk() and Clang generating unnecessary and unreferenced .rodata.strX.Y sections, but it also goes further along the CO-RE lines, making it possible to have a BPF object in which some code can work on very old kernels and relies only on BPF maps explicitly, while other BPF programs might enjoy global variable support. If such programs are correctly set to not load at runtime on old kernels, bpf_object will load and function correctly now. 
[0] https://lore.kernel.org/bpf/CAK-59YFPU3qO+_pXWOH+c1LSA=8WA1yabJZfREjOEXNHAqgXNg@mail.gmail.com/ Fixes: aed659170a31 ("libbpf: Support multiple .rodata.* and .data.* BPF maps") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211123200105.387855-1-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index af405c38aadc..27695bf31250 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5006,6 +5006,24 @@ bpf_object__create_maps(struct bpf_object *obj) for (i = 0; i < obj->nr_maps; i++) { map = &obj->maps[i]; + /* To support old kernels, we skip creating global data maps + * (.rodata, .data, .kconfig, etc); later on, during program + * loading, if we detect that at least one of the to-be-loaded + * programs is referencing any global data map, we'll error + * out with program name and relocation index logged. + * This approach allows to accommodate Clang emitting + * unnecessary .rodata.str1.1 sections for string literals, + * but also it allows to have CO-RE applications that use + * global variables in some of BPF programs, but not others. + * If those global variable-using programs are not loaded at + * runtime due to bpf_program__set_autoload(prog, false), + * bpf_object loading will succeed just fine even on old + * kernels. 
+ */ + if (bpf_map__is_internal(map) && + !kernel_supports(obj, FEAT_GLOBAL_DATA)) + continue; + retried = false; retry: if (map->pin_path) { @@ -5605,6 +5623,14 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; insn[0].imm = relo->map_idx; } else { + const struct bpf_map *map = &obj->maps[relo->map_idx]; + + if (bpf_map__is_internal(map) && + !kernel_supports(obj, FEAT_GLOBAL_DATA)) { + pr_warn("prog '%s': relo #%d: kernel doesn't support global data\n", + prog->name, i); + return -ENOTSUP; + } insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; insn[0].imm = obj->maps[relo->map_idx].fd; } @@ -6139,6 +6165,8 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) */ if (prog_is_subprog(obj, prog)) continue; + if (!prog->load) + continue; err = bpf_object__relocate_calls(obj, prog); if (err) { @@ -6152,6 +6180,8 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) prog = &obj->programs[i]; if (prog_is_subprog(obj, prog)) continue; + if (!prog->load) + continue; err = bpf_object__relocate_data(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate data references: %d\n", @@ -6939,10 +6969,6 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) bpf_object__for_each_map(m, obj) { if (!bpf_map__is_internal(m)) continue; - if (!kernel_supports(obj, FEAT_GLOBAL_DATA)) { - pr_warn("kernel doesn't support global data\n"); - return -ENOTSUP; - } if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) m->def.map_flags ^= BPF_F_MMAPABLE; } From e4f7ac90c2b09766e4acf771908987391c836413 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 12:01:05 -0800 Subject: [PATCH 023/115] selftests/bpf: Mix legacy (maps) and modern (vars) BPF in one test Add selftest that combines two BPF programs within single BPF object file such that one of the programs is using global variables, but can be skipped at runtime on old kernels that don't support global data. 
Another BPF program is written with the goal to be runnable on very old kernels and only relies on explicitly accessed BPF maps. Such test, run against old kernels (e.g., libbpf CI will run it against 4.9 kernel that doesn't support global data), allows to test the approach and ensure that libbpf doesn't make unnecessary assumption about necessary kernel features. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211123200105.387855-2-andrii@kernel.org --- .../selftests/bpf/prog_tests/legacy_printk.c | 65 +++++++++++++++++ .../selftests/bpf/progs/test_legacy_printk.c | 73 +++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/legacy_printk.c create mode 100644 tools/testing/selftests/bpf/progs/test_legacy_printk.c diff --git a/tools/testing/selftests/bpf/prog_tests/legacy_printk.c b/tools/testing/selftests/bpf/prog_tests/legacy_printk.c new file mode 100644 index 000000000000..ec6e45f2a644 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/legacy_printk.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include "test_legacy_printk.skel.h" + +static int execute_one_variant(bool legacy) +{ + struct test_legacy_printk *skel; + int err, zero = 0, my_pid = getpid(), res, map_fd; + + skel = test_legacy_printk__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return -errno; + + bpf_program__set_autoload(skel->progs.handle_legacy, legacy); + bpf_program__set_autoload(skel->progs.handle_modern, !legacy); + + err = test_legacy_printk__load(skel); + /* no ASSERT_OK, we expect one of two variants can fail here */ + if (err) + goto err_out; + + if (legacy) { + map_fd = bpf_map__fd(skel->maps.my_pid_map); + err = bpf_map_update_elem(map_fd, &zero, &my_pid, BPF_ANY); + if (!ASSERT_OK(err, "my_pid_map_update")) + goto err_out; + err = bpf_map_lookup_elem(map_fd, &zero, &res); + } else { + 
skel->bss->my_pid_var = my_pid; + } + + err = test_legacy_printk__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto err_out; + + usleep(1); /* trigger */ + + if (legacy) { + map_fd = bpf_map__fd(skel->maps.res_map); + err = bpf_map_lookup_elem(map_fd, &zero, &res); + if (!ASSERT_OK(err, "res_map_lookup")) + goto err_out; + } else { + res = skel->bss->res_var; + } + + if (!ASSERT_GT(res, 0, "res")) { + err = -EINVAL; + goto err_out; + } + +err_out: + test_legacy_printk__destroy(skel); + return err; +} + +void test_legacy_printk(void) +{ + /* legacy variant should work everywhere */ + ASSERT_OK(execute_one_variant(true /* legacy */), "legacy_case"); + + /* execute modern variant, can fail the load on old kernels */ + execute_one_variant(false); +} diff --git a/tools/testing/selftests/bpf/progs/test_legacy_printk.c b/tools/testing/selftests/bpf/progs/test_legacy_printk.c new file mode 100644 index 000000000000..64c2d9ced529 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_legacy_printk.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#define BPF_NO_GLOBAL_DATA +#include + +char LICENSE[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); +} my_pid_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); +} res_map SEC(".maps"); + +volatile int my_pid_var = 0; +volatile int res_var = 0; + +SEC("tp/raw_syscalls/sys_enter") +int handle_legacy(void *ctx) +{ + int zero = 0, *my_pid, cur_pid, *my_res; + + my_pid = bpf_map_lookup_elem(&my_pid_map, &zero); + if (!my_pid) + return 1; + + cur_pid = bpf_get_current_pid_tgid() >> 32; + if (cur_pid != *my_pid) + return 1; + + my_res = bpf_map_lookup_elem(&res_map, &zero); + if (!my_res) + return 1; + + if (*my_res == 0) + /* use bpf_printk() in combination with BPF_NO_GLOBAL_DATA to + * force 
.rodata.str1.1 section that previously caused + * problems on old kernels due to libbpf always tried to + * create a global data map for it + */ + bpf_printk("Legacy-case bpf_printk test, pid %d\n", cur_pid); + *my_res = 1; + + return *my_res; +} + +SEC("tp/raw_syscalls/sys_enter") +int handle_modern(void *ctx) +{ + int zero = 0, cur_pid; + + cur_pid = bpf_get_current_pid_tgid() >> 32; + if (cur_pid != my_pid_var) + return 1; + + if (res_var == 0) + /* we need bpf_printk() to validate libbpf logic around unused + * global maps and legacy kernels; see comment in handle_legacy() + */ + bpf_printk("Modern-case bpf_printk test, pid %d\n", cur_pid); + res_var = 1; + + return res_var; +} From 992c4225419a38663d6239bc2f525b4ac0429188 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Nov 2021 11:32:30 -0800 Subject: [PATCH 024/115] libbpf: Unify low-level map creation APIs w/ new bpf_map_create() Mark the entire zoo of low-level map creation APIs for deprecation in libbpf 0.7 ([0]) and introduce a new bpf_map_create() API that is OPTS-based (and thus future-proof) and matches the BPF_MAP_CREATE command name. While at it, ensure that gen_loader sends map_extra field. Also remove now unneeded btf_key_type_id/btf_value_type_id logic that libbpf is doing anyways. 
[0] Closes: https://github.com/libbpf/libbpf/issues/282 Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124193233.3115996-2-andrii@kernel.org --- tools/lib/bpf/bpf.c | 136 +++++++++++++------------------ tools/lib/bpf/bpf.h | 33 +++++++- tools/lib/bpf/bpf_gen_internal.h | 5 +- tools/lib/bpf/gen_loader.c | 46 ++++------- tools/lib/bpf/libbpf.c | 33 ++++---- tools/lib/bpf/libbpf.map | 1 + tools/lib/bpf/libbpf_internal.h | 21 ----- 7 files changed, 124 insertions(+), 151 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 94560ba31724..053c86e3d20f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -88,146 +88,122 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int return fd; } -int libbpf__bpf_create_map_xattr(const struct bpf_create_map_params *create_attr) +int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); union bpf_attr attr; int fd; - memset(&attr, '\0', sizeof(attr)); + memset(&attr, 0, attr_sz); - attr.map_type = create_attr->map_type; - attr.key_size = create_attr->key_size; - attr.value_size = create_attr->value_size; - attr.max_entries = create_attr->max_entries; - attr.map_flags = create_attr->map_flags; - if (create_attr->name) - memcpy(attr.map_name, create_attr->name, - min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1)); - attr.numa_node = create_attr->numa_node; - attr.btf_fd = create_attr->btf_fd; - attr.btf_key_type_id = create_attr->btf_key_type_id; - attr.btf_value_type_id = create_attr->btf_value_type_id; - attr.map_ifindex = create_attr->map_ifindex; - if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS) - attr.btf_vmlinux_value_type_id = - create_attr->btf_vmlinux_value_type_id; - else - attr.inner_map_fd = create_attr->inner_map_fd; - 
attr.map_extra = create_attr->map_extra; + if (!OPTS_VALID(opts, bpf_map_create_opts)) + return libbpf_err(-EINVAL); - fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, sizeof(attr)); + attr.map_type = map_type; + if (map_name) + strncat(attr.map_name, map_name, sizeof(attr.map_name) - 1); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + attr.btf_fd = OPTS_GET(opts, btf_fd, 0); + attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0); + attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0); + attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0); + + attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0); + attr.map_flags = OPTS_GET(opts, map_flags, 0); + attr.map_extra = OPTS_GET(opts, map_extra, 0); + attr.numa_node = OPTS_GET(opts, numa_node, 0); + attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); + + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); } int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) { - struct bpf_create_map_params p = {}; + LIBBPF_OPTS(bpf_map_create_opts, p); - p.map_type = create_attr->map_type; - p.key_size = create_attr->key_size; - p.value_size = create_attr->value_size; - p.max_entries = create_attr->max_entries; p.map_flags = create_attr->map_flags; - p.name = create_attr->name; p.numa_node = create_attr->numa_node; p.btf_fd = create_attr->btf_fd; p.btf_key_type_id = create_attr->btf_key_type_id; p.btf_value_type_id = create_attr->btf_value_type_id; p.map_ifindex = create_attr->map_ifindex; - if (p.map_type == BPF_MAP_TYPE_STRUCT_OPS) - p.btf_vmlinux_value_type_id = - create_attr->btf_vmlinux_value_type_id; + if (create_attr->map_type == BPF_MAP_TYPE_STRUCT_OPS) + p.btf_vmlinux_value_type_id = create_attr->btf_vmlinux_value_type_id; else p.inner_map_fd = create_attr->inner_map_fd; - return libbpf__bpf_create_map_xattr(&p); + return bpf_map_create(create_attr->map_type, create_attr->name, + create_attr->key_size, 
create_attr->value_size, + create_attr->max_entries, &p); } int bpf_create_map_node(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags, int node) { - struct bpf_create_map_attr map_attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts); - map_attr.name = name; - map_attr.map_type = map_type; - map_attr.map_flags = map_flags; - map_attr.key_size = key_size; - map_attr.value_size = value_size; - map_attr.max_entries = max_entries; + opts.map_flags = map_flags; if (node >= 0) { - map_attr.numa_node = node; - map_attr.map_flags |= BPF_F_NUMA_NODE; + opts.numa_node = node; + opts.map_flags |= BPF_F_NUMA_NODE; } - return bpf_create_map_xattr(&map_attr); + return bpf_map_create(map_type, name, key_size, value_size, max_entries, &opts); } int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags) { - struct bpf_create_map_attr map_attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags); - map_attr.map_type = map_type; - map_attr.map_flags = map_flags; - map_attr.key_size = key_size; - map_attr.value_size = value_size; - map_attr.max_entries = max_entries; - - return bpf_create_map_xattr(&map_attr); + return bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts); } int bpf_create_map_name(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags) { - struct bpf_create_map_attr map_attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags); - map_attr.name = name; - map_attr.map_type = map_type; - map_attr.map_flags = map_flags; - map_attr.key_size = key_size; - map_attr.value_size = value_size; - map_attr.max_entries = max_entries; - - return bpf_create_map_xattr(&map_attr); + return bpf_map_create(map_type, name, key_size, value_size, max_entries, &opts); } int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, 
int max_entries, __u32 map_flags, int node) { - union bpf_attr attr; - int fd; - - memset(&attr, '\0', sizeof(attr)); - - attr.map_type = map_type; - attr.key_size = key_size; - attr.value_size = 4; - attr.inner_map_fd = inner_map_fd; - attr.max_entries = max_entries; - attr.map_flags = map_flags; - if (name) - memcpy(attr.map_name, name, - min(strlen(name), BPF_OBJ_NAME_LEN - 1)); + LIBBPF_OPTS(bpf_map_create_opts, opts); + opts.inner_map_fd = inner_map_fd; + opts.map_flags = map_flags; if (node >= 0) { - attr.map_flags |= BPF_F_NUMA_NODE; - attr.numa_node = node; + opts.map_flags |= BPF_F_NUMA_NODE; + opts.numa_node = node; } - fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, sizeof(attr)); - return libbpf_err_errno(fd); + return bpf_map_create(map_type, name, key_size, 4, max_entries, &opts); } int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, int max_entries, __u32 map_flags) { - return bpf_create_map_in_map_node(map_type, name, key_size, - inner_map_fd, max_entries, map_flags, - -1); + LIBBPF_OPTS(bpf_map_create_opts, opts, + .inner_map_fd = inner_map_fd, + .map_flags = map_flags, + ); + + return bpf_map_create(map_type, name, key_size, 4, max_entries, &opts); } static void * diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 079cc81ac51e..70b6f44fc8b0 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -35,6 +35,30 @@ extern "C" { #endif +struct bpf_map_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + + int inner_map_fd; + int map_flags; + __u64 map_extra; + + int numa_node; + int map_ifindex; +}; +#define bpf_map_create_opts__last_field map_ifindex + +LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts); + struct 
bpf_create_map_attr { const char *name; enum bpf_map_type map_type; @@ -53,20 +77,25 @@ struct bpf_create_map_attr { }; }; -LIBBPF_API int -bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); +LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") +LIBBPF_API int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); +LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") LIBBPF_API int bpf_create_map_node(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags, int node); +LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") LIBBPF_API int bpf_create_map_name(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, __u32 map_flags); +LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, __u32 map_flags); +LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") LIBBPF_API int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, int max_entries, __u32 map_flags, int node); +LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, int max_entries, diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index 75ca9fb857b2..ae7704deba30 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -51,7 +51,10 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level); int bpf_gen__finish(struct bpf_gen *gen); void bpf_gen__free(struct bpf_gen *gen); void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size); -void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_params *map_attr, int map_idx); +void bpf_gen__map_create(struct bpf_gen *gen, + enum 
bpf_map_type map_type, const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx); void bpf_gen__prog_load(struct bpf_gen *gen, enum bpf_prog_type prog_type, const char *prog_name, const char *license, struct bpf_insn *insns, size_t insn_cnt, diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 7b73f97b1fa1..c7bc77f4e752 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -432,47 +432,33 @@ void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, } void bpf_gen__map_create(struct bpf_gen *gen, - struct bpf_create_map_params *map_attr, int map_idx) + enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx) { - int attr_size = offsetofend(union bpf_attr, btf_vmlinux_value_type_id); + int attr_size = offsetofend(union bpf_attr, map_extra); bool close_inner_map_fd = false; int map_create_attr, idx; union bpf_attr attr; memset(&attr, 0, attr_size); - attr.map_type = map_attr->map_type; - attr.key_size = map_attr->key_size; - attr.value_size = map_attr->value_size; + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; attr.map_flags = map_attr->map_flags; attr.map_extra = map_attr->map_extra; - memcpy(attr.map_name, map_attr->name, - min((unsigned)strlen(map_attr->name), BPF_OBJ_NAME_LEN - 1)); + if (map_name) + memcpy(attr.map_name, map_name, + min((unsigned)strlen(map_name), BPF_OBJ_NAME_LEN - 1)); attr.numa_node = map_attr->numa_node; attr.map_ifindex = map_attr->map_ifindex; - attr.max_entries = map_attr->max_entries; - switch (attr.map_type) { - case BPF_MAP_TYPE_PERF_EVENT_ARRAY: - case BPF_MAP_TYPE_CGROUP_ARRAY: - case BPF_MAP_TYPE_STACK_TRACE: - case BPF_MAP_TYPE_ARRAY_OF_MAPS: - case BPF_MAP_TYPE_HASH_OF_MAPS: - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - case BPF_MAP_TYPE_CPUMAP: - 
case BPF_MAP_TYPE_XSKMAP: - case BPF_MAP_TYPE_SOCKMAP: - case BPF_MAP_TYPE_SOCKHASH: - case BPF_MAP_TYPE_QUEUE: - case BPF_MAP_TYPE_STACK: - case BPF_MAP_TYPE_RINGBUF: - break; - default: - attr.btf_key_type_id = map_attr->btf_key_type_id; - attr.btf_value_type_id = map_attr->btf_value_type_id; - } + attr.max_entries = max_entries; + attr.btf_key_type_id = map_attr->btf_key_type_id; + attr.btf_value_type_id = map_attr->btf_value_type_id; pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n", - attr.map_name, map_idx, map_attr->map_type, attr.btf_value_type_id); + attr.map_name, map_idx, map_type, attr.btf_value_type_id); map_create_attr = add_data(gen, &attr, attr_size); if (attr.btf_value_type_id) @@ -499,7 +485,7 @@ void bpf_gen__map_create(struct bpf_gen *gen, /* emit MAP_CREATE command */ emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size); debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d", - attr.map_name, map_idx, map_attr->map_type, attr.value_size, + attr.map_name, map_idx, map_type, value_size, attr.btf_value_type_id); emit_check_err(gen); /* remember map_fd in the stack, if successful */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 27695bf31250..c869a3721532 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4839,19 +4839,16 @@ static void bpf_map__destroy(struct bpf_map *map); static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) { - struct bpf_create_map_params create_attr; + LIBBPF_OPTS(bpf_map_create_opts, create_attr); struct bpf_map_def *def = &map->def; + const char *map_name = NULL; + __u32 max_entries; int err = 0; - memset(&create_attr, 0, sizeof(create_attr)); - if (kernel_supports(obj, FEAT_PROG_NAME)) - create_attr.name = map->name; + map_name = map->name; create_attr.map_ifindex = map->map_ifindex; - create_attr.map_type = def->type; create_attr.map_flags = def->map_flags; - create_attr.key_size = def->key_size; - 
create_attr.value_size = def->value_size; create_attr.numa_node = map->numa_node; create_attr.map_extra = map->map_extra; @@ -4865,18 +4862,14 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b return nr_cpus; } pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus); - create_attr.max_entries = nr_cpus; + max_entries = nr_cpus; } else { - create_attr.max_entries = def->max_entries; + max_entries = def->max_entries; } if (bpf_map__is_struct_ops(map)) - create_attr.btf_vmlinux_value_type_id = - map->btf_vmlinux_value_type_id; + create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; - create_attr.btf_fd = 0; - create_attr.btf_key_type_id = 0; - create_attr.btf_value_type_id = 0; if (obj->btf && btf__fd(obj->btf) >= 0 && !bpf_map_find_btf_info(obj, map)) { create_attr.btf_fd = btf__fd(obj->btf); create_attr.btf_key_type_id = map->btf_key_type_id; @@ -4922,13 +4915,17 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b } if (obj->gen_loader) { - bpf_gen__map_create(obj->gen_loader, &create_attr, is_inner ? -1 : map - obj->maps); + bpf_gen__map_create(obj->gen_loader, def->type, map_name, + def->key_size, def->value_size, max_entries, + &create_attr, is_inner ? -1 : map - obj->maps); /* Pretend to have valid FD to pass various fd >= 0 checks. * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. 
*/ map->fd = 0; } else { - map->fd = libbpf__bpf_create_map_xattr(&create_attr); + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + max_entries, &create_attr); } if (map->fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { @@ -4943,7 +4940,9 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.btf_value_type_id = 0; map->btf_key_type_id = 0; map->btf_value_type_id = 0; - map->fd = libbpf__bpf_create_map_xattr(&create_attr); + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + max_entries, &create_attr); } err = map->fd < 0 ? -errno : 0; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 69bc069f0a68..623002b83b2b 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -391,6 +391,7 @@ LIBBPF_0.6.0 { global: bpf_map__map_extra; bpf_map__set_map_extra; + bpf_map_create; bpf_object__next_map; bpf_object__next_program; bpf_object__prev_map; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index f7ac349650a1..311905d8ca70 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -278,27 +278,6 @@ int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); -struct bpf_create_map_params { - const char *name; - enum bpf_map_type map_type; - __u32 map_flags; - __u32 key_size; - __u32 value_size; - __u32 max_entries; - __u32 numa_node; - __u32 btf_fd; - __u32 btf_key_type_id; - __u32 btf_value_type_id; - __u32 map_ifindex; - union { - __u32 inner_map_fd; - __u32 btf_vmlinux_value_type_id; - }; - __u64 map_extra; -}; - -int libbpf__bpf_create_map_xattr(const struct bpf_create_map_params *create_attr); - struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, const char 
**prefix, int *kind); From a9606f405f2c8f24751b0a7326655a657a63ad60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Nov 2021 11:32:31 -0800 Subject: [PATCH 025/115] libbpf: Use bpf_map_create() consistently internally Remove all the remaining uses of to-be-deprecated bpf_create_map*() APIs. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124193233.3115996-3-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 30 ++++++------------------------ tools/lib/bpf/libbpf_probes.c | 30 +++++++++++++++--------------- tools/lib/bpf/skel_internal.h | 3 +-- tools/lib/bpf/xsk.c | 13 +++---------- 4 files changed, 25 insertions(+), 51 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c869a3721532..e05dd785b347 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4361,7 +4361,6 @@ static int probe_kern_prog_name(void) static int probe_kern_global_data(void) { - struct bpf_create_map_attr map_attr; char *cp, errmsg[STRERR_BUFSIZE]; struct bpf_insn insns[] = { BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), @@ -4371,13 +4370,7 @@ static int probe_kern_global_data(void) }; int ret, map, insn_cnt = ARRAY_SIZE(insns); - memset(&map_attr, 0, sizeof(map_attr)); - map_attr.map_type = BPF_MAP_TYPE_ARRAY; - map_attr.key_size = sizeof(int); - map_attr.value_size = 32; - map_attr.max_entries = 1; - - map = bpf_create_map_xattr(&map_attr); + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL); if (map < 0) { ret = -errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); @@ -4507,15 +4500,11 @@ static int probe_kern_btf_type_tag(void) static int probe_kern_array_mmap(void) { - struct bpf_create_map_attr attr = { - .map_type = BPF_MAP_TYPE_ARRAY, - .map_flags = BPF_F_MMAPABLE, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1, - }; + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); + int fd; - return 
probe_fd(bpf_create_map_xattr(&attr)); + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(int), 1, &opts); + return probe_fd(fd); } static int probe_kern_exp_attach_type(void) @@ -4554,7 +4543,6 @@ static int probe_kern_probe_read_kernel(void) static int probe_prog_bind_map(void) { - struct bpf_create_map_attr map_attr; char *cp, errmsg[STRERR_BUFSIZE]; struct bpf_insn insns[] = { BPF_MOV64_IMM(BPF_REG_0, 0), @@ -4562,13 +4550,7 @@ static int probe_prog_bind_map(void) }; int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); - memset(&map_attr, 0, sizeof(map_attr)); - map_attr.map_type = BPF_MAP_TYPE_ARRAY; - map_attr.key_size = sizeof(int); - map_attr.value_size = 32; - map_attr.max_entries = 1; - - map = bpf_create_map_xattr(&map_attr); + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL); if (map < 0) { ret = -errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 02c401e314c7..41f2be47c2ea 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -201,7 +201,6 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) { int key_size, value_size, max_entries, map_flags; __u32 btf_key_type_id = 0, btf_value_type_id = 0; - struct bpf_create_map_attr attr = {}; int fd = -1, btf_fd = -1, fd_inner; key_size = sizeof(__u32); @@ -271,34 +270,35 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + LIBBPF_OPTS(bpf_map_create_opts, opts); + /* TODO: probe for device, once libbpf has a function to create * map-in-map for offload */ if (ifindex) return false; - fd_inner = bpf_create_map(BPF_MAP_TYPE_HASH, - sizeof(__u32), sizeof(__u32), 1, 0); + fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, + sizeof(__u32), sizeof(__u32), 1, NULL); if (fd_inner < 0) return false; - fd = bpf_create_map_in_map(map_type, NULL, 
sizeof(__u32), - fd_inner, 1, 0); + + opts.inner_map_fd = fd_inner; + fd = bpf_map_create(map_type, NULL, sizeof(__u32), sizeof(__u32), 1, &opts); close(fd_inner); } else { + LIBBPF_OPTS(bpf_map_create_opts, opts); + /* Note: No other restriction on map type probes for offload */ - attr.map_type = map_type; - attr.key_size = key_size; - attr.value_size = value_size; - attr.max_entries = max_entries; - attr.map_flags = map_flags; - attr.map_ifindex = ifindex; + opts.map_flags = map_flags; + opts.map_ifindex = ifindex; if (btf_fd >= 0) { - attr.btf_fd = btf_fd; - attr.btf_key_type_id = btf_key_type_id; - attr.btf_value_type_id = btf_value_type_id; + opts.btf_fd = btf_fd; + opts.btf_key_type_id = btf_key_type_id; + opts.btf_value_type_id = btf_value_type_id; } - fd = bpf_create_map_xattr(&attr); + fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts); } if (fd >= 0) close(fd); diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index 9cf66702fa8d..b206532704ce 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -65,8 +65,7 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) int map_fd = -1, prog_fd = -1, key = 0, err; union bpf_attr attr; - map_fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, - opts->data_sz, 1, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1, NULL); if (map_fd < 0) { opts->errstr = "failed to create loader map"; err = -errno; diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index fdb22f5405c9..f1c29074d527 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -364,7 +364,6 @@ int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area, static enum xsk_prog get_xsk_prog(void) { enum xsk_prog detected = XSK_PROG_FALLBACK; - struct bpf_create_map_attr map_attr; __u32 size_out, retval, duration; char data_in = 0, data_out; struct bpf_insn insns[] = { @@ -376,13 +375,7 @@ static 
enum xsk_prog get_xsk_prog(void) }; int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns); - memset(&map_attr, 0, sizeof(map_attr)); - map_attr.map_type = BPF_MAP_TYPE_XSKMAP; - map_attr.key_size = sizeof(int); - map_attr.value_size = sizeof(int); - map_attr.max_entries = 1; - - map_fd = bpf_create_map_xattr(&map_attr); + map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL); if (map_fd < 0) return detected; @@ -586,8 +579,8 @@ static int xsk_create_bpf_maps(struct xsk_socket *xsk) if (max_queues < 0) return max_queues; - fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map", - sizeof(int), sizeof(int), max_queues, 0); + fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map", + sizeof(int), sizeof(int), max_queues, NULL); if (fd < 0) return fd; From 99a12a32fee4f740af2f36bb8f64e11c026f3389 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Nov 2021 11:32:32 -0800 Subject: [PATCH 026/115] libbpf: Prevent deprecation warnings in xsk.c xsk.c is using own APIs that are marked for deprecation internally. Given xsk.c and xsk.h will be gone in libbpf 1.0, there is no reason to do public vs internal function split just to avoid deprecation warnings. So just add a pragma to silence deprecation warnings (until the code is removed completely). 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124193233.3115996-4-andrii@kernel.org --- tools/lib/bpf/xsk.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index f1c29074d527..e8d94c6dd3bc 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -35,6 +35,11 @@ #include "libbpf_internal.h" #include "xsk.h" +/* entire xsk.h and xsk.c is going away in libbpf 1.0, so ignore all internal + * uses of deprecated APIs + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + #ifndef SOL_XDP #define SOL_XDP 283 #endif From 2fe256a429cb6c0b0064563af4158470143a363c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 24 Nov 2021 11:32:33 -0800 Subject: [PATCH 027/115] selftests/bpf: Migrate selftests to bpf_map_create() Conversion is straightforward for most cases. In a few cases tests are using mutable map_flags and attribute structs, but bpf_map_create_opts can be used in a similar fashion, so there were no problems. Just lots of repetitive conversions.
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124193233.3115996-5-andrii@kernel.org --- .../bpf/map_tests/array_map_batch_ops.c | 13 +-- .../bpf/map_tests/htab_map_batch_ops.c | 13 +-- .../bpf/map_tests/lpm_trie_map_batch_ops.c | 15 +-- .../selftests/bpf/map_tests/sk_storage_map.c | 50 ++++---- .../bpf/prog_tests/bloom_filter_map.c | 36 +++--- .../selftests/bpf/prog_tests/bpf_iter.c | 8 +- tools/testing/selftests/bpf/prog_tests/btf.c | 51 +++----- .../bpf/prog_tests/cgroup_attach_multi.c | 12 +- .../selftests/bpf/prog_tests/pinning.c | 4 +- .../selftests/bpf/prog_tests/ringbuf_multi.c | 4 +- .../bpf/prog_tests/select_reuseport.c | 21 +--- .../selftests/bpf/prog_tests/sockmap_basic.c | 4 +- .../selftests/bpf/prog_tests/sockmap_ktls.c | 2 +- .../selftests/bpf/prog_tests/sockmap_listen.c | 4 +- .../selftests/bpf/prog_tests/test_bpffs.c | 2 +- .../selftests/bpf/test_cgroup_storage.c | 8 +- tools/testing/selftests/bpf/test_lpm_map.c | 27 +++-- tools/testing/selftests/bpf/test_lru_map.c | 16 +-- tools/testing/selftests/bpf/test_maps.c | 110 +++++++++--------- tools/testing/selftests/bpf/test_tag.c | 5 +- tools/testing/selftests/bpf/test_verifier.c | 52 ++++----- 21 files changed, 201 insertions(+), 256 deletions(-) diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c index f4d870da7684..78c76496b14a 100644 --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -68,13 +68,6 @@ static void map_batch_verify(int *visited, __u32 max_entries, int *keys, static void __test_map_lookup_and_update_batch(bool is_pcpu) { - struct bpf_create_map_attr xattr = { - .name = "array_map", - .map_type = is_pcpu ? 
BPF_MAP_TYPE_PERCPU_ARRAY : - BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(__s64), - }; int map_fd, *keys, *visited; __u32 count, total, total_success; const __u32 max_entries = 10; @@ -86,10 +79,10 @@ static void __test_map_lookup_and_update_batch(bool is_pcpu) .flags = 0, ); - xattr.max_entries = max_entries; - map_fd = bpf_create_map_xattr(&xattr); + map_fd = bpf_map_create(is_pcpu ? BPF_MAP_TYPE_PERCPU_ARRAY : BPF_MAP_TYPE_ARRAY, + "array_map", sizeof(int), sizeof(__s64), max_entries, NULL); CHECK(map_fd == -1, - "bpf_create_map_xattr()", "error:%s\n", strerror(errno)); + "bpf_map_create()", "error:%s\n", strerror(errno)); value_size = sizeof(__s64); if (is_pcpu) diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c index 976bf415fbdd..f807d53fd8dd 100644 --- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c @@ -83,22 +83,15 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu) int err, step, value_size; bool nospace_err; void *values; - struct bpf_create_map_attr xattr = { - .name = "hash_map", - .map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_HASH : - BPF_MAP_TYPE_HASH, - .key_size = sizeof(int), - .value_size = sizeof(int), - }; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); - xattr.max_entries = max_entries; - map_fd = bpf_create_map_xattr(&xattr); + map_fd = bpf_map_create(is_pcpu ? BPF_MAP_TYPE_PERCPU_HASH : BPF_MAP_TYPE_HASH, + "hash_map", sizeof(int), sizeof(int), max_entries, NULL); CHECK(map_fd == -1, - "bpf_create_map_xattr()", "error:%s\n", strerror(errno)); + "bpf_map_create()", "error:%s\n", strerror(errno)); value_size = is_pcpu ? 
sizeof(value) : sizeof(int); keys = malloc(max_entries * sizeof(int)); diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c index 2e986e5e4cac..87d07b596e17 100644 --- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -64,13 +64,7 @@ static void map_batch_verify(int *visited, __u32 max_entries, void test_lpm_trie_map_batch_ops(void) { - struct bpf_create_map_attr xattr = { - .name = "lpm_trie_map", - .map_type = BPF_MAP_TYPE_LPM_TRIE, - .key_size = sizeof(struct test_lpm_key), - .value_size = sizeof(int), - .map_flags = BPF_F_NO_PREALLOC, - }; + LIBBPF_OPTS(bpf_map_create_opts, create_opts, .map_flags = BPF_F_NO_PREALLOC); struct test_lpm_key *keys, key; int map_fd, *values, *visited; __u32 step, count, total, total_success; @@ -82,9 +76,10 @@ void test_lpm_trie_map_batch_ops(void) .flags = 0, ); - xattr.max_entries = max_entries; - map_fd = bpf_create_map_xattr(&xattr); - CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", + map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie_map", + sizeof(struct test_lpm_key), sizeof(int), + max_entries, &create_opts); + CHECK(map_fd == -1, "bpf_map_create()", "error:%s\n", strerror(errno)); keys = malloc(max_entries * sizeof(struct test_lpm_key)); diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c index e569edc679d8..8eea4ffeb092 100644 --- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c +++ b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c @@ -19,16 +19,12 @@ #include #include -static struct bpf_create_map_attr xattr = { - .name = "sk_storage_map", - .map_type = BPF_MAP_TYPE_SK_STORAGE, - .map_flags = BPF_F_NO_PREALLOC, - .max_entries = 0, - .key_size = 4, - .value_size = 8, +static struct bpf_map_create_opts map_opts = { + .sz = sizeof(map_opts), 
.btf_key_type_id = 1, .btf_value_type_id = 3, .btf_fd = -1, + .map_flags = BPF_F_NO_PREALLOC, }; static unsigned int nr_sk_threads_done; @@ -150,13 +146,13 @@ static int create_sk_storage_map(void) btf_fd = load_btf(); CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n", btf_fd, errno); - xattr.btf_fd = btf_fd; + map_opts.btf_fd = btf_fd; - map_fd = bpf_create_map_xattr(&xattr); - xattr.btf_fd = -1; + map_fd = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &map_opts); + map_opts.btf_fd = -1; close(btf_fd); CHECK(map_fd == -1, - "bpf_create_map_xattr()", "errno:%d\n", errno); + "bpf_map_create()", "errno:%d\n", errno); return map_fd; } @@ -463,20 +459,20 @@ static void test_sk_storage_map_basic(void) int cnt; int lock; } value = { .cnt = 0xeB9f, .lock = 0, }, lookup_value; - struct bpf_create_map_attr bad_xattr; + struct bpf_map_create_opts bad_xattr; int btf_fd, map_fd, sk_fd, err; btf_fd = load_btf(); CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n", btf_fd, errno); - xattr.btf_fd = btf_fd; + map_opts.btf_fd = btf_fd; sk_fd = socket(AF_INET6, SOCK_STREAM, 0); CHECK(sk_fd == -1, "socket()", "sk_fd:%d errno:%d\n", sk_fd, errno); - map_fd = bpf_create_map_xattr(&xattr); - CHECK(map_fd == -1, "bpf_create_map_xattr(good_xattr)", + map_fd = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &map_opts); + CHECK(map_fd == -1, "bpf_map_create(good_xattr)", "map_fd:%d errno:%d\n", map_fd, errno); /* Add new elem */ @@ -560,31 +556,29 @@ static void test_sk_storage_map_basic(void) CHECK(!err || errno != ENOENT, "bpf_map_delete_elem()", "err:%d errno:%d\n", err, errno); - memcpy(&bad_xattr, &xattr, sizeof(xattr)); + memcpy(&bad_xattr, &map_opts, sizeof(map_opts)); bad_xattr.btf_key_type_id = 0; - err = bpf_create_map_xattr(&bad_xattr); - CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)", + err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &bad_xattr); + CHECK(!err || errno != EINVAL, 
"bpf_map_create(bad_xattr)", "err:%d errno:%d\n", err, errno); - memcpy(&bad_xattr, &xattr, sizeof(xattr)); + memcpy(&bad_xattr, &map_opts, sizeof(map_opts)); bad_xattr.btf_key_type_id = 3; - err = bpf_create_map_xattr(&bad_xattr); - CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)", + err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &bad_xattr); + CHECK(!err || errno != EINVAL, "bpf_map_create(bad_xattr)", "err:%d errno:%d\n", err, errno); - memcpy(&bad_xattr, &xattr, sizeof(xattr)); - bad_xattr.max_entries = 1; - err = bpf_create_map_xattr(&bad_xattr); - CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)", + err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 1, &map_opts); + CHECK(!err || errno != EINVAL, "bpf_map_create(bad_xattr)", "err:%d errno:%d\n", err, errno); - memcpy(&bad_xattr, &xattr, sizeof(xattr)); + memcpy(&bad_xattr, &map_opts, sizeof(map_opts)); bad_xattr.map_flags = 0; - err = bpf_create_map_xattr(&bad_xattr); + err = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "sk_storage_map", 4, 8, 0, &bad_xattr); CHECK(!err || errno != EINVAL, "bap_create_map_xattr(bad_xattr)", "err:%d errno:%d\n", err, errno); - xattr.btf_fd = -1; + map_opts.btf_fd = -1; close(btf_fd); close(map_fd); close(sk_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c index be73e3de6668..d2d9e965eba5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c +++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c @@ -7,32 +7,33 @@ static void test_fail_cases(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts); __u32 value; int fd, err; /* Invalid key size */ - fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 4, sizeof(value), 100, 0); - if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid key size")) + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 4, sizeof(value), 100, NULL); + if (!ASSERT_LT(fd, 0, 
"bpf_map_create bloom filter invalid key size")) close(fd); /* Invalid value size */ - fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, 0, 100, 0); - if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid value size 0")) + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, 0, 100, NULL); + if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value size 0")) close(fd); /* Invalid max entries size */ - fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 0, 0); - if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid max entries size")) + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 0, NULL); + if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid max entries size")) close(fd); /* Bloom filter maps do not support BPF_F_NO_PREALLOC */ - fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, - BPF_F_NO_PREALLOC); - if (!ASSERT_LT(fd, 0, "bpf_create_map bloom filter invalid flags")) + opts.map_flags = BPF_F_NO_PREALLOC; + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 100, &opts); + if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid flags")) close(fd); - fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, 0); - if (!ASSERT_GE(fd, 0, "bpf_create_map bloom filter")) + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 100, NULL); + if (!ASSERT_GE(fd, 0, "bpf_map_create bloom filter")) return; /* Test invalid flags */ @@ -56,13 +57,14 @@ static void test_fail_cases(void) static void test_success_cases(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts); char value[11]; int fd, err; /* Create a map */ - fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(value), 100, - BPF_F_ZERO_SEED | BPF_F_NUMA_NODE); - if (!ASSERT_GE(fd, 0, "bpf_create_map bloom filter success case")) + opts.map_flags = BPF_F_ZERO_SEED | BPF_F_NUMA_NODE; + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 100, &opts); + if 
(!ASSERT_GE(fd, 0, "bpf_map_create bloom filter success case")) return; /* Add a value to the bloom filter */ @@ -100,9 +102,9 @@ static void test_inner_map(struct bloom_filter_map *skel, const __u32 *rand_vals struct bpf_link *link; /* Create a bloom filter map that will be used as the inner map */ - inner_map_fd = bpf_create_map(BPF_MAP_TYPE_BLOOM_FILTER, 0, sizeof(*rand_vals), - nr_rand_vals, 0); - if (!ASSERT_GE(inner_map_fd, 0, "bpf_create_map bloom filter inner map")) + inner_map_fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(*rand_vals), + nr_rand_vals, NULL); + if (!ASSERT_GE(inner_map_fd, 0, "bpf_map_create bloom filter inner map")) return; for (i = 0; i < nr_rand_vals; i++) { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 3e10abce3e5a..0b996be923b5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -469,12 +469,12 @@ static void test_overflow(bool test_e2big_overflow, bool ret1) * fills seq_file buffer and then the other will trigger * overflow and needs restart. 
*/ - map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); - if (CHECK(map1_fd < 0, "bpf_create_map", + map1_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL); + if (CHECK(map1_fd < 0, "bpf_map_create", "map_creation failed: %s\n", strerror(errno))) goto out; - map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); - if (CHECK(map2_fd < 0, "bpf_create_map", + map2_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL); + if (CHECK(map2_fd < 0, "bpf_map_create", "map_creation failed: %s\n", strerror(errno))) goto free_map1; diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index f9326a13badb..cab810bab593 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4074,7 +4074,7 @@ static void *btf_raw_create(const struct btf_header *hdr, static void do_test_raw(unsigned int test_num) { struct btf_raw_test *test = &raw_tests[test_num - 1]; - struct bpf_create_map_attr create_attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts); int map_fd = -1, btf_fd = -1; unsigned int raw_btf_size; struct btf_header *hdr; @@ -4117,16 +4117,11 @@ static void do_test_raw(unsigned int test_num) if (err || btf_fd < 0) goto done; - create_attr.name = test->map_name; - create_attr.map_type = test->map_type; - create_attr.key_size = test->key_size; - create_attr.value_size = test->value_size; - create_attr.max_entries = test->max_entries; - create_attr.btf_fd = btf_fd; - create_attr.btf_key_type_id = test->key_type_id; - create_attr.btf_value_type_id = test->value_type_id; - - map_fd = bpf_create_map_xattr(&create_attr); + opts.btf_fd = btf_fd; + opts.btf_key_type_id = test->key_type_id; + opts.btf_value_type_id = test->value_type_id; + map_fd = bpf_map_create(test->map_type, test->map_name, + test->key_size, test->value_size, test->max_entries, &opts); err = ((map_fd < 0) != test->map_create_err); CHECK(err, "map_fd:%d test->map_create_err:%u", @@ -4290,7 
+4285,7 @@ static int test_big_btf_info(unsigned int test_num) static int test_btf_id(unsigned int test_num) { const struct btf_get_info_test *test = &get_info_tests[test_num - 1]; - struct bpf_create_map_attr create_attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts); uint8_t *raw_btf = NULL, *user_btf[2] = {}; int btf_fd[2] = {-1, -1}, map_fd = -1; struct bpf_map_info map_info = {}; @@ -4355,16 +4350,11 @@ static int test_btf_id(unsigned int test_num) } /* Test btf members in struct bpf_map_info */ - create_attr.name = "test_btf_id"; - create_attr.map_type = BPF_MAP_TYPE_ARRAY; - create_attr.key_size = sizeof(int); - create_attr.value_size = sizeof(unsigned int); - create_attr.max_entries = 4; - create_attr.btf_fd = btf_fd[0]; - create_attr.btf_key_type_id = 1; - create_attr.btf_value_type_id = 2; - - map_fd = bpf_create_map_xattr(&create_attr); + opts.btf_fd = btf_fd[0]; + opts.btf_key_type_id = 1; + opts.btf_value_type_id = 2; + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "test_btf_id", + sizeof(int), sizeof(int), 4, &opts); if (CHECK(map_fd < 0, "errno:%d", errno)) { err = -1; goto done; @@ -5153,7 +5143,7 @@ static void do_test_pprint(int test_num) { const struct btf_raw_test *test = &pprint_test_template[test_num]; enum pprint_mapv_kind_t mapv_kind = test->mapv_kind; - struct bpf_create_map_attr create_attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts); bool ordered_map, lossless_map, percpu_map; int err, ret, num_cpus, rounded_value_size; unsigned int key, nr_read_elems; @@ -5189,16 +5179,11 @@ static void do_test_pprint(int test_num) goto done; } - create_attr.name = test->map_name; - create_attr.map_type = test->map_type; - create_attr.key_size = test->key_size; - create_attr.value_size = test->value_size; - create_attr.max_entries = test->max_entries; - create_attr.btf_fd = btf_fd; - create_attr.btf_key_type_id = test->key_type_id; - create_attr.btf_value_type_id = test->value_type_id; - - map_fd = bpf_create_map_xattr(&create_attr); + opts.btf_fd = 
btf_fd; + opts.btf_key_type_id = test->key_type_id; + opts.btf_value_type_id = test->value_type_id; + map_fd = bpf_map_create(test->map_type, test->map_name, + test->key_size, test->value_size, test->max_entries, &opts); if (CHECK(map_fd < 0, "errno:%d", errno)) { err = -1; goto done; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index de9c3e12b0ea..d3e8f729c623 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -15,22 +15,22 @@ static int prog_load_cnt(int verdict, int val) int cgroup_storage_fd, percpu_cgroup_storage_fd; if (map_fd < 0) - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 8, 1, NULL); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); return -1; } - cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, - sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + cgroup_storage_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL, + sizeof(struct bpf_cgroup_storage_key), 8, 0, NULL); if (cgroup_storage_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); return -1; } - percpu_cgroup_storage_fd = bpf_create_map( - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, - sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); + percpu_cgroup_storage_fd = bpf_map_create( + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL, + sizeof(struct bpf_cgroup_storage_key), 8, 0, NULL); if (percpu_cgroup_storage_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); return -1; diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c index d4b953ae3407..31c09ba577eb 100644 --- a/tools/testing/selftests/bpf/prog_tests/pinning.c +++ b/tools/testing/selftests/bpf/prog_tests/pinning.c @@ -241,8 +241,8 @@ void test_pinning(void) goto out; } - map_fd = 
bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32), - sizeof(__u64), 1, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(__u32), + sizeof(__u64), 1, NULL); if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd)) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index 167cd8a2edfd..e945195b24c9 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -62,8 +62,8 @@ void test_ringbuf_multi(void) if (CHECK(err != 0, "bpf_map__set_max_entries", "bpf_map__set_max_entries failed\n")) goto cleanup; - proto_fd = bpf_create_map(BPF_MAP_TYPE_RINGBUF, 0, 0, page_size, 0); - if (CHECK(proto_fd < 0, "bpf_create_map", "bpf_create_map failed\n")) + proto_fd = bpf_map_create(BPF_MAP_TYPE_RINGBUF, NULL, 0, 0, page_size, NULL); + if (CHECK(proto_fd < 0, "bpf_map_create", "bpf_map_create failed\n")) goto cleanup; err = bpf_map__set_inner_map_fd(skel->maps.ringbuf_hash, proto_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 3cfc910ab3c1..980ac0f2c0bb 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -66,29 +66,20 @@ static union sa46 { static int create_maps(enum bpf_map_type inner_type) { - struct bpf_create_map_attr attr = {}; + LIBBPF_OPTS(bpf_map_create_opts, opts); inner_map_type = inner_type; /* Creating reuseport_array */ - attr.name = "reuseport_array"; - attr.map_type = inner_type; - attr.key_size = sizeof(__u32); - attr.value_size = sizeof(__u32); - attr.max_entries = REUSEPORT_ARRAY_SIZE; - - reuseport_array = bpf_create_map_xattr(&attr); + reuseport_array = bpf_map_create(inner_type, "reuseport_array", + sizeof(__u32), sizeof(__u32), REUSEPORT_ARRAY_SIZE, NULL); RET_ERR(reuseport_array < 0, "creating 
reuseport_array", "reuseport_array:%d errno:%d\n", reuseport_array, errno); /* Creating outer_map */ - attr.name = "outer_map"; - attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS; - attr.key_size = sizeof(__u32); - attr.value_size = sizeof(__u32); - attr.max_entries = 1; - attr.inner_map_fd = reuseport_array; - outer_map = bpf_create_map_xattr(&attr); + opts.inner_map_fd = reuseport_array; + outer_map = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer_map", + sizeof(__u32), sizeof(__u32), 1, &opts); RET_ERR(outer_map < 0, "creating outer_map", "outer_map:%d errno:%d\n", outer_map, errno); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1352ec104149..85db0f4cdd95 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -91,9 +91,9 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type) if (CHECK_FAIL(s < 0)) return; - map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); + map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL); if (CHECK_FAIL(map < 0)) { - perror("bpf_create_map"); + perror("bpf_cmap_create"); goto out; } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index 7a0d64fdc192..af293ea1542c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -97,7 +97,7 @@ static void run_tests(int family, enum bpf_map_type map_type) char test_name[MAX_TEST_NAME]; int map; - map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0); + map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL); if (CHECK_FAIL(map < 0)) { perror("bpf_map_create"); return; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 
2a9cb951bfd6..7e21bfab6358 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -502,8 +502,8 @@ static void test_lookup_32_bit_value(int family, int sotype, int mapfd) if (s < 0) return; - mapfd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(key), - sizeof(value32), 1, 0); + mapfd = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(key), + sizeof(value32), 1, NULL); if (mapfd < 0) { FAIL_ERRNO("map_create"); goto close; diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c index d29ebfeef9c5..ada95bfb9b1b 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c @@ -80,7 +80,7 @@ static int fn(void) if (!ASSERT_OK(err, "creating " TDIR "/fs1/b")) goto out; - map = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 4, 1, 0); + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); if (!ASSERT_GT(map, 0, "create_map(ARRAY)")) goto out; err = bpf_obj_pin(map, TDIR "/fs1/c"); diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index a63787e7bb1a..5b8314cd77fd 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -51,15 +51,15 @@ int main(int argc, char **argv) goto err; } - map_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, sizeof(key), - sizeof(value), 0, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_STORAGE, NULL, sizeof(key), + sizeof(value), 0, NULL); if (map_fd < 0) { printf("Failed to create map: %s\n", strerror(errno)); goto out; } - percpu_map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, - sizeof(key), sizeof(value), 0, 0); + percpu_map_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, NULL, + sizeof(key), sizeof(value), 0, NULL); if (percpu_map_fd < 0) { printf("Failed to create map: %s\n", 
strerror(errno)); goto out; diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index 006be3963977..baa3e3ecae82 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -208,6 +208,7 @@ static void test_lpm_order(void) static void test_lpm_map(int keysize) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups; struct tlpm_node *t, *list = NULL; struct bpf_lpm_trie_key *key; @@ -233,11 +234,11 @@ static void test_lpm_map(int keysize) key = alloca(sizeof(*key) + keysize); memset(key, 0, sizeof(*key) + keysize); - map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, + map = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, sizeof(*key) + keysize, keysize + 1, 4096, - BPF_F_NO_PREALLOC); + &opts); assert(map >= 0); for (i = 0; i < n_nodes; ++i) { @@ -329,6 +330,7 @@ static void test_lpm_map(int keysize) static void test_lpm_ipaddr(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); struct bpf_lpm_trie_key *key_ipv4; struct bpf_lpm_trie_key *key_ipv6; size_t key_size_ipv4; @@ -342,14 +344,14 @@ static void test_lpm_ipaddr(void) key_ipv4 = alloca(key_size_ipv4); key_ipv6 = alloca(key_size_ipv6); - map_fd_ipv4 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, + map_fd_ipv4 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size_ipv4, sizeof(value), - 100, BPF_F_NO_PREALLOC); + 100, &opts); assert(map_fd_ipv4 >= 0); - map_fd_ipv6 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, + map_fd_ipv6 = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size_ipv6, sizeof(value), - 100, BPF_F_NO_PREALLOC); + 100, &opts); assert(map_fd_ipv6 >= 0); /* Fill data some IPv4 and IPv6 address ranges */ @@ -423,6 +425,7 @@ static void test_lpm_ipaddr(void) static void test_lpm_delete(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); struct bpf_lpm_trie_key *key; size_t key_size; 
int map_fd; @@ -431,9 +434,9 @@ static void test_lpm_delete(void) key_size = sizeof(*key) + sizeof(__u32); key = alloca(key_size); - map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, + map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, sizeof(value), - 100, BPF_F_NO_PREALLOC); + 100, &opts); assert(map_fd >= 0); /* Add nodes: @@ -535,6 +538,7 @@ static void test_lpm_delete(void) static void test_lpm_get_next_key(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); struct bpf_lpm_trie_key *key_p, *next_key_p; size_t key_size; __u32 value = 0; @@ -544,8 +548,7 @@ static void test_lpm_get_next_key(void) key_p = alloca(key_size); next_key_p = alloca(key_size); - map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, sizeof(value), - 100, BPF_F_NO_PREALLOC); + map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, sizeof(value), 100, &opts); assert(map_fd >= 0); /* empty tree. get_next_key should return ENOENT */ @@ -753,6 +756,7 @@ static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd) static void test_lpm_multi_thread(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); struct lpm_mt_test_info info[4]; size_t key_size, value_size; pthread_t thread_id[4]; @@ -762,8 +766,7 @@ static void test_lpm_multi_thread(void) /* create a trie */ value_size = sizeof(__u32); key_size = sizeof(struct bpf_lpm_trie_key) + value_size; - map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size, - 100, BPF_F_NO_PREALLOC); + map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, NULL, key_size, value_size, 100, &opts); /* create 4 threads to test update, delete, lookup and get_next_key */ setup_lpm_mt_test_info(&info[0], map_fd); diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 7f3d1d8460b4..b9f1bbbc8aba 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -28,13 +28,14 @@ 
static int nr_cpus; static int create_map(int map_type, int map_flags, unsigned int size) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags); int map_fd; - map_fd = bpf_create_map(map_type, sizeof(unsigned long long), - sizeof(unsigned long long), size, map_flags); + map_fd = bpf_map_create(map_type, NULL, sizeof(unsigned long long), + sizeof(unsigned long long), size, &opts); if (map_fd == -1) - perror("bpf_create_map"); + perror("bpf_map_create"); return map_fd; } @@ -42,7 +43,6 @@ static int create_map(int map_type, int map_flags, unsigned int size) static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key, void *value) { - struct bpf_create_map_attr map; struct bpf_insn insns[] = { BPF_LD_MAP_VALUE(BPF_REG_9, 0, 0), BPF_LD_MAP_FD(BPF_REG_1, fd), @@ -63,13 +63,7 @@ static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key, int mfd, pfd, ret, zero = 0; __u32 retval = 0; - memset(&map, 0, sizeof(map)); - map.map_type = BPF_MAP_TYPE_ARRAY; - map.key_size = sizeof(int); - map.value_size = sizeof(unsigned long long); - map.max_entries = 1; - - mfd = bpf_create_map_xattr(&map); + mfd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(__u64), 1, NULL); if (mfd < 0) return -1; diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 8b31bc1a801d..f4cd658bbe00 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -33,15 +33,14 @@ static int skips; -static int map_flags; +static struct bpf_map_create_opts map_opts = { .sz = sizeof(map_opts) }; static void test_hashmap(unsigned int task, void *data) { long long key, next_key, first_key, value; int fd; - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - 2, map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), 2, &map_opts); if (fd < 0) { printf("Failed to create hashmap '%s'!\n", strerror(errno)); exit(1); @@ -138,8 
+137,7 @@ static void test_hashmap_sizes(unsigned int task, void *data) for (i = 1; i <= 512; i <<= 1) for (j = 1; j <= 1 << 18; j <<= 1) { - fd = bpf_create_map(BPF_MAP_TYPE_HASH, i, j, - 2, map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, i, j, 2, &map_opts); if (fd < 0) { if (errno == ENOMEM) return; @@ -160,8 +158,8 @@ static void test_hashmap_percpu(unsigned int task, void *data) int expected_key_mask = 0; int fd, i; - fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key), - sizeof(bpf_percpu(value, 0)), 2, map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_HASH, NULL, sizeof(key), + sizeof(bpf_percpu(value, 0)), 2, &map_opts); if (fd < 0) { printf("Failed to create hashmap '%s'!\n", strerror(errno)); exit(1); @@ -272,11 +270,11 @@ static int helper_fill_hashmap(int max_entries) int i, fd, ret; long long key, value; - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - max_entries, map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), + max_entries, &map_opts); CHECK(fd < 0, "failed to create hashmap", - "err: %s, flags: 0x%x\n", strerror(errno), map_flags); + "err: %s, flags: 0x%x\n", strerror(errno), map_opts.map_flags); for (i = 0; i < max_entries; i++) { key = i; value = key; @@ -332,8 +330,8 @@ static void test_hashmap_zero_seed(void) int i, first, second, old_flags; long long key, next_first, next_second; - old_flags = map_flags; - map_flags |= BPF_F_ZERO_SEED; + old_flags = map_opts.map_flags; + map_opts.map_flags |= BPF_F_ZERO_SEED; first = helper_fill_hashmap(3); second = helper_fill_hashmap(3); @@ -355,7 +353,7 @@ static void test_hashmap_zero_seed(void) key = next_first; } - map_flags = old_flags; + map_opts.map_flags = old_flags; close(first); close(second); } @@ -365,8 +363,7 @@ static void test_arraymap(unsigned int task, void *data) int key, next_key, fd; long long value; - fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), - 2, 0); + fd = 
bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(value), 2, NULL); if (fd < 0) { printf("Failed to create arraymap '%s'!\n", strerror(errno)); exit(1); @@ -421,8 +418,8 @@ static void test_arraymap_percpu(unsigned int task, void *data) BPF_DECLARE_PERCPU(long, values); int key, next_key, fd, i; - fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), - sizeof(bpf_percpu(values, 0)), 2, 0); + fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, NULL, sizeof(key), + sizeof(bpf_percpu(values, 0)), 2, NULL); if (fd < 0) { printf("Failed to create arraymap '%s'!\n", strerror(errno)); exit(1); @@ -484,8 +481,8 @@ static void test_arraymap_percpu_many_keys(void) unsigned int nr_keys = 2000; int key, fd, i; - fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), - sizeof(bpf_percpu(values, 0)), nr_keys, 0); + fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, NULL, sizeof(key), + sizeof(bpf_percpu(values, 0)), nr_keys, NULL); if (fd < 0) { printf("Failed to create per-cpu arraymap '%s'!\n", strerror(errno)); @@ -516,8 +513,7 @@ static void test_devmap(unsigned int task, void *data) int fd; __u32 key, value; - fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), - 2, 0); + fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP, NULL, sizeof(key), sizeof(value), 2, NULL); if (fd < 0) { printf("Failed to create devmap '%s'!\n", strerror(errno)); exit(1); @@ -531,8 +527,7 @@ static void test_devmap_hash(unsigned int task, void *data) int fd; __u32 key, value; - fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_HASH, sizeof(key), sizeof(value), - 2, 0); + fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP_HASH, NULL, sizeof(key), sizeof(value), 2, NULL); if (fd < 0) { printf("Failed to create devmap_hash '%s'!\n", strerror(errno)); exit(1); @@ -552,14 +547,12 @@ static void test_queuemap(unsigned int task, void *data) vals[i] = rand(); /* Invalid key size */ - fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 4, sizeof(val), MAP_SIZE, - map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, 
NULL, 4, sizeof(val), MAP_SIZE, &map_opts); assert(fd < 0 && errno == EINVAL); - fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(val), MAP_SIZE, - map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, sizeof(val), MAP_SIZE, &map_opts); /* Queue map does not support BPF_F_NO_PREALLOC */ - if (map_flags & BPF_F_NO_PREALLOC) { + if (map_opts.map_flags & BPF_F_NO_PREALLOC) { assert(fd < 0 && errno == EINVAL); return; } @@ -610,14 +603,12 @@ static void test_stackmap(unsigned int task, void *data) vals[i] = rand(); /* Invalid key size */ - fd = bpf_create_map(BPF_MAP_TYPE_STACK, 4, sizeof(val), MAP_SIZE, - map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_STACK, NULL, 4, sizeof(val), MAP_SIZE, &map_opts); assert(fd < 0 && errno == EINVAL); - fd = bpf_create_map(BPF_MAP_TYPE_STACK, 0, sizeof(val), MAP_SIZE, - map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_STACK, NULL, 0, sizeof(val), MAP_SIZE, &map_opts); /* Stack map does not support BPF_F_NO_PREALLOC */ - if (map_flags & BPF_F_NO_PREALLOC) { + if (map_opts.map_flags & BPF_F_NO_PREALLOC) { assert(fd < 0 && errno == EINVAL); return; } @@ -744,9 +735,9 @@ static void test_sockmap(unsigned int tasks, void *data) } /* Test sockmap with connected sockets */ - fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, + fd = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(key), sizeof(value), - 6, 0); + 6, NULL); if (fd < 0) { if (!bpf_probe_map_type(BPF_MAP_TYPE_SOCKMAP, 0)) { printf("%s SKIP (unsupported map type BPF_MAP_TYPE_SOCKMAP)\n", @@ -1168,8 +1159,7 @@ static void test_map_in_map(void) obj = bpf_object__open(MAPINMAP_PROG); - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int), sizeof(int), - 2, 0); + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(int), sizeof(int), 2, NULL); if (fd < 0) { printf("Failed to create hashmap '%s'!\n", strerror(errno)); exit(1); @@ -1315,8 +1305,8 @@ static void test_map_large(void) } key; int fd, i, value; - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - MAP_SIZE, 
map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), + MAP_SIZE, &map_opts); if (fd < 0) { printf("Failed to create large map '%s'!\n", strerror(errno)); exit(1); @@ -1469,8 +1459,8 @@ static void test_map_parallel(void) int i, fd, key = 0, value = 0; int data[2]; - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - MAP_SIZE, map_flags); + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), + MAP_SIZE, &map_opts); if (fd < 0) { printf("Failed to create map for parallel test '%s'!\n", strerror(errno)); @@ -1518,9 +1508,13 @@ static void test_map_parallel(void) static void test_map_rdonly(void) { int fd, key = 0, value = 0; + __u32 old_flags; - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - MAP_SIZE, map_flags | BPF_F_RDONLY); + old_flags = map_opts.map_flags; + map_opts.map_flags |= BPF_F_RDONLY; + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), + MAP_SIZE, &map_opts); + map_opts.map_flags = old_flags; if (fd < 0) { printf("Failed to create map for read only test '%s'!\n", strerror(errno)); @@ -1543,9 +1537,13 @@ static void test_map_rdonly(void) static void test_map_wronly_hash(void) { int fd, key = 0, value = 0; + __u32 old_flags; - fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), - MAP_SIZE, map_flags | BPF_F_WRONLY); + old_flags = map_opts.map_flags; + map_opts.map_flags |= BPF_F_WRONLY; + fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(key), sizeof(value), + MAP_SIZE, &map_opts); + map_opts.map_flags = old_flags; if (fd < 0) { printf("Failed to create map for write only test '%s'!\n", strerror(errno)); @@ -1567,13 +1565,17 @@ static void test_map_wronly_hash(void) static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type) { int fd, value = 0; + __u32 old_flags; + assert(map_type == BPF_MAP_TYPE_QUEUE || map_type == BPF_MAP_TYPE_STACK); - fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE, - map_flags | 
BPF_F_WRONLY); + old_flags = map_opts.map_flags; + map_opts.map_flags |= BPF_F_WRONLY; + fd = bpf_map_create(map_type, NULL, 0, sizeof(value), MAP_SIZE, &map_opts); + map_opts.map_flags = old_flags; /* Stack/Queue maps do not support BPF_F_NO_PREALLOC */ - if (map_flags & BPF_F_NO_PREALLOC) { + if (map_opts.map_flags & BPF_F_NO_PREALLOC) { assert(fd < 0 && errno == EINVAL); return; } @@ -1700,8 +1702,8 @@ static void test_reuseport_array(void) __u32 fds_idx = 0; int fd; - map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, - sizeof(__u32), sizeof(__u64), array_size, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, NULL, + sizeof(__u32), sizeof(__u64), array_size, NULL); CHECK(map_fd < 0, "reuseport array create", "map_fd:%d, errno:%d\n", map_fd, errno); @@ -1837,8 +1839,8 @@ static void test_reuseport_array(void) close(map_fd); /* Test 32 bit fd */ - map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, - sizeof(__u32), sizeof(__u32), array_size, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, NULL, + sizeof(__u32), sizeof(__u32), array_size, NULL); CHECK(map_fd < 0, "reuseport array create", "map_fd:%d, errno:%d\n", map_fd, errno); prepare_reuseport_grp(SOCK_STREAM, map_fd, sizeof(__u32), &fd64, @@ -1896,10 +1898,10 @@ int main(void) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - map_flags = 0; + map_opts.map_flags = 0; run_all_tests(); - map_flags = BPF_F_NO_PREALLOC; + map_opts.map_flags = BPF_F_NO_PREALLOC; run_all_tests(); #define DEFINE_TEST(name) test_##name(); diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c index 5c7bea525626..0851c42ee31c 100644 --- a/tools/testing/selftests/bpf/test_tag.c +++ b/tools/testing/selftests/bpf/test_tag.c @@ -185,11 +185,12 @@ static void do_test(uint32_t *tests, int start_insns, int fd_map, int main(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC); uint32_t tests = 0; int i, fd_map; - fd_map = 
bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int), - sizeof(int), 1, BPF_F_NO_PREALLOC); + fd_map = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(int), + sizeof(int), 1, &opts); assert(fd_map > 0); for (i = 0; i < 5; i++) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index e512b715a785..222cb063ddf4 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -461,11 +461,11 @@ static int __create_map(uint32_t type, uint32_t size_key, uint32_t size_value, uint32_t max_elem, uint32_t extra_flags) { + LIBBPF_OPTS(bpf_map_create_opts, opts); int fd; - fd = bpf_create_map(type, size_key, size_value, max_elem, - (type == BPF_MAP_TYPE_HASH ? - BPF_F_NO_PREALLOC : 0) | extra_flags); + opts.map_flags = (type == BPF_MAP_TYPE_HASH ? BPF_F_NO_PREALLOC : 0) | extra_flags; + fd = bpf_map_create(type, NULL, size_key, size_value, max_elem, &opts); if (fd < 0) { if (skip_unsupported_map(type)) return -1; @@ -521,8 +521,8 @@ static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem, { int mfd, p1fd, p2fd, p3fd; - mfd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, sizeof(int), - sizeof(int), max_elem, 0); + mfd = bpf_map_create(BPF_MAP_TYPE_PROG_ARRAY, NULL, sizeof(int), + sizeof(int), max_elem, NULL); if (mfd < 0) { if (skip_unsupported_map(BPF_MAP_TYPE_PROG_ARRAY)) return -1; @@ -552,10 +552,11 @@ static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem, static int create_map_in_map(void) { + LIBBPF_OPTS(bpf_map_create_opts, opts); int inner_map_fd, outer_map_fd; - inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), - sizeof(int), 1, 0); + inner_map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), + sizeof(int), 1, NULL); if (inner_map_fd < 0) { if (skip_unsupported_map(BPF_MAP_TYPE_ARRAY)) return -1; @@ -563,8 +564,9 @@ static int create_map_in_map(void) return inner_map_fd; } - outer_map_fd = 
bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL, - sizeof(int), inner_map_fd, 1, 0); + opts.inner_map_fd = inner_map_fd; + outer_map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL, + sizeof(int), sizeof(int), 1, &opts); if (outer_map_fd < 0) { if (skip_unsupported_map(BPF_MAP_TYPE_ARRAY_OF_MAPS)) return -1; @@ -583,8 +585,8 @@ static int create_cgroup_storage(bool percpu) BPF_MAP_TYPE_CGROUP_STORAGE; int fd; - fd = bpf_create_map(type, sizeof(struct bpf_cgroup_storage_key), - TEST_DATA_LEN, 0, 0); + fd = bpf_map_create(type, NULL, sizeof(struct bpf_cgroup_storage_key), + TEST_DATA_LEN, 0, NULL); if (fd < 0) { if (skip_unsupported_map(type)) return -1; @@ -648,22 +650,17 @@ static int load_btf(void) static int create_map_spin_lock(void) { - struct bpf_create_map_attr attr = { - .name = "test_map", - .map_type = BPF_MAP_TYPE_ARRAY, - .key_size = 4, - .value_size = 8, - .max_entries = 1, + LIBBPF_OPTS(bpf_map_create_opts, opts, .btf_key_type_id = 1, .btf_value_type_id = 3, - }; + ); int fd, btf_fd; btf_fd = load_btf(); if (btf_fd < 0) return -1; - attr.btf_fd = btf_fd; - fd = bpf_create_map_xattr(&attr); + opts.btf_fd = btf_fd; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "test_map", 4, 8, 1, &opts); if (fd < 0) printf("Failed to create map with spin_lock\n"); return fd; @@ -671,24 +668,19 @@ static int create_map_spin_lock(void) static int create_sk_storage_map(void) { - struct bpf_create_map_attr attr = { - .name = "test_map", - .map_type = BPF_MAP_TYPE_SK_STORAGE, - .key_size = 4, - .value_size = 8, - .max_entries = 0, + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC, .btf_key_type_id = 1, .btf_value_type_id = 3, - }; + ); int fd, btf_fd; btf_fd = load_btf(); if (btf_fd < 0) return -1; - attr.btf_fd = btf_fd; - fd = bpf_create_map_xattr(&attr); - close(attr.btf_fd); + opts.btf_fd = btf_fd; + fd = bpf_map_create(BPF_MAP_TYPE_SK_STORAGE, "test_map", 4, 8, 0, &opts); + close(opts.btf_fd); if (fd < 0) printf("Failed to create 
sk_storage_map\n"); return fd; From 1144ab9bdf3430e1b5b3f22741e5283841951add Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:13 -0800 Subject: [PATCH 028/115] tools/resolve_btf_ids: Close ELF file on error Fix one case where we don't do explicit clean up. Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-2-andrii@kernel.org --- tools/bpf/resolve_btfids/main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index a59cb0ee609c..e9e6166c3f28 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -167,7 +167,7 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name) return NULL; } -static struct btf_id* +static struct btf_id * btf_id__add(struct rb_root *root, char *name, bool unique) { struct rb_node **p = &root->rb_node; @@ -730,7 +730,8 @@ int main(int argc, const char **argv) if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n"); - return 0; + err = 0; + goto out; } if (symbols_collect(&obj)) From 401891a9debaf0a684502f2aaecf53448cee9414 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:14 -0800 Subject: [PATCH 029/115] libbpf: Fix potential misaligned memory access in btf_ext__new() Perform a memory copy before we do the sanity checks of btf_ext_hdr. This prevents misaligned memory access if raw btf_ext data is not 4-byte aligned ([0]). While at it, also add missing const qualifier. 
[0] Closes: https://github.com/libbpf/libbpf/issues/391 Fixes: 2993e0515bb4 ("tools/bpf: add support to read .BTF.ext sections") Reported-by: Evgeny Vereshchagin Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-3-andrii@kernel.org --- tools/lib/bpf/btf.c | 10 +++++----- tools/lib/bpf/btf.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index e97217a77196..8024fe355ca8 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2731,15 +2731,11 @@ void btf_ext__free(struct btf_ext *btf_ext) free(btf_ext); } -struct btf_ext *btf_ext__new(__u8 *data, __u32 size) +struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) { struct btf_ext *btf_ext; int err; - err = btf_ext_parse_hdr(data, size); - if (err) - return libbpf_err_ptr(err); - btf_ext = calloc(1, sizeof(struct btf_ext)); if (!btf_ext) return libbpf_err_ptr(-ENOMEM); @@ -2752,6 +2748,10 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size) } memcpy(btf_ext->data, data, size); + err = btf_ext_parse_hdr(btf_ext->data, size); + if (err) + goto done; + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, line_info_len)) { err = -EINVAL; goto done; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 5c73a5b0a044..742a2bf71c5e 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -157,7 +157,7 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, __u32 expected_value_size, __u32 *key_type_id, __u32 *value_type_id); -LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); +LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); From 2a6a9bf26170b4e156c18706cd230934ebd2f95f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:16 -0800 
Subject: [PATCH 030/115] libbpf: Don't call libc APIs with NULL pointers Sanitizer complains about qsort(), bsearch(), and memcpy() being called with NULL pointer. This can only happen when the associated number of elements is zero, so no harm should be done. But still prevent this from happening to keep sanitizer runs clean from extra noise. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-5-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e05dd785b347..672671879b21 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3369,7 +3369,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) /* sort BPF programs by section name and in-section instruction offset * for faster search */ - qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); + if (obj->nr_programs) + qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); return bpf_object__init_btf(obj, btf_data, btf_ext_data); } @@ -5823,6 +5824,8 @@ static int cmp_relo_by_insn_idx(const void *key, const void *elem) static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx) { + if (!prog->nr_reloc) + return NULL; return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc, sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx); } @@ -5838,8 +5841,9 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos)); if (!relos) return -ENOMEM; - memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, - sizeof(*relos) * subprog->nr_reloc); + if (subprog->nr_reloc) + memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, + sizeof(*relos) * subprog->nr_reloc); for (i = main_prog->nr_reloc; i < new_cnt; i++) relos[i].insn_idx += 
subprog->sub_insn_off; From 8cb125566c40b7141d8842c534f0ea5820ee3d5c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:17 -0800 Subject: [PATCH 031/115] libbpf: Fix glob_syms memory leak in bpf_linker glob_syms array wasn't freed in bpf_linker__free(). Fix that. Fixes: a46349227cd8 ("libbpf: Add linker extern resolution support for functions and global variables") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-6-andrii@kernel.org --- tools/lib/bpf/linker.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 594b206fa674..3e1b2a15fdc7 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -210,6 +210,7 @@ void bpf_linker__free(struct bpf_linker *linker) } free(linker->secs); + free(linker->glob_syms); free(linker); } From 593835377f24ca1bb98008ec1dc3baefe491ad6e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:18 -0800 Subject: [PATCH 032/115] libbpf: Fix using invalidated memory in bpf_linker add_dst_sec() can invalidate bpf_linker's section index, leaving the dst_symtab pointer pointing into unallocated memory. Reinitialize dst_symtab pointer on each iteration to make sure it's always valid. 
Fixes: faf6ed321cf6 ("libbpf: Add BPF static linker APIs") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-7-andrii@kernel.org --- tools/lib/bpf/linker.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c index 3e1b2a15fdc7..9aa016fb55aa 100644 --- a/tools/lib/bpf/linker.c +++ b/tools/lib/bpf/linker.c @@ -2000,7 +2000,7 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) { struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; - struct dst_sec *dst_symtab = &linker->secs[linker->symtab_sec_idx]; + struct dst_sec *dst_symtab; int i, err; for (i = 1; i < obj->sec_cnt; i++) { @@ -2033,6 +2033,9 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return -1; } + /* add_dst_sec() above could have invalidated linker->secs */ + dst_symtab = &linker->secs[linker->symtab_sec_idx]; + /* shdr->sh_link points to SYMTAB */ dst_sec->shdr->sh_link = linker->symtab_sec_idx; From 486e648cb2f170702fc05f777c7b6b3d8ec662ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:19 -0800 Subject: [PATCH 033/115] selftests/bpf: Fix UBSan complaint about signed __int128 overflow Test is using __int128 variable as unsigned and highest order bit can be set to 1 after bit shift. Use unsigned __int128 explicitly and prevent UBSan from complaining. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-8-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index af47aeb211e7..9e26903f9170 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -323,7 +323,7 @@ static void test_btf_dump_int_data(struct btf *btf, struct btf_dump *d, char *str) { #ifdef __SIZEOF_INT128__ - __int128 i = 0xffffffffffffffff; + unsigned __int128 i = 0xffffffffffffffff; /* this dance is required because we cannot directly initialize * a 128-bit value to anything larger than a 64-bit value. From 3bd0233f388e061c44d36a1ac614a3bb4a851b7e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:20 -0800 Subject: [PATCH 034/115] selftests/bpf: Fix possible NULL passed to memcpy() with zero size Prevent sanitizer from complaining about passing NULL into memcpy(), even if it happens with zero size. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-9-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/core_reloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c index 1041d0c593f6..44a9868c70ea 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c +++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c @@ -881,7 +881,8 @@ void test_core_reloc(void) data = mmap_data; memset(mmap_data, 0, sizeof(*data)); - memcpy(data->in, test_case->input, test_case->input_len); + if (test_case->input_len) + memcpy(data->in, test_case->input, test_case->input_len); data->my_pid_tgid = my_pid_tgid; link = bpf_program__attach_raw_tracepoint(prog, tp_name); From 6c4dedb7550aafd094f7d803668fd039545f4e57 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:21 -0800 Subject: [PATCH 035/115] selftests/bpf: Prevent misaligned memory access in get_stack_raw_tp test Perfbuf doesn't guarantee 8-byte alignment of the data like BPF ringbuf does, so struct get_stack_trace_t can arrive not properly aligned for subsequent u64 accesses. Easiest fix is to just copy data locally. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-10-andrii@kernel.org --- .../selftests/bpf/prog_tests/get_stack_raw_tp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c index 4184c399d4c6..977ab433a946 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c +++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c @@ -24,13 +24,19 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) { bool good_kern_stack = false, good_user_stack = false; const char *nonjit_func = "___bpf_prog_run"; - struct get_stack_trace_t *e = data; + /* perfbuf-submitted data is 4-byte aligned, but we need 8-byte + * alignment, so copy data into a local variable, for simplicity + */ + struct get_stack_trace_t e; int i, num_stack; static __u64 cnt; struct ksym *ks; cnt++; + memset(&e, 0, sizeof(e)); + memcpy(&e, data, size <= sizeof(e) ? 
size : sizeof(e)); + if (size < sizeof(struct get_stack_trace_t)) { __u64 *raw_data = data; bool found = false; @@ -57,19 +63,19 @@ static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size) good_user_stack = true; } } else { - num_stack = e->kern_stack_size / sizeof(__u64); + num_stack = e.kern_stack_size / sizeof(__u64); if (env.jit_enabled) { good_kern_stack = num_stack > 0; } else { for (i = 0; i < num_stack; i++) { - ks = ksym_search(e->kern_stack[i]); + ks = ksym_search(e.kern_stack[i]); if (ks && (strcmp(ks->name, nonjit_func) == 0)) { good_kern_stack = true; break; } } } - if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0) + if (e.user_stack_size > 0 && e.user_stack_buildid_size > 0) good_user_stack = true; } From e2e0d90c550a2588ebed7aa2753adaac0f633989 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:22 -0800 Subject: [PATCH 036/115] selftests/bpf: Fix misaligned memory access in queue_stack_map test Copy over iphdr into a local variable before accessing its fields. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-11-andrii@kernel.org --- .../selftests/bpf/prog_tests/queue_stack_map.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index 8ccba3ab70ee..b9822f914eeb 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -14,7 +14,7 @@ static void test_queue_stack_map_by_type(int type) int i, err, prog_fd, map_in_fd, map_out_fd; char file[32], buf[128]; struct bpf_object *obj; - struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); + struct iphdr iph; /* Fill test values to be used */ for (i = 0; i < MAP_SIZE; i++) @@ -60,15 +60,17 @@ static void test_queue_stack_map_by_type(int type) err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - if (err || retval || size != sizeof(pkt_v4) || - iph->daddr != val) + if (err || retval || size != sizeof(pkt_v4)) + break; + memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph)); + if (iph.daddr != val) break; } - CHECK(err || retval || size != sizeof(pkt_v4) || iph->daddr != val, + CHECK(err || retval || size != sizeof(pkt_v4) || iph.daddr != val, "bpf_map_pop_elem", "err %d errno %d retval %d size %d iph->daddr %u\n", - err, errno, retval, size, iph->daddr); + err, errno, retval, size, iph.daddr); /* Queue is empty, program should return TC_ACT_SHOT */ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), From 57428298b5acf2ba2dd98359c532774f6eaeecb3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:23 -0800 Subject: [PATCH 037/115] selftests/bpf: Prevent out-of-bounds stack access in test_bpffs The buffer might not be zero-terminated, causing strstr() to read data beyond the end of the buf[] array. 
Fix by forcing zero termination. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-12-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/test_bpffs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c index ada95bfb9b1b..214d9f4a94a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c @@ -19,11 +19,13 @@ static int read_iter(char *file) fd = open(file, 0); if (fd < 0) return -1; - while ((len = read(fd, buf, sizeof(buf))) > 0) + while ((len = read(fd, buf, sizeof(buf))) > 0) { + buf[sizeof(buf) - 1] = '\0'; if (strstr(buf, "iter")) { close(fd); return 0; } + } close(fd); return -1; } From 43080b7106db5bcdb4f09c2648e968151e1461b7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:24 -0800 Subject: [PATCH 038/115] selftests/bpf: Fix misaligned memory accesses in xdp_bonding test Construct packet buffer explicitly for each packet to avoid unaligned memory accesses. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-13-andrii@kernel.org --- .../selftests/bpf/prog_tests/xdp_bonding.c | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c index faa22b84f2ee..5e3a26b15ec6 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -218,9 +218,9 @@ static int send_udp_packets(int vary_dst_ip) .h_dest = BOND2_MAC, .h_proto = htons(ETH_P_IP), }; - uint8_t buf[128] = {}; - struct iphdr *iph = (struct iphdr *)(buf + sizeof(eh)); - struct udphdr *uh = (struct udphdr *)(buf + sizeof(eh) + sizeof(*iph)); + struct iphdr iph = {}; + struct udphdr uh = {}; + uint8_t buf[128]; int i, s = -1; int ifindex; @@ -232,17 +232,16 @@ static int send_udp_packets(int vary_dst_ip) if (!ASSERT_GT(ifindex, 0, "get bond1 ifindex")) goto err; - memcpy(buf, &eh, sizeof(eh)); - iph->ihl = 5; - iph->version = 4; - iph->tos = 16; - iph->id = 1; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->saddr = 1; - iph->daddr = 2; - iph->tot_len = htons(sizeof(buf) - ETH_HLEN); - iph->check = 0; + iph.ihl = 5; + iph.version = 4; + iph.tos = 16; + iph.id = 1; + iph.ttl = 64; + iph.protocol = IPPROTO_UDP; + iph.saddr = 1; + iph.daddr = 2; + iph.tot_len = htons(sizeof(buf) - ETH_HLEN); + iph.check = 0; for (i = 1; i <= NPACKETS; i++) { int n; @@ -253,10 +252,15 @@ static int send_udp_packets(int vary_dst_ip) }; /* vary the UDP destination port for even distribution with roundrobin/xor modes */ - uh->dest++; + uh.dest++; if (vary_dst_ip) - iph->daddr++; + iph.daddr++; + + /* construct a packet */ + memcpy(buf, &eh, sizeof(eh)); + memcpy(buf + sizeof(eh), &iph, sizeof(iph)); + memcpy(buf + sizeof(eh) + sizeof(iph), &uh, sizeof(uh)); n = sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&saddr_ll, 
sizeof(saddr_ll)); if (!ASSERT_EQ(n, sizeof(buf), "sendto")) From 8f6f41f39348f25db843f2fcb2f1c166b4bfa2d7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 23 Nov 2021 16:23:25 -0800 Subject: [PATCH 039/115] selftests/bpf: Fix misaligned accesses in xdp and xdp_bpf2bpf tests Similar to previous patch, just copy over necessary struct into local stack variable before checking its fields. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211124002325.1737739-14-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/xdp.c | 11 ++++++----- tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c index 7a7ef9d4e151..ac65456b7ab8 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp.c @@ -11,8 +11,8 @@ void test_xdp(void) const char *file = "./test_xdp.o"; struct bpf_object *obj; char buf[128]; - struct ipv6hdr *iph6 = (void *)buf + sizeof(struct ethhdr); - struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); + struct ipv6hdr iph6; + struct iphdr iph; __u32 duration, retval, size; int err, prog_fd, map_fd; @@ -28,16 +28,17 @@ void test_xdp(void) err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - + memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph)); CHECK(err || retval != XDP_TX || size != 74 || - iph->protocol != IPPROTO_IPIP, "ipv4", + iph.protocol != IPPROTO_IPIP, "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); + memcpy(&iph6, buf + sizeof(struct ethhdr), sizeof(iph6)); CHECK(err || retval != XDP_TX || size != 114 || - iph6->nexthdr != IPPROTO_IPV6, "ipv6", + iph6.nexthdr != IPPROTO_IPV6, "ipv6", "err %d errno %d retval %d size 
%d\n", err, errno, retval, size); out: diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c index f99386d1dc4c..c98a897ad692 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c @@ -42,7 +42,7 @@ void test_xdp_bpf2bpf(void) char buf[128]; int err, pkt_fd, map_fd; bool passed = false; - struct iphdr *iph = (void *)buf + sizeof(struct ethhdr); + struct iphdr iph; struct iptnl_info value4 = {.family = AF_INET}; struct test_xdp *pkt_skel = NULL; struct test_xdp_bpf2bpf *ftrace_skel = NULL; @@ -93,9 +93,9 @@ void test_xdp_bpf2bpf(void) /* Run test program */ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - + memcpy(&iph, buf + sizeof(struct ethhdr), sizeof(iph)); if (CHECK(err || retval != XDP_TX || size != 74 || - iph->protocol != IPPROTO_IPIP, "ipv4", + iph.protocol != IPPROTO_IPIP, "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size)) goto out; From e32cb12ff52a2840fc1248998717f7b95c42f064 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 25 Nov 2021 09:36:07 +0800 Subject: [PATCH 040/115] bpf, mips: Fix build errors about __NR_bpf undeclared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the __NR_bpf definitions to fix the following build errors for mips: $ cd tools/bpf/bpftool $ make [...] bpf.c:54:4: error: #error __NR_bpf not defined. libbpf does not support your arch. # error __NR_bpf not defined. libbpf does not support your arch. ^~~~~ bpf.c: In function ‘sys_bpf’: bpf.c:66:17: error: ‘__NR_bpf’ undeclared (first use in this function); did you mean ‘__NR_brk’? return syscall(__NR_bpf, cmd, attr, size); ^~~~~~~~ __NR_brk [...] 
In file included from gen_loader.c:15:0: skel_internal.h: In function ‘skel_sys_bpf’: skel_internal.h:53:17: error: ‘__NR_bpf’ undeclared (first use in this function); did you mean ‘__NR_brk’? return syscall(__NR_bpf, cmd, attr, size); ^~~~~~~~ __NR_brk We can see the following generated definitions: $ grep -r "#define __NR_bpf" arch/mips arch/mips/include/generated/uapi/asm/unistd_o32.h:#define __NR_bpf (__NR_Linux + 355) arch/mips/include/generated/uapi/asm/unistd_n64.h:#define __NR_bpf (__NR_Linux + 315) arch/mips/include/generated/uapi/asm/unistd_n32.h:#define __NR_bpf (__NR_Linux + 319) The __NR_Linux is defined in arch/mips/include/uapi/asm/unistd.h: $ grep -r "#define __NR_Linux" arch/mips arch/mips/include/uapi/asm/unistd.h:#define __NR_Linux 4000 arch/mips/include/uapi/asm/unistd.h:#define __NR_Linux 5000 arch/mips/include/uapi/asm/unistd.h:#define __NR_Linux 6000 That is to say, __NR_bpf is: 4000 + 355 = 4355 for mips o32, 6000 + 319 = 6319 for mips n32, 5000 + 315 = 5315 for mips n64. So use the GCC pre-defined macros _ABIO32, _ABIN32 and _ABI64 [1] to define the corresponding __NR_bpf. This patch is similar to commit bad1926dd2f6 ("bpf, s390: fix build for libbpf and selftest suite"). 
[1] https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/mips/mips.h#l549 Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1637804167-8323-1-git-send-email-yangtiezhu@loongson.cn --- tools/build/feature/test-bpf.c | 6 ++++++ tools/lib/bpf/bpf.c | 6 ++++++ tools/lib/bpf/skel_internal.h | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index 82070eadfc07..727d22e34a6e 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -14,6 +14,12 @@ # define __NR_bpf 349 # elif defined(__s390__) # define __NR_bpf 351 +# elif defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 # else # error __NR_bpf not defined. libbpf does not support your arch. # endif diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 053c86e3d20f..4e7836e1a7b5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -50,6 +50,12 @@ # define __NR_bpf 351 # elif defined(__arc__) # define __NR_bpf 280 +# elif defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 # else # error __NR_bpf not defined. libbpf does not support your arch. 
# endif diff --git a/tools/lib/bpf/skel_internal.h b/tools/lib/bpf/skel_internal.h index b206532704ce..0b84d8e6b72a 100644 --- a/tools/lib/bpf/skel_internal.h +++ b/tools/lib/bpf/skel_internal.h @@ -7,6 +7,16 @@ #include #include +#ifndef __NR_bpf +# if defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# endif +#endif + /* This file is a base header for auto-generated *.lskel.h files. * Its contents will change and may become part of auto-generation in the future. * From 341ac5ffc4bd859103899c876902caf07cc97ea4 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Sun, 28 Nov 2021 22:16:32 +0800 Subject: [PATCH 041/115] libbpf: Support static initialization of BPF_MAP_TYPE_PROG_ARRAY Support static initialization of BPF_MAP_TYPE_PROG_ARRAY with a syntax similar to map-in-map initialization ([0]): SEC("socket") int tailcall_1(void *ctx) { return 0; } struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); __uint(max_entries, 2); __uint(key_size, sizeof(__u32)); __array(values, int (void *)); } prog_array_init SEC(".maps") = { .values = { [1] = (void *)&tailcall_1, }, }; Here's the relevant part of libbpf debug log showing what's going on with prog-array initialization: libbpf: sec '.relsocket': collecting relocation for section(3) 'socket' libbpf: sec '.relsocket': relo #0: insn #2 against 'prog_array_init' libbpf: prog 'entry': found map 0 (prog_array_init, sec 4, off 0) for insn #0 libbpf: .maps relo #0: for 3 value 0 rel->r_offset 32 name 53 ('tailcall_1') libbpf: .maps relo #0: map 'prog_array_init' slot [1] points to prog 'tailcall_1' libbpf: map 'prog_array_init': created successfully, fd=5 libbpf: map 'prog_array_init': slot [1] set to prog 'tailcall_1' fd=6 [0] Closes: https://github.com/libbpf/libbpf/issues/354 Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: 
https://lore.kernel.org/bpf/20211128141633.502339-2-hengqi.chen@gmail.com --- tools/lib/bpf/libbpf.c | 154 ++++++++++++++++++++++++++++++++--------- 1 file changed, 121 insertions(+), 33 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 672671879b21..b59fede08ba7 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2277,6 +2277,9 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; } else if (strcmp(name, "values") == 0) { + bool is_map_in_map = bpf_map_type__is_map_in_map(map_def->map_type); + bool is_prog_array = map_def->map_type == BPF_MAP_TYPE_PROG_ARRAY; + const char *desc = is_map_in_map ? "map-in-map inner" : "prog-array value"; char inner_map_name[128]; int err; @@ -2290,8 +2293,8 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, map_name, name); return -EINVAL; } - if (!bpf_map_type__is_map_in_map(map_def->map_type)) { - pr_warn("map '%s': should be map-in-map.\n", + if (!is_map_in_map && !is_prog_array) { + pr_warn("map '%s': should be map-in-map or prog-array.\n", map_name); return -ENOTSUP; } @@ -2303,22 +2306,30 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, map_def->value_size = 4; t = btf__type_by_id(btf, m->type); if (!t) { - pr_warn("map '%s': map-in-map inner type [%d] not found.\n", - map_name, m->type); + pr_warn("map '%s': %s type [%d] not found.\n", + map_name, desc, m->type); return -EINVAL; } if (!btf_is_array(t) || btf_array(t)->nelems) { - pr_warn("map '%s': map-in-map inner spec is not a zero-sized array.\n", - map_name); + pr_warn("map '%s': %s spec is not a zero-sized array.\n", + map_name, desc); return -EINVAL; } t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); if (!btf_is_ptr(t)) { - pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", - map_name, btf_kind_str(t)); + pr_warn("map '%s': %s def is of unexpected kind %s.\n", + map_name, desc, btf_kind_str(t)); 
return -EINVAL; } t = skip_mods_and_typedefs(btf, t->type, NULL); + if (is_prog_array) { + if (!btf_is_func_proto(t)) { + pr_warn("map '%s': prog-array value def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + continue; + } if (!btf_is_struct(t)) { pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", map_name, btf_kind_str(t)); @@ -4940,7 +4951,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b return err; } -static int init_map_slots(struct bpf_object *obj, struct bpf_map *map) +static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) { const struct bpf_map *targ_map; unsigned int i; @@ -4952,6 +4963,7 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map) targ_map = map->init_slots[i]; fd = bpf_map__fd(targ_map); + if (obj->gen_loader) { pr_warn("// TODO map_update_elem: idx %td key %d value==map_idx %td\n", map - obj->maps, i, targ_map - obj->maps); @@ -4962,8 +4974,7 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map) if (err) { err = -errno; pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", - map->name, i, targ_map->name, - fd, err); + map->name, i, targ_map->name, fd, err); return err; } pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", @@ -4976,6 +4987,59 @@ static int init_map_slots(struct bpf_object *obj, struct bpf_map *map) return 0; } +static int init_prog_array_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_program *targ_prog; + unsigned int i; + int fd, err; + + if (obj->gen_loader) + return -ENOTSUP; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_prog = map->init_slots[i]; + fd = bpf_program__fd(targ_prog); + + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to prog '%s' fd=%d: %d\n", + map->name, i, targ_prog->name, 
fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to prog '%s' fd=%d\n", + map->name, i, targ_prog->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int bpf_object_init_prog_arrays(struct bpf_object *obj) +{ + struct bpf_map *map; + int i, err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!map->init_slots_sz || map->def.type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + + err = init_prog_array_slots(obj, map); + if (err < 0) { + zclose(map->fd); + return err; + } + } + return 0; +} + static int bpf_object__create_maps(struct bpf_object *obj) { @@ -5042,8 +5106,8 @@ bpf_object__create_maps(struct bpf_object *obj) } } - if (map->init_slots_sz) { - err = init_map_slots(obj, map); + if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { + err = init_map_in_map_slots(obj, map); if (err < 0) { zclose(map->fd); goto err_out; @@ -6189,9 +6253,11 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, int i, j, nrels, new_sz; const struct btf_var_secinfo *vi = NULL; const struct btf_type *sec, *var, *def; - struct bpf_map *map = NULL, *targ_map; + struct bpf_map *map = NULL, *targ_map = NULL; + struct bpf_program *targ_prog = NULL; + bool is_prog_array, is_map_in_map; const struct btf_member *member; - const char *name, *mname; + const char *name, *mname, *type; unsigned int moff; Elf64_Sym *sym; Elf64_Rel *rel; @@ -6218,11 +6284,6 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, return -LIBBPF_ERRNO__FORMAT; } name = elf_sym_str(obj, sym->st_name) ?: ""; - if (sym->st_shndx != obj->efile.btf_maps_shndx) { - pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", - i, name); - return -LIBBPF_ERRNO__RELOC; - } pr_debug(".maps relo #%d: for %zd value %zd rel->r_offset %zu name %d ('%s')\n", i, (ssize_t)(rel->r_info >> 32), (size_t)sym->st_value, @@ -6244,19 +6305,45 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, return 
-EINVAL; } - if (!bpf_map_type__is_map_in_map(map->def.type)) - return -EINVAL; - if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && - map->def.key_size != sizeof(int)) { - pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", - i, map->name, sizeof(int)); + is_map_in_map = bpf_map_type__is_map_in_map(map->def.type); + is_prog_array = map->def.type == BPF_MAP_TYPE_PROG_ARRAY; + type = is_map_in_map ? "map" : "prog"; + if (is_map_in_map) { + if (sym->st_shndx != obj->efile.btf_maps_shndx) { + pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && + map->def.key_size != sizeof(int)) { + pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", + i, map->name, sizeof(int)); + return -EINVAL; + } + targ_map = bpf_object__find_map_by_name(obj, name); + if (!targ_map) { + pr_warn(".maps relo #%d: '%s' isn't a valid map reference\n", + i, name); + return -ESRCH; + } + } else if (is_prog_array) { + targ_prog = bpf_object__find_program_by_name(obj, name); + if (!targ_prog) { + pr_warn(".maps relo #%d: '%s' isn't a valid program reference\n", + i, name); + return -ESRCH; + } + if (targ_prog->sec_idx != sym->st_shndx || + targ_prog->sec_insn_off * 8 != sym->st_value || + prog_is_subprog(obj, targ_prog)) { + pr_warn(".maps relo #%d: '%s' isn't an entry-point program\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + } else { return -EINVAL; } - targ_map = bpf_object__find_map_by_name(obj, name); - if (!targ_map) - return -ESRCH; - var = btf__type_by_id(obj->btf, vi->type); def = skip_mods_and_typedefs(obj->btf, var->type, NULL); if (btf_vlen(def) == 0) @@ -6287,10 +6374,10 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, (new_sz - map->init_slots_sz) * host_ptr_sz); map->init_slots_sz = new_sz; } - map->init_slots[moff] = targ_map; + map->init_slots[moff] = is_map_in_map ? 
(void *)targ_map : (void *)targ_prog; - pr_debug(".maps relo #%d: map '%s' slot [%d] points to map '%s'\n", - i, map->name, moff, name); + pr_debug(".maps relo #%d: map '%s' slot [%d] points to %s '%s'\n", + i, map->name, moff, type, name); } return 0; @@ -7304,6 +7391,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : attr->target_btf_path); err = err ? : bpf_object__load_progs(obj, attr->log_level); + err = err ? : bpf_object_init_prog_arrays(obj); if (obj->gen_loader) { /* reset FDs */ From baeead213e67a9554d589a2845c634b8e473d107 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Sun, 28 Nov 2021 22:16:33 +0800 Subject: [PATCH 042/115] selftests/bpf: Test BPF_MAP_TYPE_PROG_ARRAY static initialization Add testcase for BPF_MAP_TYPE_PROG_ARRAY static initialization. Signed-off-by: Hengqi Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211128141633.502339-3-hengqi.chen@gmail.com --- .../bpf/prog_tests/prog_array_init.c | 32 +++++++++++++++ .../bpf/progs/test_prog_array_init.c | 39 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/prog_array_init.c create mode 100644 tools/testing/selftests/bpf/progs/test_prog_array_init.c diff --git a/tools/testing/selftests/bpf/prog_tests/prog_array_init.c b/tools/testing/selftests/bpf/prog_tests/prog_array_init.c new file mode 100644 index 000000000000..fc4657619739 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/prog_array_init.c @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021 Hengqi Chen */ + +#include +#include "test_prog_array_init.skel.h" + +void test_prog_array_init(void) +{ + struct test_prog_array_init *skel; + int err; + + skel = test_prog_array_init__open(); + if (!ASSERT_OK_PTR(skel, "could not open BPF object")) + return; + + skel->rodata->my_pid = getpid(); + + err = 
test_prog_array_init__load(skel); + if (!ASSERT_OK(err, "could not load BPF object")) + goto cleanup; + + skel->links.entry = bpf_program__attach_raw_tracepoint(skel->progs.entry, "sys_enter"); + if (!ASSERT_OK_PTR(skel->links.entry, "could not attach BPF program")) + goto cleanup; + + usleep(1); + + ASSERT_EQ(skel->bss->value, 42, "unexpected value"); + +cleanup: + test_prog_array_init__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_prog_array_init.c b/tools/testing/selftests/bpf/progs/test_prog_array_init.c new file mode 100644 index 000000000000..2cd138356126 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_prog_array_init.c @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021 Hengqi Chen */ + +#include "vmlinux.h" +#include +#include + +const volatile pid_t my_pid = 0; +int value = 0; + +SEC("raw_tp/sys_enter") +int tailcall_1(void *ctx) +{ + value = 42; + return 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 2); + __uint(key_size, sizeof(__u32)); + __array(values, int (void *)); +} prog_array_init SEC(".maps") = { + .values = { + [1] = (void *)&tailcall_1, + }, +}; + +SEC("raw_tp/sys_enter") +int entry(void *ctx) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + bpf_tail_call(ctx, &prog_array_init, 1); + return 0; +} From 43174f0d4597325cb91f1f1f55263eb6e6101036 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 29 Nov 2021 10:00:40 +0000 Subject: [PATCH 043/115] libbpf: Silence uninitialized warning/error in btf_dump_dump_type_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling libbpf with gcc 4.8.5, we see: CC staticobjs/btf_dump.o btf_dump.c: In function ‘btf_dump_dump_type_data.isra.24’: btf_dump.c:2296:5: error: ‘err’ may be used uninitialized in this function [-Werror=maybe-uninitialized] if (err < 0) ^ cc1: all warnings being treated as errors make: *** 
[staticobjs/btf_dump.o] Error 1 While gcc 4.8.5 is too old to build the upstream kernel, it's possible it could be used to build standalone libbpf which suffers from the same problem. Silence the error by initializing 'err' to 0. The warning/error seems to be a false positive since err is set early in the function. Regardless we shouldn't prevent libbpf from building for this. Fixes: 920d16af9b42 ("libbpf: BTF dumper support for typed data") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1638180040-8037-1-git-send-email-alan.maguire@oracle.com --- tools/lib/bpf/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 05f3e7dfec0a..f06a1d343c92 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -2216,7 +2216,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d, __u8 bits_offset, __u8 bit_sz) { - int size, err; + int size, err = 0; size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset); if (size < 0) From c291d0a4d169811898d723cfa5f1aa1fc60e607c Mon Sep 17 00:00:00 2001 From: Mehrdad Arshad Rad Date: Sun, 28 Nov 2021 11:33:37 -0800 Subject: [PATCH 044/115] libbpf: Remove duplicate assignments load_attr.attach_btf_id is assigned the same value twice during initialization; remove the duplicate assignment. 
Signed-off-by: Mehrdad Arshad Rad Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211128193337.10628-1-arshad.rad@gmail.com --- tools/lib/bpf/libbpf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b59fede08ba7..5a2f5a6ae2f9 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6559,7 +6559,6 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog load_attr.expected_attach_type = prog->expected_attach_type; if (kernel_supports(obj, FEAT_PROG_NAME)) prog_name = prog->name; - load_attr.attach_btf_id = prog->attach_btf_id; load_attr.attach_prog_fd = prog->attach_prog_fd; load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd; load_attr.attach_btf_id = prog->attach_btf_id; From 58ffa1b413690dbfdea86c068510339fe1573c33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:11 +0100 Subject: [PATCH 045/115] x86, bpf: Cleanup the top of file header in bpf_jit_comp.c Don't bother mentioning the file name as it is implied, and remove the reference to internal BPF. 
Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-2-hch@lst.de --- arch/x86/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 631847907786..1d7b0c69b644 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * bpf_jit_comp.c: BPF JIT compiler + * BPF JIT compiler * * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) - * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include #include From ccb00292eb2dbb58a55850639356d07630cd3c46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:12 +0100 Subject: [PATCH 046/115] bpf: Remove a redundant comment on bpf_prog_free The comment telling that the prog_free helper is freeing the program is not exactly useful, so just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-3-hch@lst.de --- kernel/bpf/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b52dc845ecea..189d85d64bf1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2301,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work) } } -/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; From 06edc59c1fd7aababc8361655b20f4cc9870aef2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:13 +0100 Subject: [PATCH 047/115] bpf, docs: Prune all references to "internal BPF" The eBPF name has completely taken over from "internal BPF" in general usage for the actual eBPF representation, or BPF for any general in-kernel use. 
Prune all remaining references to "internal BPF". Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-4-hch@lst.de --- Documentation/networking/filter.rst | 22 +++++++++++----------- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- kernel/bpf/core.c | 2 +- net/core/filter.c | 11 +++++------ 6 files changed, 20 insertions(+), 21 deletions(-) diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index ce2b8e8bb9ab..83ffcaa5b91a 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -617,7 +617,7 @@ format with similar underlying principles from BPF described in previous paragraphs is being used. However, the instruction set format is modelled closer to the underlying architecture to mimic native instruction sets, so that a better performance can be achieved (more details later). This new -ISA is called 'eBPF' or 'internal BPF' interchangeably. (Note: eBPF which +ISA is called 'eBPF'. (Note: eBPF which originates from [e]xtended BPF is not the same as BPF extensions! While eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) @@ -690,7 +690,7 @@ Some core changes of the new internal format: That behavior maps directly to x86_64 and arm64 subregister definition, but makes other JITs more difficult. - 32-bit architectures run 64-bit internal BPF programs via interpreter. + 32-bit architectures run 64-bit eBPF programs via interpreter. Their JITs may convert BPF programs that only use 32-bit subregisters into native instruction set and let the rest being interpreted. 
@@ -711,7 +711,7 @@ Some core changes of the new internal format: - Introduces bpf_call insn and register passing convention for zero overhead calls from/to other kernel functions: - Before an in-kernel function call, the internal BPF program needs to + Before an in-kernel function call, the eBPF program needs to place function arguments into R1 to R5 registers to satisfy calling convention, then the interpreter will take them from registers and pass to in-kernel function. If R1 - R5 registers are mapped to CPU registers @@ -780,7 +780,7 @@ Some core changes of the new internal format: ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing and rbx, r12 - r15 are callee saved. - Then the following internal BPF pseudo-program:: + Then the following eBPF pseudo-program:: bpf_mov R6, R1 /* save ctx */ bpf_mov R2, 2 @@ -846,7 +846,7 @@ Some core changes of the new internal format: bpf_exit After the call the registers R1-R5 contain junk values and cannot be read. - An in-kernel eBPF verifier is used to validate internal BPF programs. + An in-kernel eBPF verifier is used to validate eBPF programs. Also in the new design, eBPF is limited to 4096 insns, which means that any program will terminate quickly and will only call a fixed number of kernel @@ -861,23 +861,23 @@ A program, that is translated internally consists of the following elements:: op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 -So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field +So far 87 eBPF instructions were implemented. 8-bit 'op' opcode field has room for new instructions. Some of them may use 16/24/32 byte encoding. New instructions must be multiple of 8 bytes to preserve backward compatibility. -Internal BPF is a general purpose RISC instruction set. Not every register and +eBPF is a general purpose RISC instruction set. Not every register and every instruction are used during translation from original BPF to new format. 
For example, socket filters are not using ``exclusive add`` instruction, but tracing filters may do to maintain counters of events, for example. Register R9 is not used by socket filters either, but more complex filters may be running out of registers and would have to resort to spill/fill to stack. -Internal BPF can be used as a generic assembler for last step performance +eBPF can be used as a generic assembler for last step performance optimizations, socket filters and seccomp are using it as assembler. Tracing filters may use it as assembler to generate code from kernel. In kernel usage -may not be bounded by security considerations, since generated internal BPF code +may not be bounded by security considerations, since generated eBPF code may be optimizing internal code path and not being exposed to the user space. -Safety of internal BPF can come from a verifier (TBD). In such use cases as +Safety of eBPF can come from a verifier (TBD). In such use cases as described, it may be used as safe instruction set. Just like the original BPF, the new format runs within a controlled environment, @@ -1675,7 +1675,7 @@ Testing ------- Next to the BPF toolchain, the kernel also ships a test module that contains -various test cases for classic and internal BPF that can be executed against +various test cases for classic and eBPF that can be executed against the BPF interpreter and JIT compiler. 
It can be found in lib/test_bpf.c and enabled via Kconfig:: diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index e59b41e9ab0c..10ceebb7530b 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -163,7 +163,7 @@ static const s8 bpf2a32[][2] = { [BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)}, /* Read only Frame Pointer to access Stack */ [BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)}, - /* Temporary Register for internal BPF JIT, can be used + /* Temporary Register for BPF JIT, can be used * for constant blindings and others. */ [TMP_REG_1] = {ARM_R7, ARM_R6}, diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 07c12c42b751..07aad85848fa 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -44,7 +44,7 @@ static const int bpf2a64[] = { [BPF_REG_9] = A64_R(22), /* read-only frame pointer to access stack */ [BPF_REG_FP] = A64_R(25), - /* temporary registers for internal BPF JIT */ + /* temporary registers for BPF JIT */ [TMP_REG_1] = A64_R(10), [TMP_REG_2] = A64_R(11), [TMP_REG_3] = A64_R(12), diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 0bfe1c72a0c9..b1e38784eb23 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -227,7 +227,7 @@ static const int bpf2sparc[] = { [BPF_REG_AX] = G7, - /* temporary register for internal BPF JIT */ + /* temporary register for BPF JIT */ [TMP_REG_1] = G1, [TMP_REG_2] = G2, [TMP_REG_3] = G3, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 189d85d64bf1..de3e5bc6781f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1892,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program - * @fp: bpf_prog populated with internal BPF program + * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if 
JIT is not available, use interpreter. diff --git a/net/core/filter.c b/net/core/filter.c index 26e0276aa00d..fe27c91e3758 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1242,10 +1242,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) int err, new_len, old_len = fp->len; bool seen_ld_abs = false; - /* We are free to overwrite insns et al right here as it - * won't be used at this point in time anymore internally - * after the migration to the internal BPF instruction - * representation. + /* We are free to overwrite insns et al right here as it won't be used at + * this point in time anymore internally after the migration to the eBPF + * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); @@ -1336,8 +1335,8 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, */ bpf_jit_compile(fp); - /* JIT compiler couldn't process this filter, so do the - * internal BPF translation for the optimized interpreter. + /* JIT compiler couldn't process this filter, so do the eBPF translation + * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); From bc84e959e5aed4a79597d03e810fd1d7067b4ff7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:14 +0100 Subject: [PATCH 048/115] bpf, docs: Move handling of maps to Documentation/bpf/maps.rst Move the general maps documentation into the maps.rst file from the overall networking filter documentation and add a link instead. 
Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-5-hch@lst.de --- Documentation/bpf/maps.rst | 43 ++++++++++++++++++++++++++ Documentation/networking/filter.rst | 47 ++--------------------------- 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/Documentation/bpf/maps.rst b/Documentation/bpf/maps.rst index 2084b0e7cde8..f41619e312ac 100644 --- a/Documentation/bpf/maps.rst +++ b/Documentation/bpf/maps.rst @@ -1,4 +1,47 @@ + ========= +eBPF maps +========= + +'maps' is a generic storage of different types for sharing data between kernel +and userspace. + +The maps are accessed from user space via BPF syscall, which has commands: + +- create a map with given type and attributes + ``map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)`` + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + ``err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + ``err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + ``err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)`` + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. 
+ +The map is defined by: + + - type + - max number of elements + - key size in bytes + - value size in bytes + Map Types ========= diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 83ffcaa5b91a..43ef05b91f98 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1232,9 +1232,9 @@ pointer type. The types of pointers describe their base, as follows: Pointer to the value stored in a map element. PTR_TO_MAP_VALUE_OR_NULL Either a pointer to a map value, or NULL; map accesses - (see section 'eBPF maps', below) return this type, - which becomes a PTR_TO_MAP_VALUE when checked != NULL. - Arithmetic on these pointers is forbidden. + (see maps.rst) return this type, which becomes a + PTR_TO_MAP_VALUE when checked != NULL. Arithmetic on + these pointers is forbidden. PTR_TO_STACK Frame pointer. PTR_TO_PACKET @@ -1402,47 +1402,6 @@ using normal C code as:: which makes such programs easier to write comparing to LD_ABS insn and significantly faster. -eBPF maps ---------- -'maps' is a generic storage of different types for sharing data between kernel -and userspace.
- -The maps are accessed from user space via BPF syscall, which has commands: - -- create a map with given type and attributes - ``map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)`` - using attr->map_type, attr->key_size, attr->value_size, attr->max_entries - returns process-local file descriptor or negative error - -- lookup key in a given map - ``err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)`` - using attr->map_fd, attr->key, attr->value - returns zero and stores found elem into value or negative error - -- create or update key/value pair in a given map - ``err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)`` - using attr->map_fd, attr->key, attr->value - returns zero or negative error - -- find and delete element by key in a given map - ``err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)`` - using attr->map_fd, attr->key - -- to delete map: close(fd) - Exiting process will delete maps automatically - -userspace programs use this syscall to create/access maps that eBPF programs -are concurrently updating. - -maps can have different types: hash, array, bloom filter, radix-tree, etc. - -The map is defined by: - - - type - - max number of elements - - key size in bytes - - value size in bytes - Pruning ------- The verifier does not actually walk all possible paths through the program. For From 88691e9e1ef59fa917b2bc2df47d550e7635e73c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:15 +0100 Subject: [PATCH 049/115] bpf, docs: Split general purpose eBPF documentation out of filter.rst filter.rst starts out documenting the classic BPF and then spills into introducing and documenting eBPF. Move the eBPF documentation into two new files under Documentation/bpf/ for the instruction set and the verifier and link to the BPF documentation from filter.rst.
Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-6-hch@lst.de --- Documentation/bpf/index.rst | 9 +- Documentation/bpf/instruction-set.rst | 467 ++++++++++++ Documentation/bpf/verifier.rst | 529 ++++++++++++++ Documentation/networking/filter.rst | 993 +------------------------- 4 files changed, 1008 insertions(+), 990 deletions(-) create mode 100644 Documentation/bpf/instruction-set.rst create mode 100644 Documentation/bpf/verifier.rst diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 413f50101eca..91ba5a62026b 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -5,16 +5,15 @@ BPF Documentation This directory contains documentation for the BPF (Berkeley Packet Filter) facility, with a focus on the extended BPF version (eBPF). -This kernel side documentation is still work in progress. The main -textual documentation is (for historical reasons) described in -:ref:`networking-filter`, which describe both classical and extended -BPF instruction-set. +This kernel side documentation is still work in progress. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. .. toctree:: :maxdepth: 1 + instruction-set + verifier libbpf/index btf faq @@ -34,4 +33,4 @@ that goes into great technical depth about the BPF Architecture. * :ref:`genindex` .. Links: -.. _BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/ \ No newline at end of file +.. 
_BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/ diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst new file mode 100644 index 000000000000..fa7cba59031e --- /dev/null +++ b/Documentation/bpf/instruction-set.rst @@ -0,0 +1,467 @@ + +==================== +eBPF Instruction Set +==================== + +eBPF is designed to be JITed with one to one mapping, which can also open up +the possibility for GCC/LLVM compilers to generate optimized eBPF code through +an eBPF backend that performs almost as fast as natively compiled code. + +Some core changes of the eBPF format from classic BPF: + +- Number of registers increase from 2 to 10: + + The old format had two registers A and X, and a hidden frame pointer. The + new layout extends this to be 10 internal registers and a read-only frame + pointer. Since 64-bit CPUs are passing arguments to functions via registers + the number of args from eBPF program to in-kernel function is restricted + to 5 and one register is used to accept return value from an in-kernel + function. Natively, x86_64 passes first 6 arguments in registers, aarch64/ + sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved + registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers. + + Therefore, eBPF calling convention is defined as: + + * R0 - return value from in-kernel function, and exit value for eBPF program + * R1 - R5 - arguments from eBPF program to in-kernel function + * R6 - R9 - callee saved registers that in-kernel function will preserve + * R10 - read-only frame pointer to access stack + + Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64, + etc, and eBPF calling convention maps directly to ABIs used by the kernel on + 64-bit architectures. + + On 32-bit architectures JIT may map programs that use only 32-bit arithmetic + and may let more complex programs to be interpreted. 
+ + R0 - R5 are scratch registers and eBPF program needs spill/fill them if + necessary across calls. Note that there is only one eBPF program (== one + eBPF main routine) and it cannot call other eBPF functions, it can only + call predefined in-kernel functions, though. + +- Register width increases from 32-bit to 64-bit: + + Still, the semantics of the original 32-bit ALU operations are preserved + via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower + subregisters that zero-extend into 64-bit if they are being written to. + That behavior maps directly to x86_64 and arm64 subregister definition, but + makes other JITs more difficult. + + 32-bit architectures run 64-bit eBPF programs via interpreter. + Their JITs may convert BPF programs that only use 32-bit subregisters into + native instruction set and let the rest being interpreted. + + Operation is 64-bit, because on 64-bit architectures, pointers are also + 64-bit wide, and we want to pass 64-bit values in/out of kernel functions, + so 32-bit eBPF registers would otherwise require to define register-pair + ABI, thus, there won't be able to use a direct eBPF register to HW register + mapping and JIT would need to do combine/split/move operations for every + register in and out of the function, which is complex, bug prone and slow. + Another reason is the use of atomic 64-bit counters. + +- Conditional jt/jf targets replaced with jt/fall-through: + + While the original design has constructs such as ``if (cond) jump_true; + else jump_false;``, they are being replaced into alternative constructs like + ``if (cond) jump_true; /* else fall-through */``. 
+ +- Introduces bpf_call insn and register passing convention for zero overhead + calls from/to other kernel functions: + + Before an in-kernel function call, the eBPF program needs to + place function arguments into R1 to R5 registers to satisfy calling + convention, then the interpreter will take them from registers and pass + to in-kernel function. If R1 - R5 registers are mapped to CPU registers + that are used for argument passing on given architecture, the JIT compiler + doesn't need to emit extra moves. Function arguments will be in the correct + registers and BPF_CALL instruction will be JITed as single 'call' HW + instruction. This calling convention was picked to cover common call + situations without performance penalty. + + After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has + a return value of the function. Since R6 - R9 are callee saved, their state + is preserved across the call. + + For example, consider three C functions:: + + u64 f1() { return (*_f2)(1); } + u64 f2(u64 a) { return f3(a + 1, a); } + u64 f3(u64 a, u64 b) { return a - b; } + + GCC can compile f1, f3 into x86_64:: + + f1: + movl $1, %edi + movq _f2(%rip), %rax + jmp *%rax + f3: + movq %rdi, %rax + subq %rsi, %rax + ret + + Function f2 in eBPF may look like:: + + f2: + bpf_mov R2, R1 + bpf_add R1, 1 + bpf_call f3 + bpf_exit + + If f2 is JITed and the pointer is stored to ``_f2``, the calls f1 -> f2 -> f3 and + returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to + be used to call into f2. + + For practical reasons all eBPF programs have only one argument 'ctx' which is + already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs + can call kernel functions with up to 5 arguments. Calls with 6 or more arguments + are currently not supported, but these restrictions can be lifted if necessary + in the future. + + On 64-bit architectures all registers map to HW registers one to one.
For + example, x86_64 JIT compiler can map them as ... + + :: + + R0 - rax + R1 - rdi + R2 - rsi + R3 - rdx + R4 - rcx + R5 - r8 + R6 - rbx + R7 - r13 + R8 - r14 + R9 - r15 + R10 - rbp + + ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing + and rbx, r12 - r15 are callee saved. + + Then the following eBPF pseudo-program:: + + bpf_mov R6, R1 /* save ctx */ + bpf_mov R2, 2 + bpf_mov R3, 3 + bpf_mov R4, 4 + bpf_mov R5, 5 + bpf_call foo + bpf_mov R7, R0 /* save foo() return value */ + bpf_mov R1, R6 /* restore ctx for next call */ + bpf_mov R2, 6 + bpf_mov R3, 7 + bpf_mov R4, 8 + bpf_mov R5, 9 + bpf_call bar + bpf_add R0, R7 + bpf_exit + + After JIT to x86_64 may look like:: + + push %rbp + mov %rsp,%rbp + sub $0x228,%rsp + mov %rbx,-0x228(%rbp) + mov %r13,-0x220(%rbp) + mov %rdi,%rbx + mov $0x2,%esi + mov $0x3,%edx + mov $0x4,%ecx + mov $0x5,%r8d + callq foo + mov %rax,%r13 + mov %rbx,%rdi + mov $0x6,%esi + mov $0x7,%edx + mov $0x8,%ecx + mov $0x9,%r8d + callq bar + add %r13,%rax + mov -0x228(%rbp),%rbx + mov -0x220(%rbp),%r13 + leaveq + retq + + Which is in this example equivalent in C to:: + + u64 bpf_filter(u64 ctx) + { + return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); + } + + In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 + arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper + registers and place their return value into ``%rax`` which is R0 in eBPF. + Prologue and epilogue are emitted by JIT and are implicit in the + interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve + them across the calls as defined by calling convention. + + For example the following program is invalid:: + + bpf_mov R1, 1 + bpf_call foo + bpf_mov R0, R1 + bpf_exit + + After the call the registers R1-R5 contain junk values and cannot be read. + An in-kernel `eBPF verifier`_ is used to validate eBPF programs. 
+ +Also in the new design, eBPF is limited to 4096 insns, which means that any +program will terminate quickly and will only call a fixed number of kernel +functions. Original BPF and eBPF are two operand instructions, +which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT. + +The input context pointer for invoking the interpreter function is generic, +its content is defined by a specific use case. For seccomp register R1 points +to seccomp_data, for converted BPF filters R1 points to a skb. + +A program, that is translated internally consists of the following elements:: + + op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 + +So far 87 eBPF instructions were implemented. 8-bit 'op' opcode field +has room for new instructions. Some of them may use 16/24/32 byte encoding. New +instructions must be multiple of 8 bytes to preserve backward compatibility. + +eBPF is a general purpose RISC instruction set. Not every register and +every instruction are used during translation from original BPF to eBPF. +For example, socket filters are not using ``exclusive add`` instruction, but +tracing filters may do to maintain counters of events, for example. Register R9 +is not used by socket filters either, but more complex filters may be running +out of registers and would have to resort to spill/fill to stack. + +eBPF can be used as a generic assembler for last step performance +optimizations, socket filters and seccomp are using it as assembler. Tracing +filters may use it as assembler to generate code from kernel. In kernel usage +may not be bounded by security considerations, since generated eBPF code +may be optimizing internal code path and not being exposed to the user space. +Safety of eBPF can come from the `eBPF verifier`_. In such use cases as +described, it may be used as safe instruction set. + +Just like the original BPF, eBPF runs within a controlled environment, +is deterministic and the kernel can easily prove that. 
The safety of the program +can be determined in two steps: first step does depth-first-search to disallow +loops and other CFG validation; second step starts from the first insn and +descends all possible paths. It simulates execution of every insn and observes +the state change of registers and stack. + +eBPF opcode encoding +==================== + +eBPF is reusing most of the opcode encoding from classic to simplify conversion +of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' +field is divided into three parts:: + + +----------------+--------+--------------------+ + | 4 bits | 1 bit | 3 bits | + | operation code | source | instruction class | + +----------------+--------+--------------------+ + (MSB) (LSB) + +Three LSB bits store instruction class which is one of: + + =================== =============== + Classic BPF classes eBPF classes + =================== =============== + BPF_LD 0x00 BPF_LD 0x00 + BPF_LDX 0x01 BPF_LDX 0x01 + BPF_ST 0x02 BPF_ST 0x02 + BPF_STX 0x03 BPF_STX 0x03 + BPF_ALU 0x04 BPF_ALU 0x04 + BPF_JMP 0x05 BPF_JMP 0x05 + BPF_RET 0x06 BPF_JMP32 0x06 + BPF_MISC 0x07 BPF_ALU64 0x07 + =================== =============== + +When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... + + :: + + BPF_K 0x00 + BPF_X 0x08 + + * in classic BPF, this means:: + + BPF_SRC(code) == BPF_X - use register X as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + + * in eBPF, this means:: + + BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + +... and four MSB bits store operation code. 
+ +If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:: + + BPF_ADD 0x00 + BPF_SUB 0x10 + BPF_MUL 0x20 + BPF_DIV 0x30 + BPF_OR 0x40 + BPF_AND 0x50 + BPF_LSH 0x60 + BPF_RSH 0x70 + BPF_NEG 0x80 + BPF_MOD 0x90 + BPF_XOR 0xa0 + BPF_MOV 0xb0 /* eBPF only: mov reg to reg */ + BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ + BPF_END 0xd0 /* eBPF only: endianness conversion */ + +If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:: + + BPF_JA 0x00 /* BPF_JMP only */ + BPF_JEQ 0x10 + BPF_JGT 0x20 + BPF_JGE 0x30 + BPF_JSET 0x40 + BPF_JNE 0x50 /* eBPF only: jump != */ + BPF_JSGT 0x60 /* eBPF only: signed '>' */ + BPF_JSGE 0x70 /* eBPF only: signed '>=' */ + BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ + BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ + +So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF +and eBPF. There are only two registers in classic BPF, so it means A += X. +In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly, +BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and analogous +src_reg = (u32) src_reg ^ (u32) imm32 in eBPF. + +Classic BPF is using BPF_MISC class to represent A = X and X = A moves. +eBPF is using BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no +BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean +exactly the same operations as BPF_ALU, but with 64-bit wide operands +instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: +dst_reg = dst_reg + src_reg + +Classic BPF wastes the whole BPF_RET class to represent a single ``ret`` +operation. Classic BPF_RET | BPF_K means copy imm32 into return register +and perform function exit. 
eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT +in eBPF means function exit only. The eBPF program needs to store return +value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as +BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide +operands for the comparisons instead. + +For load and store instructions the 8-bit 'code' field is divided as:: + + +--------+--------+-------------------+ + | 3 bits | 2 bits | 3 bits | + | mode | size | instruction class | + +--------+--------+-------------------+ + (MSB) (LSB) + +Size modifier is one of ... + +:: + + BPF_W 0x00 /* word */ + BPF_H 0x08 /* half word */ + BPF_B 0x10 /* byte */ + BPF_DW 0x18 /* eBPF only, double word */ + +... which encodes size of load/store operation:: + + B - 1 byte + H - 2 byte + W - 4 byte + DW - 8 byte (eBPF only) + +Mode modifier is one of:: + + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ + BPF_ABS 0x20 + BPF_IND 0x40 + BPF_MEM 0x60 + BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ + BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ + BPF_ATOMIC 0xc0 /* eBPF only, atomic operations */ + +eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and +(BPF_IND | <size> | BPF_LD) which are used to access packet data. + +They had to be carried over from classic to have strong performance of +socket filters running in eBPF interpreter. These instructions can only +be used when interpreter context is a pointer to ``struct sk_buff`` and +have seven implicit operands. Register R6 is an implicit input that must +contain pointer to sk_buff. Register R0 is an implicit output which contains +the data fetched from the packet. Registers R1-R5 are scratch registers +and must not be used to store the data across BPF_ABS | BPF_LD or +BPF_IND | BPF_LD instructions. + +These instructions have implicit program exit condition as well.
When +eBPF program is trying to access the data beyond the packet boundary, +the interpreter will abort the execution of the program. JIT compilers +therefore must preserve this property. src_reg and imm32 fields are +explicit inputs to these instructions. + +For example:: + + BPF_IND | BPF_W | BPF_LD means: + + R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) + and R1 - R5 were scratched. + +Unlike classic BPF instruction set, eBPF has generic load/store operations:: + + BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg + BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32 + BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off) + +Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. + +It also includes atomic operations, which use the immediate field for extra +encoding:: + + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + +The basic atomic operations supported are:: + + BPF_ADD + BPF_AND + BPF_OR + BPF_XOR + +Each having equivalent semantics with the ``BPF_ADD`` example, that is: the +memory location addressed by ``dst_reg + off`` is atomically modified, with +``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the +immediate, then these operations also overwrite ``src_reg`` with the +value that was in memory before it was modified. + +The more special operations are:: + + BPF_XCHG + +This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + +off``. :: + + BPF_CMPXCHG + +This atomically compares the value addressed by ``dst_reg + off`` with +``R0``. If they match it is replaced with ``src_reg``. In either case, the +value that was there before is zero-extended and loaded back to ``R0``. + +Note that 1 and 2 byte atomic operations are not supported. + +Clang can generate atomic instructions by default when ``-mcpu=v3`` is +enabled.
If a lower version for ``-mcpu`` is set, the only atomic instruction +Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable +the atomics features, while keeping a lower ``-mcpu`` version, you can use +``-Xclang -target-feature -Xclang +alu32``. + +You may encounter ``BPF_XADD`` - this is a legacy name for ``BPF_ATOMIC``, +referring to the exclusive-add operation encoded when the immediate field is +zero. + +eBPF has one 16-byte instruction: ``BPF_LD | BPF_DW | BPF_IMM`` which consists +of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: ``BPF_LD | BPF_W | BPF_IMM`` which loads +32-bit immediate value into a register. + +.. Links: +.. _eBPF verifier: verifier.rst diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst new file mode 100644 index 000000000000..fae5f6273bac --- /dev/null +++ b/Documentation/bpf/verifier.rst @@ -0,0 +1,529 @@ + +============= +eBPF verifier +============= + +The safety of the eBPF program is determined in two steps. + +First step does DAG check to disallow loops and other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though classic BPF checker allows them) + +Second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX. +If verifier sees an insn that does R2=R1, then R2 has now type +PTR_TO_CTX as well and can be used on the right hand side of expression. +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, +since addition of two valid pointers makes invalid pointer.
+(In 'secure' mode verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If register was never written to, it's not readable:: + + bpf_mov R0 = R2 + bpf_exit + +will be rejected, since R2 is unreadable at the start of the program. + +After kernel function call, R1-R5 are reset to unreadable and +R0 has a return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + +:: + + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit + +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. +For example:: + + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit + +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic ``struct bpf_context``) +A callback is used to customize verifier to restrict eBPF program access to only +certain fields within ctx structure with specified size and alignment. + +For example, the following insn:: + + bpf_ld R0 = *(u32 *)(R6 + 8) + +intends to load a word from address R6 + 8 and store it into R0 +If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=PTR_TO_STACK, then access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow eBPF program to read data from stack only after +it wrote into it. + +Classic BPF verifier does similar check with M[0-15] memory slots. 
+For example:: + + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit + +is an invalid program. +Though R10 is correct read-only register and has type PTR_TO_STACK +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto() +The eBPF verifier will check that registers match argument constraints. +After the call register R0 will be set to return type of the function. + +Function calls are the main mechanism to extend functionality of eBPF programs. +Socket filters may let programs call one set of functions, whereas tracing +filters may allow completely different set. + +If a function is made accessible to an eBPF program, it needs to be thought through +from safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this by two stage verifier: classic BPF verifier is followed +by seccomp verifier. In case of eBPF one configurable verifier is shared for +all use cases. + +See details of eBPF verifier in kernel/bpf/verifier.c + +Register value tracking +======================= + +In order to determine the safety of an eBPF program, the verifier must track +the range of possible values in each register and also in each stack slot. +This is done with ``struct bpf_reg_state``, defined in include/linux/ +bpf_verifier.h, which unifies tracking of scalar and pointer values. Each +register state has a type, which is either NOT_INIT (the register has not been +written to), SCALAR_VALUE (some value which is not usable as a pointer), or a +pointer type. The types of pointers describe their base, as follows: + + + PTR_TO_CTX + Pointer to bpf_context. + CONST_PTR_TO_MAP + Pointer to struct bpf_map.
"Const" because arithmetic + on these pointers is forbidden. + PTR_TO_MAP_VALUE + Pointer to the value stored in a map element. + PTR_TO_MAP_VALUE_OR_NULL + Either a pointer to a map value, or NULL; map accesses + (see maps.rst) return this type, which becomes a + PTR_TO_MAP_VALUE when checked != NULL. Arithmetic on + these pointers is forbidden. + PTR_TO_STACK + Frame pointer. + PTR_TO_PACKET + skb->data. + PTR_TO_PACKET_END + skb->data + headlen; arithmetic forbidden. + PTR_TO_SOCKET + Pointer to struct bpf_sock_ops, implicitly refcounted. + PTR_TO_SOCKET_OR_NULL + Either a pointer to a socket, or NULL; socket lookup + returns this type, which becomes a PTR_TO_SOCKET when + checked != NULL. PTR_TO_SOCKET is reference-counted, + so programs must release the reference through the + socket release function before the end of the program. + Arithmetic on these pointers is forbidden. + +However, a pointer may be offset from this base (as a result of pointer +arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable +offset'. The former is used when an exactly-known value (e.g. an immediate +operand) is added to a pointer, while the latter is used for values which are +not exactly known. The variable offset is also used in SCALAR_VALUEs, to track +the range of possible values in the register. + +The verifier's knowledge about the variable offset consists of: + +* minimum and maximum values as unsigned +* minimum and maximum values as signed + +* knowledge of the values of individual bits, in the form of a 'tnum': a u64 + 'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; + 1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both + mask and value; no bit should ever be 1 in both. For example, if a byte is read + into a register from memory, the register's top 56 bits are known zero, while + the low 8 are unknown - which is represented as the tnum (0x0; 0xff). 
If we + then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; + 0x1ff), because of potential carries. + +Besides arithmetic, the register state can also be updated by conditional +branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch +it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' +branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or +BPF_JSGE) would instead update the signed minimum/maximum values. Information +from the signed and unsigned bounds can be combined; for instance if a value is +first tested < 8 and then tested s> 4, the verifier will conclude that the value +is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. + +PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all +pointers sharing that same variable offset. This is important for packet range +checks: after adding a variable to a packet pointer register A, if you then copy +it to another register B and then add a constant 4 to A, both registers will +share the same 'id' but the A will have a fixed offset of +4. Then if A is +bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is +now known to have a safe range of at least 4 bytes. See 'Direct packet access', +below, for more on PTR_TO_PACKET ranges. + +The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of +the pointer returned from a map lookup. This means that when one copy is +checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. +As well as range-checking, the tracked information is also used for enforcing +alignment of pointer accesses. For instance, on most systems the packet pointer +is 2 bytes after a 4-byte alignment. 
If a program adds 14 bytes to that to jump +over the Ethernet header, then reads IHL and adds (IHL * 4), the resulting +pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 +bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through +that pointer are safe. +The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common +to all copies of the pointer returned from a socket lookup. This has similar +behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but +it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly +represents a reference to the corresponding ``struct sock``. To ensure that the +reference is not leaked, it is imperative to NULL-check the reference and, in +the non-NULL case, pass the valid reference to the socket release function. + +Direct packet access +==================== + +In cls_bpf and act_bpf programs the verifier allows direct access to the packet +data via skb->data and skb->data_end pointers. +Ex:: + + 1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ + 2: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 3: r5 = r3 + 4: r5 += 14 + 5: if r5 > r4 goto pc+16 + R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ + +this 2byte load from the packet is safe to do, since the program author +did check ``if (skb->data + 14 > skb->data_end) goto err`` at insn #5 which +means that in the fall-through case the register R3 (which points to skb->data) +has at least 14 directly accessible bytes. The verifier marks it +as R3=pkt(id=0,off=0,r=14). +id=0 means that no additional variables were added to the register. +off=0 means that no additional constants were added. +r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. +Note that R5 is marked as R5=pkt(id=0,off=14,r=14).
It also points +to the packet data, but constant 14 was added to the register, so +it now points to ``skb->data + 14`` and accessible range is [R5, R5 + 14 - 14) +which is zero bytes. + +More complex packet access may look like:: + + + R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ + 7: r4 = *(u8 *)(r3 +12) + 8: r4 *= 14 + 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 10: r3 += r4 + 11: r2 = r1 + 12: r2 <<= 48 + 13: r2 >>= 48 + 14: r3 += r2 + 15: r2 = r3 + 16: r2 += 8 + 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ + 18: if r2 > r1 goto pc+2 + R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp + 19: r1 = *(u8 *)(r3 +4) + +The state of the register R3 is R3=pkt(id=2,off=0,r=8) +id=2 means that two ``r3 += rX`` instructions were seen, so r3 points to some +offset within a packet and since the program author did +``if (r3 + 8 > r1) goto err`` at insn #18, the safe range is [R3, R3 + 8). +The verifier only allows 'add'/'sub' operations on packet registers. Any other +operation will set the register state to 'SCALAR_VALUE' and it won't be +available for direct packet access. + +Operation ``r3 += rX`` may overflow and become less than original skb->data, +therefore the verifier has to prevent that. So when it sees ``r3 += rX`` +instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 +against skb->data_end will not give us 'range' information, so attempts to read +through the pointer will give "invalid access to packet" error. + +Ex. after insn ``r4 = *(u8 *)(r3 +12)`` (insn #7 above) the state of r4 is +R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits +of the register are guaranteed to be zero, and nothing is known about the lower +8 bits. 
After insn ``r4 *= 14`` the state becomes +R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit +value by constant 14 will keep upper 52 bits as zero, also the least significant +bit will be zero as 14 is even. Similarly ``r2 >>= 48`` will make +R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign +extending. This logic is implemented in adjust_reg_min_max_vals() function, +which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice +versa) and adjust_scalar_min_max_vals() for operations on two scalars. + +The end result is that bpf program author can access packet directly +using normal C code as:: + + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + struct eth_hdr *eth = data; + struct iphdr *iph = data + sizeof(*eth); + struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); + + if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) + return 0; + if (eth->h_proto != htons(ETH_P_IP)) + return 0; + if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) + return 0; + if (udp->dest == 53 || udp->source == 9) + ...; + +which makes such programs easier to write comparing to LD_ABS insn +and significantly faster. + +Pruning +======= + +The verifier does not actually walk all possible paths through the program. For +each new branch to analyse, the verifier looks at all the states it's previously +been in when at this instruction. If any of them contain the current state as a +subset, the branch is 'pruned' - that is, the fact that the previous state was +accepted implies the current state would be as well. For instance, if in the +previous state, r1 held a packet-pointer, and in the current state, r1 holds a +packet-pointer with a range as long or longer and at least as strict an +alignment, then r1 is safe. 
Similarly, if r2 was NOT_INIT before then it can't +have been used by any path from that point, so any value in r2 (including +another NOT_INIT) is safe. The implementation is in the function regsafe(). +Pruning considers not only the registers but also the stack (and any spilled +registers it may hold). They must all be safe for the branch to be pruned. +This is implemented in states_equal(). + +Understanding eBPF verifier messages +==================================== + +The following are a few examples of invalid eBPF programs and verifier error +messages as seen in the log: + +Program with unreachable instructions:: + + static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }; + +Error:: + + unreachable insn 1 + +Program that reads uninitialized register:: + + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting:: + + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function:: + + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + +Error:: + + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), +
BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check return value of map_lookup_elem() before accessing +map element:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of 'if' branch, but fails +to do so in the other side of 'if' branch:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + 
BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + +Program that performs a socket lookup then sets the pointer to NULL without +checking it:: + + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (b7) r0 = 0 + 9: (95) exit + Unreleased reference id=1, alloc_insn=7 + +Program that performs a socket lookup but does not NULL-check the returned +value:: + + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_EXIT_INSN(), + +Error:: + + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (95) exit + Unreleased reference id=1, alloc_insn=7 diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index 43ef05b91f98..43cdc4d34745 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -6,6 +6,13 @@ Linux Socket Filtering aka Berkeley Packet Filter (BPF) 
======================================================= +Notice +------ + +This file used to document the eBPF format and mechanisms even when not +related to socket filtering. The ../bpf/index.rst has more details +on eBPF. + Introduction ------------ @@ -617,15 +624,11 @@ format with similar underlying principles from BPF described in previous paragraphs is being used. However, the instruction set format is modelled closer to the underlying architecture to mimic native instruction sets, so that a better performance can be achieved (more details later). This new -ISA is called 'eBPF'. (Note: eBPF which +ISA is called eBPF. See the ../bpf/index.rst for details. (Note: eBPF which originates from [e]xtended BPF is not the same as BPF extensions! While eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) -It is designed to be JITed with one to one mapping, which can also open up -the possibility for GCC/LLVM compilers to generate optimized eBPF code through -an eBPF backend that performs almost as fast as natively compiled code. - The new instruction set was originally designed with the possible goal in mind to write programs in "restricted C" and compile into eBPF with a optional GCC/LLVM backend, so that it can just-in-time map to modern 64-bit CPUs with @@ -650,986 +653,6 @@ Currently, the classic BPF format is being used for JITing on most sparc64, arm32, riscv64, riscv32 perform JIT compilation from eBPF instruction set. -Some core changes of the new internal format: - -- Number of registers increase from 2 to 10: - - The old format had two registers A and X, and a hidden frame pointer. The - new layout extends this to be 10 internal registers and a read-only frame - pointer. 
Since 64-bit CPUs are passing arguments to functions via registers - the number of args from eBPF program to in-kernel function is restricted - to 5 and one register is used to accept return value from an in-kernel - function. Natively, x86_64 passes first 6 arguments in registers, aarch64/ - sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved - registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers. - - Therefore, eBPF calling convention is defined as: - - * R0 - return value from in-kernel function, and exit value for eBPF program - * R1 - R5 - arguments from eBPF program to in-kernel function - * R6 - R9 - callee saved registers that in-kernel function will preserve - * R10 - read-only frame pointer to access stack - - Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64, - etc, and eBPF calling convention maps directly to ABIs used by the kernel on - 64-bit architectures. - - On 32-bit architectures JIT may map programs that use only 32-bit arithmetic - and may let more complex programs to be interpreted. - - R0 - R5 are scratch registers and eBPF program needs spill/fill them if - necessary across calls. Note that there is only one eBPF program (== one - eBPF main routine) and it cannot call other eBPF functions, it can only - call predefined in-kernel functions, though. - -- Register width increases from 32-bit to 64-bit: - - Still, the semantics of the original 32-bit ALU operations are preserved - via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower - subregisters that zero-extend into 64-bit if they are being written to. - That behavior maps directly to x86_64 and arm64 subregister definition, but - makes other JITs more difficult. - - 32-bit architectures run 64-bit eBPF programs via interpreter. - Their JITs may convert BPF programs that only use 32-bit subregisters into - native instruction set and let the rest being interpreted. 
- - Operation is 64-bit, because on 64-bit architectures, pointers are also - 64-bit wide, and we want to pass 64-bit values in/out of kernel functions, - so 32-bit eBPF registers would otherwise require to define register-pair - ABI, thus, there won't be able to use a direct eBPF register to HW register - mapping and JIT would need to do combine/split/move operations for every - register in and out of the function, which is complex, bug prone and slow. - Another reason is the use of atomic 64-bit counters. - -- Conditional jt/jf targets replaced with jt/fall-through: - - While the original design has constructs such as ``if (cond) jump_true; - else jump_false;``, they are being replaced into alternative constructs like - ``if (cond) jump_true; /* else fall-through */``. - -- Introduces bpf_call insn and register passing convention for zero overhead - calls from/to other kernel functions: - - Before an in-kernel function call, the eBPF program needs to - place function arguments into R1 to R5 registers to satisfy calling - convention, then the interpreter will take them from registers and pass - to in-kernel function. If R1 - R5 registers are mapped to CPU registers - that are used for argument passing on given architecture, the JIT compiler - doesn't need to emit extra moves. Function arguments will be in the correct - registers and BPF_CALL instruction will be JITed as single 'call' HW - instruction. This calling convention was picked to cover common call - situations without performance penalty. - - After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has - a return value of the function. Since R6 - R9 are callee saved, their state - is preserved across the call. 
- - For example, consider three C functions:: - - u64 f1() { return (*_f2)(1); } - u64 f2(u64 a) { return f3(a + 1, a); } - u64 f3(u64 a, u64 b) { return a - b; } - - GCC can compile f1, f3 into x86_64:: - - f1: - movl $1, %edi - movq _f2(%rip), %rax - jmp *%rax - f3: - movq %rdi, %rax - subq %rsi, %rax - ret - - Function f2 in eBPF may look like:: - - f2: - bpf_mov R2, R1 - bpf_add R1, 1 - bpf_call f3 - bpf_exit - - If f2 is JITed and the pointer stored to ``_f2``. The calls f1 -> f2 -> f3 and - returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to - be used to call into f2. - - For practical reasons all eBPF programs have only one argument 'ctx' which is - already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs - can call kernel functions with up to 5 arguments. Calls with 6 or more arguments - are currently not supported, but these restrictions can be lifted if necessary - in the future. - - On 64-bit architectures all register map to HW registers one to one. For - example, x86_64 JIT compiler can map them as ... - - :: - - R0 - rax - R1 - rdi - R2 - rsi - R3 - rdx - R4 - rcx - R5 - r8 - R6 - rbx - R7 - r13 - R8 - r14 - R9 - r15 - R10 - rbp - - ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing - and rbx, r12 - r15 are callee saved. 
- - Then the following eBPF pseudo-program:: - - bpf_mov R6, R1 /* save ctx */ - bpf_mov R2, 2 - bpf_mov R3, 3 - bpf_mov R4, 4 - bpf_mov R5, 5 - bpf_call foo - bpf_mov R7, R0 /* save foo() return value */ - bpf_mov R1, R6 /* restore ctx for next call */ - bpf_mov R2, 6 - bpf_mov R3, 7 - bpf_mov R4, 8 - bpf_mov R5, 9 - bpf_call bar - bpf_add R0, R7 - bpf_exit - - After JIT to x86_64 may look like:: - - push %rbp - mov %rsp,%rbp - sub $0x228,%rsp - mov %rbx,-0x228(%rbp) - mov %r13,-0x220(%rbp) - mov %rdi,%rbx - mov $0x2,%esi - mov $0x3,%edx - mov $0x4,%ecx - mov $0x5,%r8d - callq foo - mov %rax,%r13 - mov %rbx,%rdi - mov $0x6,%esi - mov $0x7,%edx - mov $0x8,%ecx - mov $0x9,%r8d - callq bar - add %r13,%rax - mov -0x228(%rbp),%rbx - mov -0x220(%rbp),%r13 - leaveq - retq - - Which is in this example equivalent in C to:: - - u64 bpf_filter(u64 ctx) - { - return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); - } - - In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 - arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper - registers and place their return value into ``%rax`` which is R0 in eBPF. - Prologue and epilogue are emitted by JIT and are implicit in the - interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve - them across the calls as defined by calling convention. - - For example the following program is invalid:: - - bpf_mov R1, 1 - bpf_call foo - bpf_mov R0, R1 - bpf_exit - - After the call the registers R1-R5 contain junk values and cannot be read. - An in-kernel eBPF verifier is used to validate eBPF programs. - -Also in the new design, eBPF is limited to 4096 insns, which means that any -program will terminate quickly and will only call a fixed number of kernel -functions. Original BPF and the new format are two operand instructions, -which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT. 
- -The input context pointer for invoking the interpreter function is generic, -its content is defined by a specific use case. For seccomp register R1 points -to seccomp_data, for converted BPF filters R1 points to a skb. - -A program, that is translated internally consists of the following elements:: - - op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 - -So far 87 eBPF instructions were implemented. 8-bit 'op' opcode field -has room for new instructions. Some of them may use 16/24/32 byte encoding. New -instructions must be multiple of 8 bytes to preserve backward compatibility. - -eBPF is a general purpose RISC instruction set. Not every register and -every instruction are used during translation from original BPF to new format. -For example, socket filters are not using ``exclusive add`` instruction, but -tracing filters may do to maintain counters of events, for example. Register R9 -is not used by socket filters either, but more complex filters may be running -out of registers and would have to resort to spill/fill to stack. - -eBPF can be used as a generic assembler for last step performance -optimizations, socket filters and seccomp are using it as assembler. Tracing -filters may use it as assembler to generate code from kernel. In kernel usage -may not be bounded by security considerations, since generated eBPF code -may be optimizing internal code path and not being exposed to the user space. -Safety of eBPF can come from a verifier (TBD). In such use cases as -described, it may be used as safe instruction set. - -Just like the original BPF, the new format runs within a controlled environment, -is deterministic and the kernel can easily prove that. The safety of the program -can be determined in two steps: first step does depth-first-search to disallow -loops and other CFG validation; second step starts from the first insn and -descends all possible paths. 
It simulates execution of every insn and observes -the state change of registers and stack. - -eBPF opcode encoding --------------------- - -eBPF is reusing most of the opcode encoding from classic to simplify conversion -of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' -field is divided into three parts:: - - +----------------+--------+--------------------+ - | 4 bits | 1 bit | 3 bits | - | operation code | source | instruction class | - +----------------+--------+--------------------+ - (MSB) (LSB) - -Three LSB bits store instruction class which is one of: - - =================== =============== - Classic BPF classes eBPF classes - =================== =============== - BPF_LD 0x00 BPF_LD 0x00 - BPF_LDX 0x01 BPF_LDX 0x01 - BPF_ST 0x02 BPF_ST 0x02 - BPF_STX 0x03 BPF_STX 0x03 - BPF_ALU 0x04 BPF_ALU 0x04 - BPF_JMP 0x05 BPF_JMP 0x05 - BPF_RET 0x06 BPF_JMP32 0x06 - BPF_MISC 0x07 BPF_ALU64 0x07 - =================== =============== - -When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... - - :: - - BPF_K 0x00 - BPF_X 0x08 - - * in classic BPF, this means:: - - BPF_SRC(code) == BPF_X - use register X as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - - * in eBPF, this means:: - - BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - -... and four MSB bits store operation code. 
- -If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:: - - BPF_ADD 0x00 - BPF_SUB 0x10 - BPF_MUL 0x20 - BPF_DIV 0x30 - BPF_OR 0x40 - BPF_AND 0x50 - BPF_LSH 0x60 - BPF_RSH 0x70 - BPF_NEG 0x80 - BPF_MOD 0x90 - BPF_XOR 0xa0 - BPF_MOV 0xb0 /* eBPF only: mov reg to reg */ - BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ - BPF_END 0xd0 /* eBPF only: endianness conversion */ - -If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:: - - BPF_JA 0x00 /* BPF_JMP only */ - BPF_JEQ 0x10 - BPF_JGT 0x20 - BPF_JGE 0x30 - BPF_JSET 0x40 - BPF_JNE 0x50 /* eBPF only: jump != */ - BPF_JSGT 0x60 /* eBPF only: signed '>' */ - BPF_JSGE 0x70 /* eBPF only: signed '>=' */ - BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ - BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ - BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ - BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ - BPF_JSLT 0xc0 /* eBPF only: signed '<' */ - BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ - -So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF -and eBPF. There are only two registers in classic BPF, so it means A += X. -In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly, -BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and analogous -src_reg = (u32) src_reg ^ (u32) imm32 in eBPF. - -Classic BPF is using BPF_MISC class to represent A = X and X = A moves. -eBPF is using BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no -BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean -exactly the same operations as BPF_ALU, but with 64-bit wide operands -instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: -dst_reg = dst_reg + src_reg - -Classic BPF wastes the whole BPF_RET class to represent a single ``ret`` -operation. Classic BPF_RET | BPF_K means copy imm32 into return register -and perform function exit. 
eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT -in eBPF means function exit only. The eBPF program needs to store return -value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as -BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide -operands for the comparisons instead. - -For load and store instructions the 8-bit 'code' field is divided as:: - - +--------+--------+-------------------+ - | 3 bits | 2 bits | 3 bits | - | mode | size | instruction class | - +--------+--------+-------------------+ - (MSB) (LSB) - -Size modifier is one of ... - -:: - - BPF_W 0x00 /* word */ - BPF_H 0x08 /* half word */ - BPF_B 0x10 /* byte */ - BPF_DW 0x18 /* eBPF only, double word */ - -... which encodes size of load/store operation:: - - B - 1 byte - H - 2 byte - W - 4 byte - DW - 8 byte (eBPF only) - -Mode modifier is one of:: - - BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 - BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ - BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ - BPF_ATOMIC 0xc0 /* eBPF only, atomic operations */ - -eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and -(BPF_IND | <size> | BPF_LD) which are used to access packet data. - -They had to be carried over from classic to have strong performance of -socket filters running in eBPF interpreter. These instructions can only -be used when interpreter context is a pointer to ``struct sk_buff`` and -have seven implicit operands. Register R6 is an implicit input that must -contain pointer to sk_buff. Register R0 is an implicit output which contains -the data fetched from the packet. Registers R1-R5 are scratch registers -and must not be used to store the data across BPF_ABS | BPF_LD or -BPF_IND | BPF_LD instructions. - -These instructions have implicit program exit condition as well.
When -eBPF program is trying to access the data beyond the packet boundary, -the interpreter will abort the execution of the program. JIT compilers -therefore must preserve this property. src_reg and imm32 fields are -explicit inputs to these instructions. - -For example:: - - BPF_IND | BPF_W | BPF_LD means: - - R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) - and R1 - R5 were scratched. - -Unlike classic BPF instruction set, eBPF has generic load/store operations:: - - BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg - BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32 - BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off) - -Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. - -It also includes atomic operations, which use the immediate field for extra -encoding:: - - .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg - .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg - -The basic atomic operations supported are:: - - BPF_ADD - BPF_AND - BPF_OR - BPF_XOR - -Each having equivalent semantics with the ``BPF_ADD`` example, that is: the -memory location addresed by ``dst_reg + off`` is atomically modified, with -``src_reg`` as the other operand. If the ``BPF_FETCH`` flag is set in the -immediate, then these operations also overwrite ``src_reg`` with the -value that was in memory before it was modified. - -The more special operations are:: - - BPF_XCHG - -This atomically exchanges ``src_reg`` with the value addressed by ``dst_reg + -off``. :: - - BPF_CMPXCHG - -This atomically compares the value addressed by ``dst_reg + off`` with -``R0``. If they match it is replaced with ``src_reg``. In either case, the -value that was there before is zero-extended and loaded back to ``R0``. - -Note that 1 and 2 byte atomic operations are not supported. - -Clang can generate atomic instructions by default when ``-mcpu=v3`` is -enabled.
If a lower version for ``-mcpu`` is set, the only atomic instruction -Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable -the atomics features, while keeping a lower ``-mcpu`` version, you can use -``-Xclang -target-feature -Xclang +alu32``. - -You may encounter ``BPF_XADD`` - this is a legacy name for ``BPF_ATOMIC``, -referring to the exclusive-add operation encoded when the immediate field is -zero. - -eBPF has one 16-byte instruction: ``BPF_LD | BPF_DW | BPF_IMM`` which consists -of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single -instruction that loads 64-bit immediate value into a dst_reg. -Classic BPF has similar instruction: ``BPF_LD | BPF_W | BPF_IMM`` which loads -32-bit immediate value into a register. - -eBPF verifier -------------- -The safety of the eBPF program is determined in two steps. - -First step does DAG check to disallow loops and other CFG validation. -In particular it will detect programs that have unreachable instructions. -(though classic BPF checker allows them) - -Second step starts from the first insn and descends all possible paths. -It simulates execution of every insn and observes the state change of -registers and stack. - -At the start of the program the register R1 contains a pointer to context -and has type PTR_TO_CTX. -If verifier sees an insn that does R2=R1, then R2 has now type -PTR_TO_CTX as well and can be used on the right hand side of expression. -If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE, -since addition of two valid pointers makes invalid pointer. -(In 'secure' mode verifier will reject any type of pointer arithmetic to make -sure that kernel addresses don't leak to unprivileged users) - -If register was never written to, it's not readable:: - - bpf_mov R0 = R2 - bpf_exit - -will be rejected, since R2 is unreadable at the start of the program. - -After kernel function call, R1-R5 are reset to unreadable and -R0 has a return type of the function. 
- -Since R6-R9 are callee saved, their state is preserved across the call. - -:: - - bpf_mov R6 = 1 - bpf_call foo - bpf_mov R0 = R6 - bpf_exit - -is a correct program. If there was R1 instead of R6, it would have -been rejected. - -load/store instructions are allowed only with registers of valid types, which -are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. -For example:: - - bpf_mov R1 = 1 - bpf_mov R2 = 2 - bpf_xadd *(u32 *)(R1 + 3) += R2 - bpf_exit - -will be rejected, since R1 doesn't have a valid pointer type at the time of -execution of instruction bpf_xadd. - -At the start R1 type is PTR_TO_CTX (a pointer to generic ``struct bpf_context``) -A callback is used to customize verifier to restrict eBPF program access to only -certain fields within ctx structure with specified size and alignment. - -For example, the following insn:: - - bpf_ld R0 = *(u32 *)(R6 + 8) - -intends to load a word from address R6 + 8 and store it into R0 -If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know -that offset 8 of size 4 bytes can be accessed for reading, otherwise -the verifier will reject the program. -If R6=PTR_TO_STACK, then access should be aligned and be within -stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, -so it will fail verification, since it's out of bounds. - -The verifier will allow eBPF program to read data from stack only after -it wrote into it. - -Classic BPF verifier does similar check with M[0-15] memory slots. -For example:: - - bpf_ld R0 = *(u32 *)(R10 - 4) - bpf_exit - -is invalid program. -Though R10 is correct read-only register and has type PTR_TO_STACK -and R10 - 4 is within stack bounds, there were no stores into that location. - -Pointer register spill/fill is tracked as well, since four (R6-R9) -callee saved registers may not be enough for some programs. 
- -Allowed function calls are customized with bpf_verifier_ops->get_func_proto() -The eBPF verifier will check that registers match argument constraints. -After the call register R0 will be set to return type of the function. - -Function calls is a main mechanism to extend functionality of eBPF programs. -Socket filters may let programs to call one set of functions, whereas tracing -filters may allow completely different set. - -If a function made accessible to eBPF program, it needs to be thought through -from safety point of view. The verifier will guarantee that the function is -called with valid arguments. - -seccomp vs socket filters have different security restrictions for classic BPF. -Seccomp solves this by two stage verifier: classic BPF verifier is followed -by seccomp verifier. In case of eBPF one configurable verifier is shared for -all use cases. - -See details of eBPF verifier in kernel/bpf/verifier.c - -Register value tracking ------------------------ -In order to determine the safety of an eBPF program, the verifier must track -the range of possible values in each register and also in each stack slot. -This is done with ``struct bpf_reg_state``, defined in include/linux/ -bpf_verifier.h, which unifies tracking of scalar and pointer values. Each -register state has a type, which is either NOT_INIT (the register has not been -written to), SCALAR_VALUE (some value which is not usable as a pointer), or a -pointer type. The types of pointers describe their base, as follows: - - - PTR_TO_CTX - Pointer to bpf_context. - CONST_PTR_TO_MAP - Pointer to struct bpf_map. "Const" because arithmetic - on these pointers is forbidden. - PTR_TO_MAP_VALUE - Pointer to the value stored in a map element. - PTR_TO_MAP_VALUE_OR_NULL - Either a pointer to a map value, or NULL; map accesses - (see maps.rst) return this type, which becomes a - a PTR_TO_MAP_VALUE when checked != NULL. Arithmetic on - these pointers is forbidden. - PTR_TO_STACK - Frame pointer. 
- PTR_TO_PACKET - skb->data. - PTR_TO_PACKET_END - skb->data + headlen; arithmetic forbidden. - PTR_TO_SOCKET - Pointer to struct bpf_sock_ops, implicitly refcounted. - PTR_TO_SOCKET_OR_NULL - Either a pointer to a socket, or NULL; socket lookup - returns this type, which becomes a PTR_TO_SOCKET when - checked != NULL. PTR_TO_SOCKET is reference-counted, - so programs must release the reference through the - socket release function before the end of the program. - Arithmetic on these pointers is forbidden. - -However, a pointer may be offset from this base (as a result of pointer -arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable -offset'. The former is used when an exactly-known value (e.g. an immediate -operand) is added to a pointer, while the latter is used for values which are -not exactly known. The variable offset is also used in SCALAR_VALUEs, to track -the range of possible values in the register. - -The verifier's knowledge about the variable offset consists of: - -* minimum and maximum values as unsigned -* minimum and maximum values as signed - -* knowledge of the values of individual bits, in the form of a 'tnum': a u64 - 'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; - 1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both - mask and value; no bit should ever be 1 in both. For example, if a byte is read - into a register from memory, the register's top 56 bits are known zero, while - the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we - then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; - 0x1ff), because of potential carries. - -Besides arithmetic, the register state can also be updated by conditional -branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch -it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false' -branch it will have a umax_value of 8. 
A signed compare (with BPF_JSGT or -BPF_JSGE) would instead update the signed minimum/maximum values. Information -from the signed and unsigned bounds can be combined; for instance if a value is -first tested < 8 and then tested s> 4, the verifier will conclude that the value -is also > 4 and s< 8, since the bounds prevent crossing the sign boundary. - -PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all -pointers sharing that same variable offset. This is important for packet range -checks: after adding a variable to a packet pointer register A, if you then copy -it to another register B and then add a constant 4 to A, both registers will -share the same 'id' but the A will have a fixed offset of +4. Then if A is -bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is -now known to have a safe range of at least 4 bytes. See 'Direct packet access', -below, for more on PTR_TO_PACKET ranges. - -The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of -the pointer returned from a map lookup. This means that when one copy is -checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs. -As well as range-checking, the tracked information is also used for enforcing -alignment of pointer accesses. For instance, on most systems the packet pointer -is 2 bytes after a 4-byte alignment. If a program adds 14 bytes to that to jump -over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting -pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 -bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through -that pointer are safe. -The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common -to all copies of the pointer returned from a socket lookup. This has similar -behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but -it also handles reference tracking for the pointer. 
PTR_TO_SOCKET implicitly -represents a reference to the corresponding ``struct sock``. To ensure that the -reference is not leaked, it is imperative to NULL-check the reference and in -the non-NULL case, and pass the valid reference to the socket release function. - -Direct packet access --------------------- -In cls_bpf and act_bpf programs the verifier allows direct access to the packet -data via skb->data and skb->data_end pointers. -Ex:: - - 1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ - 2: r3 = *(u32 *)(r1 +76) /* load skb->data */ - 3: r5 = r3 - 4: r5 += 14 - 5: if r5 > r4 goto pc+16 - R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp - 6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ - -this 2byte load from the packet is safe to do, since the program author -did check ``if (skb->data + 14 > skb->data_end) goto err`` at insn #5 which -means that in the fall-through case the register R3 (which points to skb->data) -has at least 14 directly accessible bytes. The verifier marks it -as R3=pkt(id=0,off=0,r=14). -id=0 means that no additional variables were added to the register. -off=0 means that no additional constants were added. -r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. -Note that R5 is marked as R5=pkt(id=0,off=14,r=14). It also points -to the packet data, but constant 14 was added to the register, so -it now points to ``skb->data + 14`` and accessible range is [R5, R5 + 14 - 14) -which is zero bytes. 
- -More complex packet access may look like:: - - - R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp - 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ - 7: r4 = *(u8 *)(r3 +12) - 8: r4 *= 14 - 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ - 10: r3 += r4 - 11: r2 = r1 - 12: r2 <<= 48 - 13: r2 >>= 48 - 14: r3 += r2 - 15: r2 = r3 - 16: r2 += 8 - 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ - 18: if r2 > r1 goto pc+2 - R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp - 19: r1 = *(u8 *)(r3 +4) - -The state of the register R3 is R3=pkt(id=2,off=0,r=8) -id=2 means that two ``r3 += rX`` instructions were seen, so r3 points to some -offset within a packet and since the program author did -``if (r3 + 8 > r1) goto err`` at insn #18, the safe range is [R3, R3 + 8). -The verifier only allows 'add'/'sub' operations on packet registers. Any other -operation will set the register state to 'SCALAR_VALUE' and it won't be -available for direct packet access. - -Operation ``r3 += rX`` may overflow and become less than original skb->data, -therefore the verifier has to prevent that. So when it sees ``r3 += rX`` -instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 -against skb->data_end will not give us 'range' information, so attempts to read -through the pointer will give "invalid access to packet" error. - -Ex. after insn ``r4 = *(u8 *)(r3 +12)`` (insn #7 above) the state of r4 is -R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits -of the register are guaranteed to be zero, and nothing is known about the lower -8 bits. 
After insn ``r4 *= 14`` the state becomes -R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit -value by constant 14 will keep upper 52 bits as zero, also the least significant -bit will be zero as 14 is even. Similarly ``r2 >>= 48`` will make -R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign -extending. This logic is implemented in adjust_reg_min_max_vals() function, -which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice -versa) and adjust_scalar_min_max_vals() for operations on two scalars. - -The end result is that bpf program author can access packet directly -using normal C code as:: - - void *data = (void *)(long)skb->data; - void *data_end = (void *)(long)skb->data_end; - struct eth_hdr *eth = data; - struct iphdr *iph = data + sizeof(*eth); - struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); - - if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) - return 0; - if (eth->h_proto != htons(ETH_P_IP)) - return 0; - if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) - return 0; - if (udp->dest == 53 || udp->source == 9) - ...; - -which makes such programs easier to write comparing to LD_ABS insn -and significantly faster. - -Pruning -------- -The verifier does not actually walk all possible paths through the program. For -each new branch to analyse, the verifier looks at all the states it's previously -been in when at this instruction. If any of them contain the current state as a -subset, the branch is 'pruned' - that is, the fact that the previous state was -accepted implies the current state would be as well. For instance, if in the -previous state, r1 held a packet-pointer, and in the current state, r1 holds a -packet-pointer with a range as long or longer and at least as strict an -alignment, then r1 is safe. 
Similarly, if r2 was NOT_INIT before then it can't -have been used by any path from that point, so any value in r2 (including -another NOT_INIT) is safe. The implementation is in the function regsafe(). -Pruning considers not only the registers but also the stack (and any spilled -registers it may hold). They must all be safe for the branch to be pruned. -This is implemented in states_equal(). - -Understanding eBPF verifier messages ------------------------------------- - -The following are few examples of invalid eBPF programs and verifier error -messages as seen in the log: - -Program with unreachable instructions:: - - static struct bpf_insn prog[] = { - BPF_EXIT_INSN(), - BPF_EXIT_INSN(), - }; - -Error: - - unreachable insn 1 - -Program that reads uninitialized register:: - - BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), - BPF_EXIT_INSN(), - -Error:: - - 0: (bf) r0 = r2 - R2 !read_ok - -Program that doesn't initialize R0 before exiting:: - - BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), - BPF_EXIT_INSN(), - -Error:: - - 0: (bf) r2 = r1 - 1: (95) exit - R0 !read_ok - -Program that accesses stack out of bounds:: - - BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), - BPF_EXIT_INSN(), - -Error:: - - 0: (7a) *(u64 *)(r10 +8) = 0 - invalid stack off=8 size=8 - -Program that doesn't initialize stack before passing its address into function:: - - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_EXIT_INSN(), - -Error:: - - 0: (bf) r2 = r10 - 1: (07) r2 += -8 - 2: (b7) r1 = 0x0 - 3: (85) call 1 - invalid indirect read from stack off -8+0 size 8 - -Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:: - - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - 
BPF_EXIT_INSN(), - -Error:: - - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 0x0 - 4: (85) call 1 - fd 0 is not pointing to valid bpf_map - -Program that doesn't check return value of map_lookup_elem() before accessing -map element:: - - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - BPF_EXIT_INSN(), - -Error:: - - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 0x0 - 4: (85) call 1 - 5: (7a) *(u64 *)(r0 +0) = 0 - R0 invalid mem access 'map_value_or_null' - -Program that correctly checks map_lookup_elem() returned value for NULL, but -accesses the memory with incorrect alignment:: - - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), - BPF_EXIT_INSN(), - -Error:: - - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 1 - 4: (85) call 1 - 5: (15) if r0 == 0x0 goto pc+1 - R0=map_ptr R10=fp - 6: (7a) *(u64 *)(r0 +4) = 0 - misaligned access off 4 size 8 - -Program that correctly checks map_lookup_elem() returned value for NULL and -accesses memory with correct alignment in one side of 'if' branch, but fails -to do so in the other side of 'if' branch:: - - BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), - BPF_EXIT_INSN(), - BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), - 
BPF_EXIT_INSN(), - -Error:: - - 0: (7a) *(u64 *)(r10 -8) = 0 - 1: (bf) r2 = r10 - 2: (07) r2 += -8 - 3: (b7) r1 = 1 - 4: (85) call 1 - 5: (15) if r0 == 0x0 goto pc+2 - R0=map_ptr R10=fp - 6: (7a) *(u64 *)(r0 +0) = 0 - 7: (95) exit - - from 5 to 8: R0=imm0 R10=fp - 8: (7a) *(u64 *)(r0 +0) = 1 - R0 invalid mem access 'imm' - -Program that performs a socket lookup then sets the pointer to NULL without -checking it:: - - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 0), - BPF_MOV64_IMM(BPF_REG_5, 0), - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - -Error:: - - 0: (b7) r2 = 0 - 1: (63) *(u32 *)(r10 -8) = r2 - 2: (bf) r2 = r10 - 3: (07) r2 += -8 - 4: (b7) r3 = 4 - 5: (b7) r4 = 0 - 6: (b7) r5 = 0 - 7: (85) call bpf_sk_lookup_tcp#65 - 8: (b7) r0 = 0 - 9: (95) exit - Unreleased reference id=1, alloc_insn=7 - -Program that performs a socket lookup but does not NULL-check the returned -value:: - - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), - BPF_MOV64_IMM(BPF_REG_3, 4), - BPF_MOV64_IMM(BPF_REG_4, 0), - BPF_MOV64_IMM(BPF_REG_5, 0), - BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), - BPF_EXIT_INSN(), - -Error:: - - 0: (b7) r2 = 0 - 1: (63) *(u32 *)(r10 -8) = r2 - 2: (bf) r2 = r10 - 3: (07) r2 += -8 - 4: (b7) r3 = 4 - 5: (b7) r4 = 0 - 6: (b7) r5 = 0 - 7: (85) call bpf_sk_lookup_tcp#65 - 8: (95) exit - Unreleased reference id=1, alloc_insn=7 - Testing ------- From e6f2dd0f80674e9d5960337b3e9c2a242441b326 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:19 -0800 Subject: [PATCH 050/115] bpf: Add bpf_loop helper This patch adds the kernel-side and API changes for a new helper function, bpf_loop: long bpf_loop(u32 nr_loops, void *callback_fn, 
void *callback_ctx, u64 flags); where long (*callback_fn)(u32 index, void *ctx); bpf_loop invokes the "callback_fn" **nr_loops** times or until the callback_fn returns 1. The callback_fn can only return 0 or 1, and this is enforced by the verifier. The callback_fn index is zero-indexed. A few things to note: ~ The "u64 flags" parameter is currently unused but is included in case a future use case for it arises. ~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c), bpf_callback_t is used as the callback function cast. ~ A program can have nested bpf_loop calls but the program must still adhere to the verifier constraint of its stack depth (the stack depth cannot exceed MAX_BPF_STACK) ~ Recursive callback_fns do not pass the verifier, due to the call stack for these being too deep. ~ The next patch will include the tests and benchmark Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 25 ++++++++++ kernel/bpf/bpf_iter.c | 35 ++++++++++++++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 88 +++++++++++++++++++++------------- tools/include/uapi/linux/bpf.h | 25 ++++++++++ 6 files changed, 142 insertions(+), 34 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc7a0c36e7df..cad0829710be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2164,6 +2164,7 @@ extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; +extern const struct bpf_func_proto bpf_loop_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index
a69e4b04ffeb..211b43afd0fb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4957,6 +4957,30 @@ union bpf_attr { * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. * **-EBUSY** if failed to try lock mmap_lock. * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5140,6 +5164,7 @@ union bpf_attr { FN(skc_to_unix_sock), \ FN(kallsyms_lookup_name), \ FN(find_vma), \ + FN(loop), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b2ee45064e06..b7aef5b3416d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; + +/* maximum number of loops */ +#define MAX_LOOPS BIT(23) + +BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, + u64, flags) +{ + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 ret; + u32 i; + + if (flags) + return -EINVAL; + if (nr_loops > MAX_LOOPS) + return -E2BIG; + + for (i = 0; i < nr_loops; i++) { + ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + return i + 1; + } + + return i; +} + +const struct bpf_func_proto bpf_loop_proto = { + .func = bpf_loop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..52188004a9c3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1378,6 +1378,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_query_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0763cca139a7..d7678d8a925c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6085,6 +6085,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int 
set_loop_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, + * u64 flags); + * callback_fn(u32 index, void *callback_ctx); + */ + callee->regs[BPF_REG_1].type = SCALAR_VALUE; + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int set_timer_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -6458,13 +6479,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (func_id == BPF_FUNC_tail_call) { - err = check_reference_leak(env); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); - return err; - } - } else if (is_release_function(func_id)) { + if (is_release_function(func_id)) { err = release_reference(env, meta.ref_obj_id); if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", @@ -6475,41 +6490,46 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); - /* check that flags argument in get_local_storage(map, flags) is 0, - * this is required because get_local_storage() can't return an error. 
- */ - if (func_id == BPF_FUNC_get_local_storage && - !register_is_null(&regs[BPF_REG_2])) { - verbose(env, "get_local_storage() doesn't support non-zero flags\n"); - return -EINVAL; - } - - if (func_id == BPF_FUNC_for_each_map_elem) { + switch (func_id) { + case BPF_FUNC_tail_call: + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + break; + case BPF_FUNC_get_local_storage: + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (!register_is_null(&regs[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + break; + case BPF_FUNC_for_each_map_elem: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_map_elem_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_timer_set_callback) { + break; + case BPF_FUNC_timer_set_callback: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_timer_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_find_vma) { + break; + case BPF_FUNC_find_vma: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_find_vma_callback_state); - if (err < 0) - return -EINVAL; + break; + case BPF_FUNC_snprintf: + err = check_bpf_snprintf_call(env, regs); + break; + case BPF_FUNC_loop: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_loop_callback_state); + break; } - if (func_id == BPF_FUNC_snprintf) { - err = check_bpf_snprintf_call(env, regs); - if (err < 0) - return err; - } + if (err) + return err; /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a69e4b04ffeb..211b43afd0fb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4957,6 +4957,30 @@ union bpf_attr { *
**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. * **-EBUSY** if failed to try lock mmap_lock. * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5140,6 +5164,7 @@ union bpf_attr { FN(skc_to_unix_sock), \ FN(kallsyms_lookup_name), \ FN(find_vma), \ + FN(loop), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper From 4e5070b64b375a9c1f570893cfceeba108382bef Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:20 -0800 Subject: [PATCH 051/115] selftests/bpf: Add bpf_loop test Add test for bpf_loop testing a variety of cases: various nr_loops, null callback ctx, invalid flags, nested callbacks. 
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-3-joannekoong@fb.com --- .../selftests/bpf/prog_tests/bpf_loop.c | 145 ++++++++++++++++++ tools/testing/selftests/bpf/progs/bpf_loop.c | 112 ++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_loop.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_loop.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_loop.c b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c new file mode 100644 index 000000000000..380d7a2072e3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_loop.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include +#include "bpf_loop.skel.h" + +static void check_nr_loops(struct bpf_loop *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.test_prog); + if (!ASSERT_OK_PTR(link, "link")) + return; + + /* test 0 loops */ + skel->bss->nr_loops = 0; + + usleep(1); + + ASSERT_EQ(skel->bss->nr_loops_returned, skel->bss->nr_loops, + "0 loops"); + + /* test 500 loops */ + skel->bss->nr_loops = 500; + + usleep(1); + + ASSERT_EQ(skel->bss->nr_loops_returned, skel->bss->nr_loops, + "500 loops"); + ASSERT_EQ(skel->bss->g_output, (500 * 499) / 2, "g_output"); + + /* test exceeding the max limit */ + skel->bss->nr_loops = -1; + + usleep(1); + + ASSERT_EQ(skel->bss->err, -E2BIG, "over max limit"); + + bpf_link__destroy(link); +} + +static void check_callback_fn_stop(struct bpf_loop *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach(skel->progs.test_prog); + if (!ASSERT_OK_PTR(link, "link")) + return; + + /* testing that loop is stopped when callback_fn returns 1 */ + skel->bss->nr_loops = 400; + skel->data->stop_index = 50; + + usleep(1); + + ASSERT_EQ(skel->bss->nr_loops_returned, skel->data->stop_index + 1, + "nr_loops_returned"); + 
ASSERT_EQ(skel->bss->g_output, (50 * 49) / 2, + "g_output"); + + bpf_link__destroy(link); +} + +static void check_null_callback_ctx(struct bpf_loop *skel) +{ + struct bpf_link *link; + + /* check that user is able to pass in a null callback_ctx */ + link = bpf_program__attach(skel->progs.prog_null_ctx); + if (!ASSERT_OK_PTR(link, "link")) + return; + + skel->bss->nr_loops = 10; + + usleep(1); + + ASSERT_EQ(skel->bss->nr_loops_returned, skel->bss->nr_loops, + "nr_loops_returned"); + + bpf_link__destroy(link); +} + +static void check_invalid_flags(struct bpf_loop *skel) +{ + struct bpf_link *link; + + /* check that passing in non-zero flags returns -EINVAL */ + link = bpf_program__attach(skel->progs.prog_invalid_flags); + if (!ASSERT_OK_PTR(link, "link")) + return; + + usleep(1); + + ASSERT_EQ(skel->bss->err, -EINVAL, "err"); + + bpf_link__destroy(link); +} + +static void check_nested_calls(struct bpf_loop *skel) +{ + __u32 nr_loops = 100, nested_callback_nr_loops = 4; + struct bpf_link *link; + + /* check that nested calls are supported */ + link = bpf_program__attach(skel->progs.prog_nested_calls); + if (!ASSERT_OK_PTR(link, "link")) + return; + + skel->bss->nr_loops = nr_loops; + skel->bss->nested_callback_nr_loops = nested_callback_nr_loops; + + usleep(1); + + ASSERT_EQ(skel->bss->nr_loops_returned, nr_loops * nested_callback_nr_loops + * nested_callback_nr_loops, "nr_loops_returned"); + ASSERT_EQ(skel->bss->g_output, (4 * 3) / 2 * nested_callback_nr_loops + * nr_loops, "g_output"); + + bpf_link__destroy(link); +} + +void test_bpf_loop(void) +{ + struct bpf_loop *skel; + + skel = bpf_loop__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_loop__open_and_load")) + return; + + skel->bss->pid = getpid(); + + if (test__start_subtest("check_nr_loops")) + check_nr_loops(skel); + if (test__start_subtest("check_callback_fn_stop")) + check_callback_fn_stop(skel); + if (test__start_subtest("check_null_callback_ctx")) + check_null_callback_ctx(skel); + if 
(test__start_subtest("check_invalid_flags")) + check_invalid_flags(skel); + if (test__start_subtest("check_nested_calls")) + check_nested_calls(skel); + + bpf_loop__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_loop.c b/tools/testing/selftests/bpf/progs/bpf_loop.c new file mode 100644 index 000000000000..12349e4601e8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_loop.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct callback_ctx { + int output; +}; + +/* These should be set by the user program */ +u32 nested_callback_nr_loops; +u32 stop_index = -1; +u32 nr_loops; +int pid; + +/* Making these global variables so that the userspace program + * can verify the output through the skeleton + */ +int nr_loops_returned; +int g_output; +int err; + +static int callback(__u32 index, void *data) +{ + struct callback_ctx *ctx = data; + + if (index >= stop_index) + return 1; + + ctx->output += index; + + return 0; +} + +static int empty_callback(__u32 index, void *data) +{ + return 0; +} + +static int nested_callback2(__u32 index, void *data) +{ + nr_loops_returned += bpf_loop(nested_callback_nr_loops, callback, data, 0); + + return 0; +} + +static int nested_callback1(__u32 index, void *data) +{ + bpf_loop(nested_callback_nr_loops, nested_callback2, data, 0); + return 0; +} + +SEC("fentry/__x64_sys_nanosleep") +int test_prog(void *ctx) +{ + struct callback_ctx data = {}; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + nr_loops_returned = bpf_loop(nr_loops, callback, &data, 0); + + if (nr_loops_returned < 0) + err = nr_loops_returned; + else + g_output = data.output; + + return 0; +} + +SEC("fentry/__x64_sys_nanosleep") +int prog_null_ctx(void *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + nr_loops_returned = bpf_loop(nr_loops, empty_callback, NULL, 0); + + return 0; +} + 
+SEC("fentry/__x64_sys_nanosleep") +int prog_invalid_flags(void *ctx) +{ + struct callback_ctx data = {}; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + err = bpf_loop(nr_loops, callback, &data, 1); + + return 0; +} + +SEC("fentry/__x64_sys_nanosleep") +int prog_nested_calls(void *ctx) +{ + struct callback_ctx data = {}; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + nr_loops_returned = 0; + bpf_loop(nr_loops, nested_callback1, &data, 0); + + g_output = data.output; + + return 0; +} From f6e659b7f97c76d0471d12bf274ea2a097cf3c5c Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:21 -0800 Subject: [PATCH 052/115] selftests/bpf: Measure bpf_loop verifier performance This patch tests bpf_loop in pyperf and strobemeta, and measures the verifier performance of replacing the traditional for loop with bpf_loop. The results are as follows: ~strobemeta~ Baseline verification time 6808200 usec stack depth 496 processed 554252 insns (limit 1000000) max_states_per_insn 16 total_states 15878 peak_states 13489 mark_read 3110 #192 verif_scale_strobemeta:OK (unrolled loop) Using bpf_loop verification time 31589 usec stack depth 96+400 processed 1513 insns (limit 1000000) max_states_per_insn 2 total_states 106 peak_states 106 mark_read 60 #193 verif_scale_strobemeta_bpf_loop:OK ~pyperf600~ Baseline verification time 29702486 usec stack depth 368 processed 626838 insns (limit 1000000) max_states_per_insn 7 total_states 30368 peak_states 30279 mark_read 748 #182 verif_scale_pyperf600:OK (unrolled loop) Using bpf_loop verification time 148488 usec stack depth 320+40 processed 10518 insns (limit 1000000) max_states_per_insn 10 total_states 705 peak_states 517 mark_read 38 #183 verif_scale_pyperf600_bpf_loop:OK Using the bpf_loop helper led to approximately a 99% decrease in the verification time and in the number of instructions. 
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-4-joannekoong@fb.com --- .../bpf/prog_tests/bpf_verif_scale.c | 12 +++ tools/testing/selftests/bpf/progs/pyperf.h | 71 +++++++++++++++++- .../selftests/bpf/progs/pyperf600_bpf_loop.c | 6 ++ .../testing/selftests/bpf/progs/strobemeta.h | 75 ++++++++++++++++++- .../selftests/bpf/progs/strobemeta_bpf_loop.c | 9 +++ 5 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c create mode 100644 tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index 27f5d8ea7964..1fb16f8dad56 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -115,6 +115,12 @@ void test_verif_scale_pyperf600() scale_test("pyperf600.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } +void test_verif_scale_pyperf600_bpf_loop(void) +{ + /* use the bpf_loop helper*/ + scale_test("pyperf600_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); +} + void test_verif_scale_pyperf600_nounroll() { /* no unroll at all. 
@@ -165,6 +171,12 @@ void test_verif_scale_strobemeta() scale_test("strobemeta.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); } +void test_verif_scale_strobemeta_bpf_loop(void) +{ + /* use the bpf_loop helper*/ + scale_test("strobemeta_bpf_loop.o", BPF_PROG_TYPE_RAW_TRACEPOINT, false); +} + void test_verif_scale_strobemeta_nounroll1() { /* no unroll, tiny loops */ diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h index 2fb7adafb6b6..1ed28882daf3 100644 --- a/tools/testing/selftests/bpf/progs/pyperf.h +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -159,6 +159,59 @@ struct { __uint(value_size, sizeof(long long) * 127); } stackmap SEC(".maps"); +#ifdef USE_BPF_LOOP +struct process_frame_ctx { + int cur_cpu; + int32_t *symbol_counter; + void *frame_ptr; + FrameData *frame; + PidData *pidData; + Symbol *sym; + Event *event; + bool done; +}; + +#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var)) + +static int process_frame_callback(__u32 i, struct process_frame_ctx *ctx) +{ + int zero = 0; + void *frame_ptr = ctx->frame_ptr; + PidData *pidData = ctx->pidData; + FrameData *frame = ctx->frame; + int32_t *symbol_counter = ctx->symbol_counter; + int cur_cpu = ctx->cur_cpu; + Event *event = ctx->event; + Symbol *sym = ctx->sym; + + if (frame_ptr && get_frame_data(frame_ptr, pidData, frame, sym)) { + int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu; + int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, sym); + + if (!symbol_id) { + bpf_map_update_elem(&symbolmap, sym, &zero, 0); + symbol_id = bpf_map_lookup_elem(&symbolmap, sym); + if (!symbol_id) { + ctx->done = true; + return 1; + } + } + if (*symbol_id == new_symbol_id) + (*symbol_counter)++; + + barrier_var(i); + if (i >= STACK_MAX_LEN) + return 1; + + event->stack[i] = *symbol_id; + + event->stack_len = i + 1; + frame_ptr = frame->f_back; + } + return 0; +} +#endif /* USE_BPF_LOOP */ + #ifdef GLOBAL_FUNC __noinline #elif defined(SUBPROGS) @@ -228,11 
+281,26 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx) int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym); if (symbol_counter == NULL) return 0; +#ifdef USE_BPF_LOOP + struct process_frame_ctx ctx = { + .cur_cpu = cur_cpu, + .symbol_counter = symbol_counter, + .frame_ptr = frame_ptr, + .frame = &frame, + .pidData = pidData, + .sym = &sym, + .event = event, + }; + + bpf_loop(STACK_MAX_LEN, process_frame_callback, &ctx, 0); + if (ctx.done) + return 0; +#else #ifdef NO_UNROLL #pragma clang loop unroll(disable) #else #pragma clang loop unroll(full) -#endif +#endif /* NO_UNROLL */ /* Unwind python stack */ for (int i = 0; i < STACK_MAX_LEN; ++i) { if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) { @@ -251,6 +319,7 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx) frame_ptr = frame.f_back; } } +#endif /* USE_BPF_LOOP */ event->stack_complete = frame_ptr == NULL; } else { event->stack_complete = 1; diff --git a/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c b/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c new file mode 100644 index 000000000000..5c2059dc01af --- /dev/null +++ b/tools/testing/selftests/bpf/progs/pyperf600_bpf_loop.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#define STACK_MAX_LEN 600 +#define USE_BPF_LOOP +#include "pyperf.h" diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index 60c93aee2f4a..753718595c26 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -445,6 +445,48 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, return payload; } +#ifdef USE_BPF_LOOP +enum read_type { + READ_INT_VAR, + READ_MAP_VAR, + READ_STR_VAR, +}; + +struct read_var_ctx { + struct strobemeta_payload *data; + void *tls_base; + struct strobemeta_cfg *cfg; + void *payload; + /* value gets mutated */ + struct strobe_value_generic 
*value; + enum read_type type; +}; + +static int read_var_callback(__u32 index, struct read_var_ctx *ctx) +{ + switch (ctx->type) { + case READ_INT_VAR: + if (index >= STROBE_MAX_INTS) + return 1; + read_int_var(ctx->cfg, index, ctx->tls_base, ctx->value, ctx->data); + break; + case READ_MAP_VAR: + if (index >= STROBE_MAX_MAPS) + return 1; + ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, + ctx->value, ctx->data, ctx->payload); + break; + case READ_STR_VAR: + if (index >= STROBE_MAX_STRS) + return 1; + ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, + ctx->value, ctx->data, ctx->payload); + break; + } + return 0; +} +#endif /* USE_BPF_LOOP */ + /* * read_strobe_meta returns NULL, if no metadata was read; otherwise returns * pointer to *right after* payload ends @@ -475,11 +517,36 @@ static void *read_strobe_meta(struct task_struct *task, */ tls_base = (void *)task; +#ifdef USE_BPF_LOOP + struct read_var_ctx ctx = { + .cfg = cfg, + .tls_base = tls_base, + .value = &value, + .data = data, + .payload = payload, + }; + int err; + + ctx.type = READ_INT_VAR; + err = bpf_loop(STROBE_MAX_INTS, read_var_callback, &ctx, 0); + if (err != STROBE_MAX_INTS) + return NULL; + + ctx.type = READ_STR_VAR; + err = bpf_loop(STROBE_MAX_STRS, read_var_callback, &ctx, 0); + if (err != STROBE_MAX_STRS) + return NULL; + + ctx.type = READ_MAP_VAR; + err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); + if (err != STROBE_MAX_MAPS) + return NULL; +#else #ifdef NO_UNROLL #pragma clang loop unroll(disable) #else #pragma unroll -#endif +#endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_INTS; ++i) { read_int_var(cfg, i, tls_base, &value, data); } @@ -487,7 +554,7 @@ static void *read_strobe_meta(struct task_struct *task, #pragma clang loop unroll(disable) #else #pragma unroll -#endif +#endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_STRS; ++i) { payload += read_str_var(cfg, i, tls_base, &value, data, payload); } @@ -495,10 +562,12 @@ static void 
*read_strobe_meta(struct task_struct *task, #pragma clang loop unroll(disable) #else #pragma unroll -#endif +#endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_MAPS; ++i) { payload = read_map_var(cfg, i, tls_base, &value, data, payload); } +#endif /* USE_BPF_LOOP */ + /* * return pointer right after end of payload, so it's possible to * calculate exact amount of useful data that needs to be sent diff --git a/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c b/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c new file mode 100644 index 000000000000..d18b992f0165 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/strobemeta_bpf_loop.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ + +#define STROBE_MAX_INTS 2 +#define STROBE_MAX_STRS 25 +#define STROBE_MAX_MAPS 100 +#define STROBE_MAX_MAP_ENTRIES 20 +#define USE_BPF_LOOP +#include "strobemeta.h" From ec151037af4f56065d5b258af82f13dbbf279ebd Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:22 -0800 Subject: [PATCH 053/115] selftest/bpf/benchs: Add bpf_loop benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add benchmark to measure the throughput and latency of the bpf_loop call. 
Testing this on my dev machine on 1 thread, the data is as follows: nr_loops: 10 bpf_loop - throughput: 198.519 ± 0.155 M ops/s, latency: 5.037 ns/op nr_loops: 100 bpf_loop - throughput: 247.448 ± 0.305 M ops/s, latency: 4.041 ns/op nr_loops: 500 bpf_loop - throughput: 260.839 ± 0.380 M ops/s, latency: 3.834 ns/op nr_loops: 1000 bpf_loop - throughput: 262.806 ± 0.629 M ops/s, latency: 3.805 ns/op nr_loops: 5000 bpf_loop - throughput: 264.211 ± 1.508 M ops/s, latency: 3.785 ns/op nr_loops: 10000 bpf_loop - throughput: 265.366 ± 3.054 M ops/s, latency: 3.768 ns/op nr_loops: 50000 bpf_loop - throughput: 235.986 ± 20.205 M ops/s, latency: 4.238 ns/op nr_loops: 100000 bpf_loop - throughput: 264.482 ± 0.279 M ops/s, latency: 3.781 ns/op nr_loops: 500000 bpf_loop - throughput: 309.773 ± 87.713 M ops/s, latency: 3.228 ns/op nr_loops: 1000000 bpf_loop - throughput: 262.818 ± 4.143 M ops/s, latency: 3.805 ns/op >From this data, we can see that the latency per loop decreases as the number of loops increases. On this particular machine, each loop had an overhead of about ~4 ns, and we were able to run ~250 million loops per second. 
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-5-joannekoong@fb.com --- tools/testing/selftests/bpf/Makefile | 4 +- tools/testing/selftests/bpf/bench.c | 37 ++++++ tools/testing/selftests/bpf/bench.h | 2 + .../selftests/bpf/benchs/bench_bpf_loop.c | 105 ++++++++++++++++++ .../bpf/benchs/run_bench_bpf_loop.sh | 15 +++ .../selftests/bpf/benchs/run_common.sh | 15 +++ .../selftests/bpf/progs/bpf_loop_bench.c | 26 +++++ 7 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/benchs/bench_bpf_loop.c create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh create mode 100644 tools/testing/selftests/bpf/progs/bpf_loop_bench.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 35684d61aaeb..a6c0e92c86a1 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -531,6 +531,7 @@ $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ $(OUTPUT)/perfbuf_bench.skel.h $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h +$(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -540,7 +541,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_rename.o \ $(OUTPUT)/bench_trigger.o \ $(OUTPUT)/bench_ringbufs.o \ - $(OUTPUT)/bench_bloom_filter_map.o + $(OUTPUT)/bench_bloom_filter_map.o \ + $(OUTPUT)/bench_bpf_loop.o $(call msg,BINARY,,$@) $(Q)$(CC) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index c75e7ee28746..3d6082b97a56 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -134,6 +134,39 @@ void 
hits_drops_report_final(struct bench_res res[], int res_cnt) total_ops_mean, total_ops_stddev); } +void ops_report_progress(int iter, struct bench_res *res, long delta_ns) +{ + double hits_per_sec, hits_per_prod; + + hits_per_sec = res->hits / 1000000.0 / (delta_ns / 1000000000.0); + hits_per_prod = hits_per_sec / env.producer_cnt; + + printf("Iter %3d (%7.3lfus): ", iter, (delta_ns - 1000000000) / 1000.0); + + printf("hits %8.3lfM/s (%7.3lfM/prod)\n", hits_per_sec, hits_per_prod); +} + +void ops_report_final(struct bench_res res[], int res_cnt) +{ + double hits_mean = 0.0, hits_stddev = 0.0; + int i; + + for (i = 0; i < res_cnt; i++) + hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt); + + if (res_cnt > 1) { + for (i = 0; i < res_cnt; i++) + hits_stddev += (hits_mean - res[i].hits / 1000000.0) * + (hits_mean - res[i].hits / 1000000.0) / + (res_cnt - 1.0); + + hits_stddev = sqrt(hits_stddev); + } + printf("Summary: throughput %8.3lf \u00B1 %5.3lf M ops/s (%7.3lfM ops/prod), ", + hits_mean, hits_stddev, hits_mean / env.producer_cnt); + printf("latency %8.3lf ns/op\n", 1000.0 / hits_mean * env.producer_cnt); +} + const char *argp_program_version = "benchmark"; const char *argp_program_bug_address = ""; const char argp_program_doc[] = @@ -171,10 +204,12 @@ static const struct argp_option opts[] = { extern struct argp bench_ringbufs_argp; extern struct argp bench_bloom_map_argp; +extern struct argp bench_bpf_loop_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, { &bench_bloom_map_argp, 0, "Bloom filter map benchmark", 0 }, + { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 }, {}, }; @@ -373,6 +408,7 @@ extern const struct bench bench_bloom_update; extern const struct bench bench_bloom_false_positive; extern const struct bench bench_hashmap_without_bloom; extern const struct bench bench_hashmap_with_bloom; +extern const struct bench bench_bpf_loop; static const struct bench *benchs[] = { 
&bench_count_global, @@ -404,6 +440,7 @@ static const struct bench *benchs[] = { &bench_bloom_false_positive, &bench_hashmap_without_bloom, &bench_hashmap_with_bloom, + &bench_bpf_loop, }; static void setup_benchmark() diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 624c6b11501f..50785503756b 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -59,6 +59,8 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); void hits_drops_report_final(struct bench_res res[], int res_cnt); void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns); void false_hits_report_final(struct bench_res res[], int res_cnt); +void ops_report_progress(int iter, struct bench_res *res, long delta_ns); +void ops_report_final(struct bench_res res[], int res_cnt); static inline __u64 get_time_ns() { struct timespec t; diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c b/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c new file mode 100644 index 000000000000..d0a6572bfab6 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_loop.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include "bench.h" +#include "bpf_loop_bench.skel.h" + +/* BPF triggering benchmarks */ +static struct ctx { + struct bpf_loop_bench *skel; +} ctx; + +static struct { + __u32 nr_loops; +} args = { + .nr_loops = 10, +}; + +enum { + ARG_NR_LOOPS = 4000, +}; + +static const struct argp_option opts[] = { + { "nr_loops", ARG_NR_LOOPS, "nr_loops", 0, + "Set number of loops for the bpf_loop helper"}, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + switch (key) { + case ARG_NR_LOOPS: + args.nr_loops = strtol(arg, NULL, 10); + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +/* exported into benchmark runner */ +const struct argp bench_bpf_loop_argp = { + 
.options = opts, + .parser = parse_arg, +}; + +static void validate(void) +{ + if (env.consumer_cnt != 1) { + fprintf(stderr, "benchmark doesn't support multi-consumer!\n"); + exit(1); + } +} + +static void *producer(void *input) +{ + while (true) + /* trigger the bpf program */ + syscall(__NR_getpgid); + + return NULL; +} + +static void *consumer(void *input) +{ + return NULL; +} + +static void measure(struct bench_res *res) +{ + res->hits = atomic_swap(&ctx.skel->bss->hits, 0); +} + +static void setup(void) +{ + struct bpf_link *link; + + setup_libbpf(); + + ctx.skel = bpf_loop_bench__open_and_load(); + if (!ctx.skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + link = bpf_program__attach(ctx.skel->progs.benchmark); + if (!link) { + fprintf(stderr, "failed to attach program!\n"); + exit(1); + } + + ctx.skel->bss->nr_loops = args.nr_loops; +} + +const struct bench bench_bpf_loop = { + .name = "bpf-loop", + .validate = validate, + .setup = setup, + .producer_thread = producer, + .consumer_thread = consumer, + .measure = measure, + .report_progress = ops_report_progress, + .report_final = ops_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh b/tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh new file mode 100755 index 000000000000..d4f5f73b356b --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_bpf_loop.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +for t in 1 4 8 12 16; do +for i in 10 100 500 1000 5000 10000 50000 100000 500000 1000000; do +subtitle "nr_loops: $i, nr_threads: $t" + summarize_ops "bpf_loop: " \ + "$($RUN_BENCH -p $t --nr_loops $i bpf-loop)" + printf "\n" +done +done diff --git a/tools/testing/selftests/bpf/benchs/run_common.sh b/tools/testing/selftests/bpf/benchs/run_common.sh index 9a16be78b180..6c5e6023a69f 100644 --- a/tools/testing/selftests/bpf/benchs/run_common.sh +++ 
b/tools/testing/selftests/bpf/benchs/run_common.sh @@ -33,6 +33,14 @@ function percentage() echo "$*" | sed -E "s/.*Percentage\s=\s+([0-9]+\.[0-9]+).*/\1/" } +function ops() +{ + echo -n "throughput: " + echo -n "$*" | sed -E "s/.*throughput\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+\sM\sops\/s).*/\1/" + echo -n -e ", latency: " + echo "$*" | sed -E "s/.*latency\s+([0-9]+\.[0-9]+\sns\/op).*/\1/" +} + function total() { echo "$*" | sed -E "s/.*total operations\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/" @@ -52,6 +60,13 @@ function summarize_percentage() printf "%-20s %s%%\n" "$bench" "$(percentage $summary)" } +function summarize_ops() +{ + bench="$1" + summary=$(echo $2 | tail -n1) + printf "%-20s %s\n" "$bench" "$(ops $summary)" +} + function summarize_total() { bench="$1" diff --git a/tools/testing/selftests/bpf/progs/bpf_loop_bench.c b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c new file mode 100644 index 000000000000..9dafdc244462 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +u32 nr_loops; +long hits; + +static int empty_callback(__u32 index, void *data) +{ + return 0; +} + +SEC("fentry/__x64_sys_getpgid") +int benchmark(void *ctx) +{ + for (int i = 0; i < 1000; i++) { + bpf_loop(nr_loops, empty_callback, NULL, 0); + + __sync_add_and_fetch(&hits, nr_loops); + } + return 0; +} From d4efb170861827290f7f571020001a60d001faaf Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Nov 2021 05:27:31 +0530 Subject: [PATCH 054/115] bpf: Change bpf_kallsyms_lookup_name size type to ARG_CONST_SIZE_OR_ZERO Andrii mentioned in [0] that switching to ARG_CONST_SIZE_OR_ZERO lets user avoid having to prove that string size at runtime is not zero and helps with not having to suppress clang optimizations.
[0]: https://lore.kernel.org/bpf/CAEf4BzZa_vhXB3c8atNcTS6=krQvC25H7K7c3WWZhM=27ro=Wg@mail.gmail.com Suggested-by: Andrii Nakryiko Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122235733.634914-2-memxor@gmail.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a..47089d1d67a4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4804,7 +4804,7 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; From 0270090d396a8e7e7f42adae13fdfa48ffb85144 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Nov 2021 05:27:32 +0530 Subject: [PATCH 055/115] libbpf: Avoid double stores for success/failure case of ksym relocations Instead, jump directly to success case stores in case ret >= 0, else do the default 0 value store and jump over the success case. This is better in terms of readability. Readjust the code for kfunc relocation as well to follow a similar pattern, also leads to easier to follow code now. 
Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122235733.634914-3-memxor@gmail.com --- tools/lib/bpf/gen_loader.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index c7bc77f4e752..5a2d6bf041dd 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -674,27 +674,29 @@ static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo return; } kdesc->off = btf_fd_idx; - /* set a default value for imm */ + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set value for imm, off as 0 */ emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); - /* skip success case store if ret < 0 */ - emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 1)); + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 10)); /* store btf_id into insn[insn_idx].imm */ emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* obtain fd in BPF_REG_9 */ + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7)); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + /* jump to fd_array store if fd denotes module BTF */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); + /* set the default value for off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip BTF fd store for vmlinux BTF */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); /* load fd_array slot pointer */ emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); - /* skip store of BTF fd if ret < 0 */ - emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 3)); /* store BTF fd in slot */ - emit(gen, BPF_MOV64_REG(BPF_REG_9, 
BPF_REG_7)); - emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); - /* set a default value for off */ - emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); - /* skip insn->off store if ret < 0 */ - emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 2)); - /* skip if vmlinux BTF */ - emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 0, 1)); /* store index into insn[insn_idx].off */ emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx)); log: @@ -803,17 +805,20 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, emit_bpf_find_by_name_kind(gen, relo); if (!relo->is_weak) emit_check_err(gen); - /* set default values as 0 */ + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set values for insn[insn_idx].imm, insn[insn_idx + 1].imm as 0 */ emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 0)); - /* skip success case stores if ret < 0 */ - emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, 4)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); /* store btf_id into insn[insn_idx].imm */ emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); /* store btf_obj_fd into insn[insn_idx + 1].imm */ emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); + /* skip src_reg adjustment */ emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); clear_src_reg: /* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */ From d995816b77eb826e0f6d7adf4471ec191b362be0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Nov 2021 05:27:33 +0530 Subject: [PATCH 056/115] libbpf: Avoid reload of imm for weak, 
unresolved, repeating ksym Alexei pointed out that we can use BPF_REG_0, which already contains imm from the move_blob2blob computation. Note that we now compare the second insn's imm, but this should not matter, since both will be zeroed out for the error case for the insn populated earlier. Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122235733.634914-4-memxor@gmail.com --- tools/lib/bpf/gen_loader.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 5a2d6bf041dd..6f3790369463 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -795,9 +795,8 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, kdesc->insn + offsetof(struct bpf_insn, imm)); move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); - emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_8, offsetof(struct bpf_insn, imm))); - /* jump over src_reg adjustment if imm is not 0 */ - emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 3)); + /* jump over src_reg adjustment if imm is not 0, reuse BPF_REG_0 from move_blob2blob */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3)); goto clear_src_reg; } /* remember insn offset, so we can copy BTF ID and FD later */ From 436d404cc8ff573a417cb3b6a5c76655121aceac Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Dec 2021 15:34:57 +0800 Subject: [PATCH 057/115] bpf: Clean-up bpf_verifier_vlog() for BPF_LOG_KERNEL log level An extra newline will be output for bpf_log() with BPF_LOG_KERNEL level as shown below: [ 52.095704] BPF:The function test_3 has 12 arguments. Too many.
[ 52.095704] [ 52.096896] Error in parsing func ptr test_3 in struct bpf_dummy_ops Now all bpf_log() calls end with a newline, but not all btf_verifier_log() calls do, so check whether the log message has a trailing newline and add one if not. Also, there is no need to calculate the remaining userspace buffer size for kernel log output or to truncate the output with '\0', which has already been done by vscnprintf(), so only do these for userspace log output. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211201073458.2731595-2-houtao1@huawei.com --- kernel/bpf/verifier.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d7678d8a925c..6c9c0d9a04a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -293,13 +293,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (log->level == BPF_LOG_KERNEL) { - pr_err("BPF:%s\n", log->kbuf); + bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + + pr_err("BPF: %s%s", log->kbuf, newline ? 
"" : "\n"); return; } + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) log->len_used += n; else From 64b5b97b8cfff64409fcc234ae3151bc8de0c4d6 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 1 Dec 2021 17:49:31 +0100 Subject: [PATCH 058/115] samples: bpf: Fix conflicting types in fds_example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following samples/bpf build error appeared after the introduction of bpf_map_create() in libbpf: CC samples/bpf/fds_example.o samples/bpf/fds_example.c:49:12: error: static declaration of 'bpf_map_create' follows non-static declaration static int bpf_map_create(void) ^ samples/bpf/libbpf/include/bpf/bpf.h:55:16: note: previous declaration is here LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, ^ samples/bpf/fds_example.c:82:23: error: too few arguments to function call, expected 6, have 0 fd = bpf_map_create(); ~~~~~~~~~~~~~~ ^ samples/bpf/libbpf/include/bpf/bpf.h:55:16: note: 'bpf_map_create' declared here LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, ^ 2 errors generated. fds_example by accident has a static function with the same name. It's not worth it to separate a single call into its own function, so just embed it. 
Fixes: 992c4225419a ("libbpf: Unify low-level map creation APIs w/ new bpf_map_create()") Signed-off-by: Alexander Lobakin Signed-off-by: Andrii Nakryiko Reviewed-by: Maciej Fijalkowski Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20211201164931.47357-1-alexandr.lobakin@intel.com --- samples/bpf/fds_example.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index 59f45fef5110..9a7c1fd7a4a8 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -46,12 +46,6 @@ static void usage(void) printf(" -h Display this help.\n"); } -static int bpf_map_create(void) -{ - return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), - sizeof(uint32_t), 1024, 0); -} - static int bpf_prog_create(const char *object) { static struct bpf_insn insns[] = { @@ -79,7 +73,8 @@ static int bpf_do_map(const char *file, uint32_t flags, uint32_t key, int fd, ret; if (flags & BPF_F_PIN) { - fd = bpf_map_create(); + fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), + sizeof(uint32_t), 1024, 0); printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); assert(fd > 0); From 74753e1462e77349525daf9eb60ea21ed92d3a97 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:24 -0800 Subject: [PATCH 059/115] libbpf: Replace btf__type_by_id() with btf_type_by_id(). To prepare relo_core.c to be compiled in both the kernel and user space, replace btf__type_by_id with btf_type_by_id. In libbpf, btf__type_by_id and btf_type_by_id behave differently. bpf_core_apply_relo_insn() needs the behavior of the uapi btf__type_by_id rather than the internal btf_type_by_id, but the type_id range check is already done in bpf_core_apply_relo(), so it's safe to replace it everywhere. The kernel btf_type_by_id() does the check anyway, so it doesn't hurt.
Suggested-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-2-alexei.starovoitov@gmail.com --- tools/lib/bpf/btf.c | 2 +- tools/lib/bpf/libbpf_internal.h | 2 +- tools/lib/bpf/relo_core.c | 19 ++++++++----------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 8024fe355ca8..0d7b16eab569 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -454,7 +454,7 @@ const struct btf *btf__base_btf(const struct btf *btf) } /* internal helper returning non-const pointer to a type */ -struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id) { if (type_id == 0) return &btf_void; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 311905d8ca70..6f143e9e810c 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -172,7 +172,7 @@ static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) struct btf; struct btf_type; -struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id); const char *btf_kind_str(const struct btf_type *t); const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index b5b8956a1be8..c0904f4cb514 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -51,7 +51,7 @@ static bool is_flex_arr(const struct btf *btf, return false; /* has to be the last member of enclosing struct */ - t = btf__type_by_id(btf, acc->type_id); + t = btf_type_by_id(btf, acc->type_id); return acc->idx == btf_vlen(t) - 1; } @@ -388,7 +388,7 @@ static int bpf_core_match_member(const struct btf *local_btf, return 0; local_id = local_acc->type_id; - 
local_type = btf__type_by_id(local_btf, local_id); + local_type = btf_type_by_id(local_btf, local_id); local_member = btf_members(local_type) + local_acc->idx; local_name = btf__name_by_offset(local_btf, local_member->name_off); @@ -580,7 +580,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, return -EUCLEAN; /* request instruction poisoning */ acc = &spec->spec[spec->len - 1]; - t = btf__type_by_id(spec->btf, acc->type_id); + t = btf_type_by_id(spec->btf, acc->type_id); /* a[n] accessor needs special handling */ if (!acc->name) { @@ -729,7 +729,7 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, case BPF_ENUMVAL_VALUE: if (!spec) return -EUCLEAN; /* request instruction poisoning */ - t = btf__type_by_id(spec->btf, spec->spec[0].type_id); + t = btf_type_by_id(spec->btf, spec->spec[0].type_id); e = btf_enum(t) + spec->spec[0].idx; *val = e->val; break; @@ -805,8 +805,8 @@ static int bpf_core_calc_relo(const char *prog_name, if (res->orig_sz != res->new_sz) { const struct btf_type *orig_t, *new_t; - orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id); - new_t = btf__type_by_id(targ_spec->btf, res->new_type_id); + orig_t = btf_type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf_type_by_id(targ_spec->btf, res->new_type_id); /* There are two use cases in which it's safe to * adjust load/store's mem size: @@ -1054,7 +1054,7 @@ static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) int i; type_id = spec->root_type_id; - t = btf__type_by_id(spec->btf, type_id); + t = btf_type_by_id(spec->btf, type_id); s = btf__name_by_offset(spec->btf, t->name_off); libbpf_print(level, "[%u] %s %s", type_id, btf_kind_str(t), str_is_empty(s) ? 
"" : s); @@ -1158,10 +1158,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, int i, j, err; local_id = relo->type_id; - local_type = btf__type_by_id(local_btf, local_id); - if (!local_type) - return -EINVAL; - + local_type = btf_type_by_id(local_btf, local_id); local_name = btf__name_by_offset(local_btf, local_type->name_off); if (!local_name) return -EINVAL; From 8293eb995f349aed28006792cad4cb48091919dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:25 -0800 Subject: [PATCH 060/115] bpf: Rename btf_member accessors. Rename btf_member_bit_offset() and btf_member_bitfield_size() to avoid conflicts with similarly named helpers in libbpf's btf.h. Rename the kernel helpers, since libbpf helpers are part of uapi. Suggested-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-3-alexei.starovoitov@gmail.com --- include/linux/btf.h | 8 ++++---- kernel/bpf/bpf_struct_ops.c | 6 +++--- kernel/bpf/btf.c | 18 +++++++++--------- net/ipv4/bpf_tcp_ca.c | 6 +++--- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..956f70388f69 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -194,15 +194,15 @@ static inline bool btf_type_kflag(const struct btf_type *t) return BTF_INFO_KFLAG(t->info); } -static inline u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? 
BTF_MEMBER_BIT_OFFSET(member->offset) : member->offset; } -static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) : 0; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ecfe4752769..21069dbe9138 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -165,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) break; } - if (btf_member_bitfield_size(t, member)) { + if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); break; @@ -296,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) const struct btf_type *mtype; for_each_member(i, t, member) { - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (moff > prev_mend && memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; @@ -387,7 +387,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_prog *prog; u32 moff; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b9d23be1e99..f4119a99da7b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2969,7 +2969,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - offset = btf_member_bit_offset(t, member); + offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); @@ -3094,7 +3094,7 @@ static int 
btf_find_struct_field(const struct btf *btf, const struct btf_type *t if (off != -ENOENT) /* only one such field is allowed */ return -E2BIG; - off = btf_member_bit_offset(t, member); + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; @@ -3184,8 +3184,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, btf_show_start_member(show, member); - member_offset = btf_member_bit_offset(t, member); - bitfield_size = btf_member_bitfield_size(t, member); + member_offset = __btf_member_bit_offset(t, member); + bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { @@ -5060,7 +5060,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, if (array_elem->nelems != 0) goto error; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; @@ -5083,14 +5083,14 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, for_each_member(i, t, member) { /* offset of the field in bytes */ - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; - if (btf_member_bitfield_size(t, member)) { - u32 end_bit = btf_member_bit_offset(t, member) + - btf_member_bitfield_size(t, member); + if (__btf_member_bitfield_size(t, member)) { + u32 end_bit = __btf_member_bit_offset(t, member) + + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 2cf02b4d77fb..67466dbff152 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -169,7 +169,7 @@ static u32 prog_ops_moff(const struct bpf_prog 
*prog) t = bpf_tcp_congestion_ops.type; m = &btf_type_member(t)[midx]; - return btf_member_bit_offset(t, m) / 8; + return __btf_member_bit_offset(t, m) / 8; } static const struct bpf_func_proto * @@ -244,7 +244,7 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, utcp_ca = (const struct tcp_congestion_ops *)udata; tcp_ca = (struct tcp_congestion_ops *)kdata; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; switch (moff) { case offsetof(struct tcp_congestion_ops, flags): if (utcp_ca->flags & ~TCP_CONG_MASK) @@ -274,7 +274,7 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, static int bpf_tcp_ca_check_member(const struct btf_type *t, const struct btf_member *member) { - if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) return -ENOTSUPP; return 0; } From 29db4bea1d10b73749d7992c1fc9ac13499e8871 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:26 -0800 Subject: [PATCH 061/115] bpf: Prepare relo_core.c for kernel duty. Make relo_core.c to be compiled for the kernel and for user space libbpf. Note the patch is reducing BPF_CORE_SPEC_MAX_LEN from 64 to 32. This is the maximum number of nested structs and arrays. For example: struct sample { int a; struct { int b[10]; }; }; struct sample *s = ...; int *y = &s->b[5]; This field access is encoded as "0:1:0:5" and spec len is 4. The follow up patch might bump it back to 64. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-4-alexei.starovoitov@gmail.com --- include/linux/btf.h | 81 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/Makefile | 4 ++ kernel/bpf/btf.c | 26 +++++++++++++ tools/lib/bpf/relo_core.c | 76 ++++++++++++++++++++++++++++++------ 4 files changed, 176 insertions(+), 11 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 956f70388f69..acef6ef28768 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -144,6 +144,53 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(u32 *)(t + 1)); +} + +static inline u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(u32 *)(t + 1)); +} + static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); @@ -184,6 +231,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static inline u16 btf_vlen(const struct btf_type *t) +{ + return btf_type_vlen(t); +} + 
static inline u16 btf_func_linkage(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -208,11 +260,40 @@ static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, : 0; } +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bit_offset(t, m); +} + +static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bitfield_size(t, m); +} + static inline const struct btf_member *btf_type_member(const struct btf_type *t) { return (const struct btf_member *)(t + 1); } +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index cf6ca339f3cd..c1a9be6a4b9f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ + +obj-$(CONFIG_BPF_SYSCALL) += relo_core.o +$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE + $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f4119a99da7b..c79595aad55b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6413,3 +6413,29 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 
local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return -EOPNOTSUPP; +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index c0904f4cb514..56dbe6d16664 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -1,6 +1,60 @@ // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2019 Facebook */ +#ifdef __KERNEL__ +#include +#include +#include +#include +#include "relo_core.h" + +static const char *btf_kind_str(const struct btf_type *t) +{ + return btf_type_str(t); +} + +static bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id) +{ + return btf_type_skip_modifiers(btf, id, res_id); +} + +static const char *btf__name_by_offset(const struct btf *btf, u32 offset) +{ + return btf_name_by_offset(btf, offset); +} + +static s64 btf__resolve_size(const struct btf *btf, u32 type_id) +{ + const struct btf_type *t; + int size; + + t = btf_type_by_id(btf, type_id); + t = btf_resolve_size(btf, t, &size); + if (IS_ERR(t)) + return PTR_ERR(t); + return size; +} + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +#undef pr_warn +#undef pr_info +#undef pr_debug +#define pr_warn(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_info(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_debug(fmt, log, ...) 
bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define libbpf_print(level, fmt, ...) bpf_log((void *)prog_name, fmt, ##__VA_ARGS__) +#else #include #include #include @@ -12,8 +66,9 @@ #include "btf.h" #include "str_error.h" #include "libbpf_internal.h" +#endif -#define BPF_CORE_SPEC_MAX_LEN 64 +#define BPF_CORE_SPEC_MAX_LEN 32 /* represents BPF CO-RE field or array element accessor */ struct bpf_core_accessor { @@ -150,7 +205,7 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access * string to specify enumerator's value index that need to be relocated. */ -static int bpf_core_parse_spec(const struct btf *btf, +static int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, __u32 type_id, const char *spec_str, enum bpf_core_relo_kind relo_kind, @@ -272,8 +327,8 @@ static int bpf_core_parse_spec(const struct btf *btf, return sz; spec->bit_offset += access_idx * sz * 8; } else { - pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", - type_id, spec_str, i, id, btf_kind_str(t)); + pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + prog_name, type_id, spec_str, i, id, btf_kind_str(t)); return -EINVAL; } } @@ -346,8 +401,6 @@ static int bpf_core_fields_are_compat(const struct btf *local_btf, targ_id = btf_array(targ_type)->type; goto recur; default: - pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n", - btf_kind(local_type), local_id, targ_id); return 0; } } @@ -1045,7 +1098,7 @@ static int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, * [] () + => @, * where is a C-syntax view of recorded field access, e.g.: x.a[3].b */ -static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) +static void bpf_core_dump_spec(const char *prog_name, int level, const struct bpf_core_spec *spec) { const struct btf_type *t; const struct btf_enum *e; @@ -1167,7 
+1220,8 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, if (str_is_empty(spec_str)) return -EINVAL; - err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); + err = bpf_core_parse_spec(prog_name, local_btf, local_id, spec_str, + relo->kind, &local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", prog_name, relo_idx, local_id, btf_kind_str(local_type), @@ -1178,7 +1232,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); - bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ @@ -1204,14 +1258,14 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", prog_name, relo_idx, i); - bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_WARN, &cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, relo_idx, err == 0 ? "non-matching" : "matching", i); - bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); if (err == 0) From 46334a0cd21bed70d6f1ddef1464f75a0ebe1774 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:27 -0800 Subject: [PATCH 062/115] bpf: Define enum bpf_core_relo_kind as uapi. enum bpf_core_relo_kind is generated by llvm and processed by libbpf. It's a de-facto uapi. With CO-RE in the kernel the bpf_core_relo_kind values become uapi de-jure. Also rename them with BPF_CORE_ prefix to distinguish from conflicting names in bpf_core_read.h. 
The enums bpf_field_info_kind, bpf_type_id_kind, bpf_type_info_kind, bpf_enum_value_kind are passing different values from bpf program into llvm. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-5-alexei.starovoitov@gmail.com --- include/uapi/linux/bpf.h | 19 ++++++++ tools/include/uapi/linux/bpf.h | 19 ++++++++ tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/relo_core.c | 84 +++++++++++++++++----------------- tools/lib/bpf/relo_core.h | 18 +------- 5 files changed, 82 insertions(+), 60 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 211b43afd0fb..9e66b1880020 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6374,4 +6374,23 @@ enum { BTF_F_ZERO = (1ULL << 3), }; +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. 
+ */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 211b43afd0fb..9e66b1880020 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6374,4 +6374,23 @@ enum { BTF_F_ZERO = (1ULL << 3), }; +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. 
+ */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 5a2f5a6ae2f9..9eaf2d9820e6 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5523,7 +5523,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -ENOTSUP; } - if (relo->kind != BPF_TYPE_ID_LOCAL && + if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && !hashmap__find(cand_cache, type_key, (void **)&cands)) { cands = bpf_core_find_cands(prog->obj, local_btf, local_id); if (IS_ERR(cands)) { diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index 56dbe6d16664..d194fb9306ed 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -113,18 +113,18 @@ static bool is_flex_arr(const struct btf *btf, static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) { switch (kind) { - case BPF_FIELD_BYTE_OFFSET: return "byte_off"; - case BPF_FIELD_BYTE_SIZE: return "byte_sz"; - case BPF_FIELD_EXISTS: return "field_exists"; - case BPF_FIELD_SIGNED: return "signed"; - case BPF_FIELD_LSHIFT_U64: return "lshift_u64"; - case BPF_FIELD_RSHIFT_U64: return "rshift_u64"; - case BPF_TYPE_ID_LOCAL: return 
"local_type_id"; - case BPF_TYPE_ID_TARGET: return "target_type_id"; - case BPF_TYPE_EXISTS: return "type_exists"; - case BPF_TYPE_SIZE: return "type_size"; - case BPF_ENUMVAL_EXISTS: return "enumval_exists"; - case BPF_ENUMVAL_VALUE: return "enumval_value"; + case BPF_CORE_FIELD_BYTE_OFFSET: return "byte_off"; + case BPF_CORE_FIELD_BYTE_SIZE: return "byte_sz"; + case BPF_CORE_FIELD_EXISTS: return "field_exists"; + case BPF_CORE_FIELD_SIGNED: return "signed"; + case BPF_CORE_FIELD_LSHIFT_U64: return "lshift_u64"; + case BPF_CORE_FIELD_RSHIFT_U64: return "rshift_u64"; + case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; + case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; + case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_SIZE: return "type_size"; + case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; + case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; default: return "unknown"; } } @@ -132,12 +132,12 @@ static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) { switch (kind) { - case BPF_FIELD_BYTE_OFFSET: - case BPF_FIELD_BYTE_SIZE: - case BPF_FIELD_EXISTS: - case BPF_FIELD_SIGNED: - case BPF_FIELD_LSHIFT_U64: - case BPF_FIELD_RSHIFT_U64: + case BPF_CORE_FIELD_BYTE_OFFSET: + case BPF_CORE_FIELD_BYTE_SIZE: + case BPF_CORE_FIELD_EXISTS: + case BPF_CORE_FIELD_SIGNED: + case BPF_CORE_FIELD_LSHIFT_U64: + case BPF_CORE_FIELD_RSHIFT_U64: return true; default: return false; @@ -147,10 +147,10 @@ static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) { switch (kind) { - case BPF_TYPE_ID_LOCAL: - case BPF_TYPE_ID_TARGET: - case BPF_TYPE_EXISTS: - case BPF_TYPE_SIZE: + case BPF_CORE_TYPE_ID_LOCAL: + case BPF_CORE_TYPE_ID_TARGET: + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_SIZE: return true; default: return false; @@ -160,8 +160,8 @@ static bool core_relo_is_type_based(enum 
bpf_core_relo_kind kind) static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) { switch (kind) { - case BPF_ENUMVAL_EXISTS: - case BPF_ENUMVAL_VALUE: + case BPF_CORE_ENUMVAL_EXISTS: + case BPF_CORE_ENUMVAL_VALUE: return true; default: return false; @@ -624,7 +624,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, *field_sz = 0; - if (relo->kind == BPF_FIELD_EXISTS) { + if (relo->kind == BPF_CORE_FIELD_EXISTS) { *val = spec ? 1 : 0; return 0; } @@ -637,7 +637,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, /* a[n] accessor needs special handling */ if (!acc->name) { - if (relo->kind == BPF_FIELD_BYTE_OFFSET) { + if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) { *val = spec->bit_offset / 8; /* remember field size for load/store mem size */ sz = btf__resolve_size(spec->btf, acc->type_id); @@ -645,7 +645,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, return -EINVAL; *field_sz = sz; *type_id = acc->type_id; - } else if (relo->kind == BPF_FIELD_BYTE_SIZE) { + } else if (relo->kind == BPF_CORE_FIELD_BYTE_SIZE) { sz = btf__resolve_size(spec->btf, acc->type_id); if (sz < 0) return -EINVAL; @@ -697,36 +697,36 @@ static int bpf_core_calc_field_relo(const char *prog_name, *validate = !bitfield; switch (relo->kind) { - case BPF_FIELD_BYTE_OFFSET: + case BPF_CORE_FIELD_BYTE_OFFSET: *val = byte_off; if (!bitfield) { *field_sz = byte_sz; *type_id = field_type_id; } break; - case BPF_FIELD_BYTE_SIZE: + case BPF_CORE_FIELD_BYTE_SIZE: *val = byte_sz; break; - case BPF_FIELD_SIGNED: + case BPF_CORE_FIELD_SIGNED: /* enums will be assumed unsigned */ *val = btf_is_enum(mt) || (btf_int_encoding(mt) & BTF_INT_SIGNED); if (validate) *validate = true; /* signedness is never ambiguous */ break; - case BPF_FIELD_LSHIFT_U64: + case BPF_CORE_FIELD_LSHIFT_U64: #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ *val = 64 - (bit_off + bit_sz - byte_off * 8); #else *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); #endif break; - case 
BPF_FIELD_RSHIFT_U64: + case BPF_CORE_FIELD_RSHIFT_U64: *val = 64 - bit_sz; if (validate) *validate = true; /* right shift is never ambiguous */ break; - case BPF_FIELD_EXISTS: + case BPF_CORE_FIELD_EXISTS: default: return -EOPNOTSUPP; } @@ -747,20 +747,20 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, } switch (relo->kind) { - case BPF_TYPE_ID_TARGET: + case BPF_CORE_TYPE_ID_TARGET: *val = spec->root_type_id; break; - case BPF_TYPE_EXISTS: + case BPF_CORE_TYPE_EXISTS: *val = 1; break; - case BPF_TYPE_SIZE: + case BPF_CORE_TYPE_SIZE: sz = btf__resolve_size(spec->btf, spec->root_type_id); if (sz < 0) return -EINVAL; *val = sz; break; - case BPF_TYPE_ID_LOCAL: - /* BPF_TYPE_ID_LOCAL is handled specially and shouldn't get here */ + case BPF_CORE_TYPE_ID_LOCAL: + /* BPF_CORE_TYPE_ID_LOCAL is handled specially and shouldn't get here */ default: return -EOPNOTSUPP; } @@ -776,10 +776,10 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, const struct btf_enum *e; switch (relo->kind) { - case BPF_ENUMVAL_EXISTS: + case BPF_CORE_ENUMVAL_EXISTS: *val = spec ? 
1 : 0; break; - case BPF_ENUMVAL_VALUE: + case BPF_CORE_ENUMVAL_VALUE: if (!spec) return -EUCLEAN; /* request instruction poisoning */ t = btf_type_by_id(spec->btf, spec->spec[0].type_id); @@ -1236,7 +1236,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, libbpf_print(LIBBPF_DEBUG, "\n"); /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ - if (relo->kind == BPF_TYPE_ID_LOCAL) { + if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { targ_res.validate = true; targ_res.poison = false; targ_res.orig_val = local_spec.root_type_id; @@ -1302,7 +1302,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, } /* - * For BPF_FIELD_EXISTS relo or when used BPF program has field + * For BPF_CORE_FIELD_EXISTS relo or when used BPF program has field * existence checks or kernel version/config checks, it's expected * that we might not find any candidates. In this case, if field * wasn't found in any candidate, the list of candidates shouldn't diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 3b9f8f18346c..3d0b86e7f439 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -4,23 +4,7 @@ #ifndef __RELO_CORE_H #define __RELO_CORE_H -/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value - * has to be adjusted by relocations. 
- */ -enum bpf_core_relo_kind { - BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ - BPF_FIELD_BYTE_SIZE = 1, /* field size in bytes */ - BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ - BPF_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ - BPF_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ - BPF_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ - BPF_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ - BPF_TYPE_ID_TARGET = 7, /* type ID in target kernel */ - BPF_TYPE_EXISTS = 8, /* type existence in target kernel */ - BPF_TYPE_SIZE = 9, /* type size in bytes */ - BPF_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ - BPF_ENUMVAL_VALUE = 11, /* enum value integer value */ -}; +#include /* The minimum bpf_core_relo checked by the loader * From fbd94c7afcf99c9f3b1ba1168657ecc428eb2c8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:28 -0800 Subject: [PATCH 063/115] bpf: Pass a set of bpf_core_relo-s to prog_load command. struct bpf_core_relo is generated by llvm and processed by libbpf. It's a de-facto uapi. With CO-RE in the kernel the struct bpf_core_relo becomes uapi de-jure. Add an ability to pass a set of 'struct bpf_core_relo' to prog_load command and let the kernel perform CO-RE relocations. Note the struct bpf_line_info and struct bpf_func_info have the same layout when passed from LLVM to libbpf and from libbpf to the kernel except "insn_off" fields means "byte offset" when LLVM generates it. Then libbpf converts it to "insn index" to pass to the kernel. The struct bpf_core_relo's "insn_off" field is always "byte offset". 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-6-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 ++++ include/uapi/linux/bpf.h | 59 +++++++++++++++++++++++++- kernel/bpf/btf.c | 6 +++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 59 +++++++++++++++++++++++++- tools/lib/bpf/relo_core.h | 53 ------------------------ 7 files changed, 207 insertions(+), 56 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cad0829710be..8bbf08fbab66 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1732,6 +1732,14 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +struct bpf_core_ctx { + struct bpf_verifier_log *log; + const struct btf *btf; +}; + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9e66b1880020..c26871263f1f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1342,8 +1342,10 @@ union bpf_attr { /* or valid module BTF object fd or 0 to attach to vmlinux */ __u32 attach_btf_obj_fd; }; - __u32 :32; /* pad */ + __u32 core_relo_cnt; /* number of bpf_core_relo */ __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6393,4 +6395,59 @@ enum bpf_core_relo_kind { BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ }; +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. 
+ * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. 
+ * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c79595aad55b..0d070461e2b8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6439,3 +6439,9 @@ size_t bpf_core_essential_name_len(const char *name) } return n; } + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn) +{ + return -EOPNOTSUPP; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 47089d1d67a4..b3ada4085f85 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2184,7 +2184,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array +#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c9c0d9a04a0..6522ffdea487 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10273,6 +10273,78 @@ static int check_btf_line(struct bpf_verifier_env *env, return err; } +#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) +#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_core_relo(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 i, nr_core_relo, ncopy, expected_size, rec_size; + struct bpf_core_relo core_relo = {}; + struct bpf_prog *prog = env->prog; + const struct btf *btf = prog->aux->btf; + struct bpf_core_ctx ctx = { + .log = &env->log, + .btf = btf, + }; + bpfptr_t u_core_relo; + int err; + + nr_core_relo = attr->core_relo_cnt; + if (!nr_core_relo) + return 0; + if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) + return -EINVAL; + + rec_size = attr->core_relo_rec_size; + if 
(rec_size < MIN_CORE_RELO_SIZE || + rec_size > MAX_CORE_RELO_SIZE || + rec_size % sizeof(u32)) + return -EINVAL; + + u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); + expected_size = sizeof(struct bpf_core_relo); + ncopy = min_t(u32, expected_size, rec_size); + + /* Unlike func_info and line_info, copy and apply each CO-RE + * relocation record one at a time. + */ + for (i = 0; i < nr_core_relo; i++) { + /* future proofing when sizeof(bpf_core_relo) changes */ + err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in core_relo"); + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, core_relo_rec_size), + &expected_size, sizeof(expected_size))) + err = -EFAULT; + } + break; + } + + if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { + err = -EFAULT; + break; + } + + if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { + verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", + i, core_relo.insn_off, prog->len); + err = -EINVAL; + break; + } + + err = bpf_core_apply(&ctx, &core_relo, i, + &prog->insnsi[core_relo.insn_off / 8]); + if (err) + break; + bpfptr_add(&u_core_relo, rec_size); + } + return err; +} + static int check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) @@ -10303,6 +10375,10 @@ static int check_btf_info(struct bpf_verifier_env *env, if (err) return err; + err = check_core_relo(env, attr, uattr); + if (err) + return err; + return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9e66b1880020..c26871263f1f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1342,8 +1342,10 @@ union bpf_attr { /* or valid module BTF object fd or 0 to attach to vmlinux */ __u32 attach_btf_obj_fd; }; - __u32 :32; /* pad */ + __u32 core_relo_cnt; /* number of bpf_core_relo */ __aligned_u64 fd_array; /* array of FDs */ + 
__aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6393,4 +6395,59 @@ enum bpf_core_relo_kind { BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ }; +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. 
+ * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 3d0b86e7f439..f410691cc4e5 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -6,59 +6,6 @@ #include -/* The minimum bpf_core_relo checked by the loader - * - * CO-RE relocation captures the following data: - * - insn_off - instruction offset (in bytes) within a BPF program that needs - * its insn->imm field to be relocated with actual field info; - * - type_id - BTF type ID of the "root" (containing) entity of a relocatable - * type or field; - * - access_str_off - offset into corresponding .BTF string section. String - * interpretation depends on specific relocation kind: - * - for field-based relocations, string encodes an accessed field using - * a sequence of field and array indices, separated by colon (:). It's - * conceptually very close to LLVM's getelementptr ([0]) instruction's - * arguments for identifying offset to a field. - * - for type-based relocations, strings is expected to be just "0"; - * - for enum value-based relocations, string contains an index of enum - * value within its enum type; - * - * Example to provide a better feel. 
- * - * struct sample { - * int a; - * struct { - * int b[10]; - * }; - * }; - * - * struct sample *s = ...; - * int x = &s->a; // encoded as "0:0" (a is field #0) - * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, - * // b is field #0 inside anon struct, accessing elem #5) - * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) - * - * type_id for all relocs in this example will capture BTF type id of - * `struct sample`. - * - * Such relocation is emitted when using __builtin_preserve_access_index() - * Clang built-in, passing expression that captures field address, e.g.: - * - * bpf_probe_read(&dst, sizeof(dst), - * __builtin_preserve_access_index(&src->a.b.c)); - * - * In this case Clang will emit field relocation recording necessary data to - * be able to find offset of embedded `a.b.c` field within `src` struct. - * - * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction - */ -struct bpf_core_relo { - __u32 insn_off; - __u32 type_id; - __u32 access_str_off; - enum bpf_core_relo_kind kind; -}; - struct bpf_core_cand { const struct btf *btf; const struct btf_type *t; From c5a2d43e998a821701029f23e25b62f9188e93ff Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:29 -0800 Subject: [PATCH 064/115] bpf: Adjust BTF log size limit. Make BTF log size limit to be the same as the verifier log size limit. Otherwise tools that progressively increase log size and use the same log for BTF loading and program loading will be hitting hard to debug EINVAL. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-7-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0d070461e2b8..dbf1f389b1d3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4472,7 +4472,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, log->len_total = log_size; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || !log->level || !log->ubuf) { err = -EINVAL; goto errout; From 03d5b99138dd8c7bfb838396acb180bd515ebf06 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 10:10:30 -0800 Subject: [PATCH 065/115] libbpf: Cleanup struct bpf_core_cand. Remove two redundant fields from struct bpf_core_cand. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-8-alexei.starovoitov@gmail.com --- tools/lib/bpf/libbpf.c | 30 +++++++++++++++++------------- tools/lib/bpf/relo_core.h | 2 -- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9eaf2d9820e6..96792d6e6fc1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5179,15 +5179,18 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand, struct bpf_core_cand_list *cands) { struct bpf_core_cand *new_cands, *cand; - const struct btf_type *t; - const char *targ_name; + const struct btf_type *t, *local_t; + const char *targ_name, *local_name; size_t targ_essent_len; int n, i; + local_t = btf__type_by_id(local_cand->btf, local_cand->id); + local_name = btf__str_by_offset(local_cand->btf, local_t->name_off); + n = btf__type_cnt(targ_btf); for (i = targ_start_id; i < n; i++) { t = btf__type_by_id(targ_btf, i); - 
if (btf_kind(t) != btf_kind(local_cand->t)) + if (btf_kind(t) != btf_kind(local_t)) continue; targ_name = btf__name_by_offset(targ_btf, t->name_off); @@ -5198,12 +5201,12 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand, if (targ_essent_len != local_essent_len) continue; - if (strncmp(local_cand->name, targ_name, local_essent_len) != 0) + if (strncmp(local_name, targ_name, local_essent_len) != 0) continue; pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", - local_cand->id, btf_kind_str(local_cand->t), - local_cand->name, i, btf_kind_str(t), targ_name, + local_cand->id, btf_kind_str(local_t), + local_name, i, btf_kind_str(t), targ_name, targ_btf_name); new_cands = libbpf_reallocarray(cands->cands, cands->len + 1, sizeof(*cands->cands)); @@ -5212,8 +5215,6 @@ static int bpf_core_add_cands(struct bpf_core_cand *local_cand, cand = &new_cands[cands->len]; cand->btf = targ_btf; - cand->t = t; - cand->name = targ_name; cand->id = i; cands->cands = new_cands; @@ -5320,18 +5321,21 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 l struct bpf_core_cand local_cand = {}; struct bpf_core_cand_list *cands; const struct btf *main_btf; + const struct btf_type *local_t; + const char *local_name; size_t local_essent_len; int err, i; local_cand.btf = local_btf; - local_cand.t = btf__type_by_id(local_btf, local_type_id); - if (!local_cand.t) + local_cand.id = local_type_id; + local_t = btf__type_by_id(local_btf, local_type_id); + if (!local_t) return ERR_PTR(-EINVAL); - local_cand.name = btf__name_by_offset(local_btf, local_cand.t->name_off); - if (str_is_empty(local_cand.name)) + local_name = btf__name_by_offset(local_btf, local_t->name_off); + if (str_is_empty(local_name)) return ERR_PTR(-EINVAL); - local_essent_len = bpf_core_essential_name_len(local_cand.name); + local_essent_len = bpf_core_essential_name_len(local_name); cands = calloc(1, sizeof(*cands)); if (!cands) diff --git 
a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index f410691cc4e5..4f864b8e33b7 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -8,8 +8,6 @@ struct bpf_core_cand { const struct btf *btf; - const struct btf_type *t; - const char *name; __u32 id; }; From 1e89106da25390826608ad6ac0edfb7c9952eff3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:31 -0800 Subject: [PATCH 066/115] bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn(). Given BPF program's BTF root type name perform the following steps: . search in vmlinux candidate cache. . if (present in cache and candidate list >= 1) return candidate list. . do a linear search through kernel BTFs for possible candidates. . regardless of number of candidates found populate vmlinux cache. . if (candidate list >= 1) return candidate list. . search in module candidate cache. . if (present in cache) return candidate list (even if list is empty). . do a linear search through BTFs of all kernel modules collecting candidates from all of them. . regardless of number of candidates found populate module cache. . return candidate list. Then wire the result into bpf_core_apply_relo_insn(). When BPF program is trying to CO-RE relocate a type that doesn't exist in either vmlinux BTF or in modules BTFs these steps will perform 2 cache lookups when cache is hit. Note the cache doesn't prevent the abuse by the program that might have lots of relocations that cannot be resolved. Hence cond_resched(). CO-RE in the kernel requires CAP_BPF, since BTF loading requires it. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-9-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 345 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbf1f389b1d3..ed4258cb0832 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,7 @@ #include #include #include +#include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus @@ -6169,6 +6170,8 @@ btf_module_read(struct file *file, struct kobject *kobj, return len; } +static void purge_cand_cache(struct btf *btf); + static int btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { @@ -6203,6 +6206,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, goto out; } + purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; @@ -6245,6 +6249,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); @@ -6440,8 +6445,347 @@ size_t bpf_core_essential_name_len(const char *name) return n; } +struct bpf_cand_cache { + const char *name; + u32 name_len; + u16 kind; + u16 cnt; + struct { + const struct btf *btf; + u32 id; + } cands[]; +}; + +static void bpf_free_cands(struct bpf_cand_cache *cands) +{ + if (!cands->cnt) + /* empty candidate array was allocated on stack */ + return; + kfree(cands); +} + +static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands) +{ + kfree(cands->name); + kfree(cands); +} + +#define VMLINUX_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache 
*vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; + +#define MODULE_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; + +static DEFINE_MUTEX(cand_cache_mutex); + +static void __print_cand_cache(struct bpf_verifier_log *log, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + bpf_log(log, "[%d]%s(", i, cc->name); + for (j = 0; j < cc->cnt; j++) { + bpf_log(log, "%d", cc->cands[j].id); + if (j < cc->cnt - 1) + bpf_log(log, " "); + } + bpf_log(log, "), "); + } +} + +static void print_cand_cache(struct bpf_verifier_log *log) +{ + mutex_lock(&cand_cache_mutex); + bpf_log(log, "vmlinux_cand_cache:"); + __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + bpf_log(log, "\nmodule_cand_cache:"); + __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE); + bpf_log(log, "\n"); + mutex_unlock(&cand_cache_mutex); +} + +static u32 hash_cands(struct bpf_cand_cache *cands) +{ + return jhash(cands->name, cands->name_len, 0); +} + +static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size]; + + if (cc && cc->name_len == cands->name_len && + !strncmp(cc->name, cands->name, cands->name_len)) + return cc; + return NULL; +} + +static size_t sizeof_cands(int cnt) +{ + return offsetof(struct bpf_cand_cache, cands[cnt]); +} + +static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands; + + if (*cc) { + bpf_free_cands_from_cache(*cc); + *cc = NULL; + } + new_cands = kmalloc(sizeof_cands(cands->cnt), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + memcpy(new_cands, 
cands, sizeof_cands(cands->cnt)); + /* strdup the name, since it will stay in cache. + * the cands->name points to strings in prog's BTF and the prog can be unloaded. + */ + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + bpf_free_cands(cands); + if (!new_cands->name) { + kfree(new_cands); + return ERR_PTR(-ENOMEM); + } + *cc = new_cands; + return new_cands; +} + +static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + if (!btf) { + /* when new module is loaded purge all of module_cand_cache, + * since new module might have candidates with the name + * that matches cached cands. + */ + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + continue; + } + /* when module is unloaded purge cache entries + * that match module's btf + */ + for (j = 0; j < cc->cnt; j++) + if (cc->cands[j].btf == btf) { + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + break; + } + } + +} + +static void purge_cand_cache(struct btf *btf) +{ + mutex_lock(&cand_cache_mutex); + __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); + mutex_unlock(&cand_cache_mutex); +} + +static struct bpf_cand_cache * +bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, + int targ_start_id) +{ + struct bpf_cand_cache *new_cands; + const struct btf_type *t; + const char *targ_name; + size_t targ_essent_len; + int n, i; + + n = btf_nr_types(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf_type_by_id(targ_btf, i); + if (btf_kind(t) != cands->kind) + continue; + + targ_name = btf_name_by_offset(targ_btf, t->name_off); + if (!targ_name) + continue; + + /* the resched point is before strncmp to make sure that search + * for non-existing name will have a chance to schedule(). 
+ */ + cond_resched(); + + if (strncmp(cands->name, targ_name, cands->name_len) != 0) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != cands->name_len) + continue; + + /* most of the time there is only one candidate for a given kind+name pair */ + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + + memcpy(new_cands, cands, sizeof_cands(cands->cnt)); + bpf_free_cands(cands); + cands = new_cands; + cands->cands[cands->cnt].btf = targ_btf; + cands->cands[cands->cnt].id = i; + cands->cnt++; + } + return cands; +} + +static struct bpf_cand_cache * +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) +{ + struct bpf_cand_cache *cands, *cc, local_cand = {}; + const struct btf *local_btf = ctx->btf; + const struct btf_type *local_type; + const struct btf *main_btf; + size_t local_essent_len; + struct btf *mod_btf; + const char *name; + int id; + + main_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(main_btf)) + return (void *)main_btf; + + local_type = btf_type_by_id(local_btf, local_type_id); + if (!local_type) + return ERR_PTR(-EINVAL); + + name = btf_name_by_offset(local_btf, local_type->name_off); + if (str_is_empty(name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(name); + + cands = &local_cand; + cands->name = name; + cands->kind = btf_kind(local_type); + cands->name_len = local_essent_len; + + cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + /* cands is a pointer to stack here */ + if (cc) { + if (cc->cnt) + return cc; + goto check_modules; + } + + /* Attempt to find target candidates in vmlinux BTF first */ + cands = bpf_core_add_cands(cands, main_btf, 1); + if (IS_ERR(cands)) + return cands; + + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ + + /* populate cache even when cands->cnt == 0 */ + cc = populate_cand_cache(cands, 
vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + if (IS_ERR(cc)) + return cc; + + /* if vmlinux BTF has any candidate, don't go for module BTFs */ + if (cc->cnt) + return cc; + +check_modules: + /* cands is a pointer to stack here and cands->cnt == 0 */ + cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); + if (cc) + /* if cache has it return it even if cc->cnt == 0 */ + return cc; + + /* If candidate is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, mod_btf, id) { + if (!btf_is_module(mod_btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoid holding it for too long + */ + btf_get(mod_btf); + spin_unlock_bh(&btf_idr_lock); + cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + if (IS_ERR(cands)) { + btf_put(mod_btf); + return cands; + } + spin_lock_bh(&btf_idr_lock); + btf_put(mod_btf); + } + spin_unlock_bh(&btf_idr_lock); + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 + * or pointer to stack if cands->cnt == 0. + * Copy it into the cache even when cands->cnt == 0 and + * return the result.
+ */ + return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); +} + int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn) { - return -EOPNOTSUPP; + bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; + struct bpf_core_cand_list cands = {}; + int err; + + if (need_cands) { + struct bpf_cand_cache *cc; + int i; + + mutex_lock(&cand_cache_mutex); + cc = bpf_core_find_cands(ctx, relo->type_id); + if (IS_ERR(cc)) { + bpf_log(ctx->log, "target candidate search failed for %d\n", + relo->type_id); + err = PTR_ERR(cc); + goto out; + } + if (cc->cnt) { + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + if (!cands.cands) { + err = -ENOMEM; + goto out; + } + } + for (i = 0; i < cc->cnt; i++) { + bpf_log(ctx->log, + "CO-RE relocating %s %s: found target candidate [%d]\n", + btf_kind_str[cc->kind], cc->name, cc->cands[i].id); + cands.cands[i].btf = cc->cands[i].btf; + cands.cands[i].id = cc->cands[i].id; + } + cands.len = cc->cnt; + /* cand_cache_mutex needs to span the cache lookup and + * copy of btf pointer into bpf_core_cand_list, + * since module can be unloaded while bpf_core_apply_relo_insn + * is working with module's btf. + */ + } + + err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, + relo, relo_idx, ctx->btf, &cands); +out: + if (need_cands) { + kfree(cands.cands); + mutex_unlock(&cand_cache_mutex); + if (ctx->log->level & BPF_LOG_LEVEL2) + print_cand_cache(ctx->log); + } + return err; } From d0e928876e30b18411b80fd2445424bc00e95745 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:32 -0800 Subject: [PATCH 067/115] libbpf: Use CO-RE in the kernel in light skeleton. Without lskel the CO-RE relocations are processed by libbpf before any other work is done. Instead, when lskel is needed, remember relocation as RELO_CORE kind. 
Then when loader prog is generated for a given bpf program pass CO-RE relos of that program to gen loader via bpf_gen__record_relo_core(). The gen loader will remember them as-is and pass it later as-is into the kernel. The normal libbpf flow is to process CO-RE early before call relos happen. In case of gen_loader the core relos have to be added to other relos to be copied together when bpf static function is appended in different places to other main bpf progs. During the copy the append_subprog_relos() will adjust insn_idx for normal relos and for RELO_CORE kind too. When that is done each struct reloc_desc has good relos for specific main prog. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-10-alexei.starovoitov@gmail.com --- tools/lib/bpf/bpf_gen_internal.h | 3 + tools/lib/bpf/gen_loader.c | 41 +++++++++++- tools/lib/bpf/libbpf.c | 109 ++++++++++++++++++++++--------- 3 files changed, 120 insertions(+), 33 deletions(-) diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index ae7704deba30..9d57fa84664b 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -39,6 +39,8 @@ struct bpf_gen { int error; struct ksym_relo_desc *relos; int relo_cnt; + struct bpf_core_relo *core_relos; + int core_relo_cnt; char attach_target[128]; int attach_kind; struct ksym_desc *ksyms; @@ -64,5 +66,6 @@ void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx); void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type); void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, bool is_typeless, int kind, int insn_idx); +void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); #endif diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 6f3790369463..87d385e892ab 100644 --- a/tools/lib/bpf/gen_loader.c +++ 
b/tools/lib/bpf/gen_loader.c @@ -829,6 +829,22 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, emit_ksym_relo_log(gen, relo, kdesc->ref); } +void bpf_gen__record_relo_core(struct bpf_gen *gen, + const struct bpf_core_relo *core_relo) +{ + struct bpf_core_relo *relos; + + relos = libbpf_reallocarray(gen->core_relos, gen->core_relo_cnt + 1, sizeof(*relos)); + if (!relos) { + gen->error = -ENOMEM; + return; + } + gen->core_relos = relos; + relos += gen->core_relo_cnt; + memcpy(relos, core_relo, sizeof(*relos)); + gen->core_relo_cnt++; +} + static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insns) { int insn; @@ -861,6 +877,15 @@ static void emit_relos(struct bpf_gen *gen, int insns) emit_relo(gen, gen->relos + i, insns); } +static void cleanup_core_relo(struct bpf_gen *gen) +{ + if (!gen->core_relo_cnt) + return; + free(gen->core_relos); + gen->core_relo_cnt = 0; + gen->core_relos = NULL; +} + static void cleanup_relos(struct bpf_gen *gen, int insns) { int i, insn; @@ -888,6 +913,7 @@ static void cleanup_relos(struct bpf_gen *gen, int insns) gen->relo_cnt = 0; gen->relos = NULL; } + cleanup_core_relo(gen); } void bpf_gen__prog_load(struct bpf_gen *gen, @@ -895,12 +921,13 @@ void bpf_gen__prog_load(struct bpf_gen *gen, const char *license, struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *load_attr, int prog_idx) { - int attr_size = offsetofend(union bpf_attr, fd_array); - int prog_load_attr, license_off, insns_off, func_info, line_info; + int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos; + int attr_size = offsetofend(union bpf_attr, core_relo_rec_size); union bpf_attr attr; memset(&attr, 0, attr_size); - pr_debug("gen: prog_load: type %d insns_cnt %zd\n", prog_type, insn_cnt); + pr_debug("gen: prog_load: type %d insns_cnt %zd progi_idx %d\n", + prog_type, insn_cnt, prog_idx); /* add license string to blob of bytes */ license_off = add_data(gen, license, 
strlen(license) + 1); /* add insns to blob of bytes */ @@ -924,6 +951,11 @@ void bpf_gen__prog_load(struct bpf_gen *gen, line_info = add_data(gen, load_attr->line_info, attr.line_info_cnt * attr.line_info_rec_size); + attr.core_relo_rec_size = sizeof(struct bpf_core_relo); + attr.core_relo_cnt = gen->core_relo_cnt; + core_relos = add_data(gen, gen->core_relos, + attr.core_relo_cnt * attr.core_relo_rec_size); + memcpy(attr.prog_name, prog_name, min((unsigned)strlen(prog_name), BPF_OBJ_NAME_LEN - 1)); prog_load_attr = add_data(gen, &attr, attr_size); @@ -940,6 +972,9 @@ void bpf_gen__prog_load(struct bpf_gen *gen, /* populate union bpf_attr with a pointer to line_info */ emit_rel_store(gen, attr_field(prog_load_attr, line_info), line_info); + /* populate union bpf_attr with a pointer to core_relos */ + emit_rel_store(gen, attr_field(prog_load_attr, core_relos), core_relos); + /* populate union bpf_attr fd_array with a pointer to data where map_fds are saved */ emit_rel_store(gen, attr_field(prog_load_attr, fd_array), gen->fd_array); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 96792d6e6fc1..831c12e00813 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -230,13 +230,19 @@ enum reloc_type { RELO_EXTERN_VAR, RELO_EXTERN_FUNC, RELO_SUBPROG_ADDR, + RELO_CORE, }; struct reloc_desc { enum reloc_type type; int insn_idx; - int map_idx; - int sym_off; + union { + const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ + struct { + int map_idx; + int sym_off; + }; + }; }; struct bpf_sec_def; @@ -5485,6 +5491,24 @@ static void *u32_as_hash_key(__u32 x) return (void *)(uintptr_t)x; } +static int record_relo_core(struct bpf_program *prog, + const struct bpf_core_relo *core_relo, int insn_idx) +{ + struct reloc_desc *relos, *relo; + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + relo = &relos[prog->nr_reloc]; + relo->type = RELO_CORE; + relo->insn_idx = 
insn_idx; + relo->core_relo = core_relo; + prog->reloc_desc = relos; + prog->nr_reloc++; + return 0; +} + static int bpf_core_apply_relo(struct bpf_program *prog, const struct bpf_core_relo *relo, int relo_idx, @@ -5521,10 +5545,12 @@ static int bpf_core_apply_relo(struct bpf_program *prog, return -EINVAL; if (prog->obj->gen_loader) { - pr_warn("// TODO core_relo: prog %td insn[%d] %s kind %d\n", + const char *spec_str = btf__name_by_offset(local_btf, relo->access_str_off); + + pr_debug("record_relo_core: prog %td insn[%d] %s %s %s final insn_idx %d\n", prog - prog->obj->programs, relo->insn_off / 8, - local_name, relo->kind); - return -ENOTSUP; + btf_kind_str(local_type), local_name, spec_str, insn_idx); + return record_relo_core(prog, relo, insn_idx); } if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && @@ -5729,6 +5755,9 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) case RELO_CALL: /* handled already */ break; + case RELO_CORE: + /* will be handled by bpf_program_record_relos() */ + break; default: pr_warn("prog '%s': relo #%d: bad relo type %d\n", prog->name, i, relo->type); @@ -6169,6 +6198,35 @@ bpf_object__free_relocs(struct bpf_object *obj) } } +static int cmp_relocs(const void *_a, const void *_b) +{ + const struct reloc_desc *a = _a; + const struct reloc_desc *b = _b; + + if (a->insn_idx != b->insn_idx) + return a->insn_idx < b->insn_idx ? -1 : 1; + + /* no two relocations should have the same insn_idx, but ... */ + if (a->type != b->type) + return a->type < b->type ? 
-1 : 1; + + return 0; +} + +static void bpf_object__sort_relos(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *p = &obj->programs[i]; + + if (!p->nr_reloc) + continue; + + qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); + } +} + static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) { @@ -6183,6 +6241,8 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) err); return err; } + if (obj->gen_loader) + bpf_object__sort_relos(obj); } /* Before relocating calls pre-process relocations and mark @@ -6387,21 +6447,6 @@ static int bpf_object__collect_map_relos(struct bpf_object *obj, return 0; } -static int cmp_relocs(const void *_a, const void *_b) -{ - const struct reloc_desc *a = _a; - const struct reloc_desc *b = _b; - - if (a->insn_idx != b->insn_idx) - return a->insn_idx < b->insn_idx ? -1 : 1; - - /* no two relocations should have the same insn_idx, but ... */ - if (a->type != b->type) - return a->type < b->type ? 
-1 : 1; - - return 0; -} - static int bpf_object__collect_relos(struct bpf_object *obj) { int i, err; @@ -6434,14 +6479,7 @@ static int bpf_object__collect_relos(struct bpf_object *obj) return err; } - for (i = 0; i < obj->nr_programs; i++) { - struct bpf_program *p = &obj->programs[i]; - - if (!p->nr_reloc) - continue; - - qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); - } + bpf_object__sort_relos(obj); return 0; } @@ -6683,7 +6721,7 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog return ret; } -static int bpf_program__record_externs(struct bpf_program *prog) +static int bpf_program_record_relos(struct bpf_program *prog) { struct bpf_object *obj = prog->obj; int i; @@ -6705,6 +6743,17 @@ static int bpf_program__record_externs(struct bpf_program *prog) ext->is_weak, false, BTF_KIND_FUNC, relo->insn_idx); break; + case RELO_CORE: { + struct bpf_core_relo cr = { + .insn_off = relo->insn_idx * 8, + .type_id = relo->core_relo->type_id, + .access_str_off = relo->core_relo->access_str_off, + .kind = relo->core_relo->kind, + }; + + bpf_gen__record_relo_core(obj->gen_loader, &cr); + break; + } default: continue; } @@ -6744,7 +6793,7 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog prog->name, prog->instances.nr); } if (obj->gen_loader) - bpf_program__record_externs(prog); + bpf_program_record_relos(prog); err = bpf_object_load_prog_instance(obj, prog, prog->insns, prog->insns_cnt, license, kern_ver, &fd); From be05c94476f3cf4fdc29feab4ed1053187323296 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:33 -0800 Subject: [PATCH 068/115] libbpf: Support init of inner maps in light skeleton. Add ability to initialize inner maps in light skeleton. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-11-alexei.starovoitov@gmail.com --- tools/lib/bpf/bpf_gen_internal.h | 1 + tools/lib/bpf/gen_loader.c | 27 +++++++++++++++++++++++++++ tools/lib/bpf/libbpf.c | 6 +++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index 9d57fa84664b..9dd5f76692ef 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -67,5 +67,6 @@ void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum b void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, bool is_typeless, int kind, int insn_idx); void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx); #endif diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 87d385e892ab..ed0e949790da 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -1053,6 +1053,33 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue, emit_check_err(gen); } +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slot, + int inner_map_idx) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, key; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n", + outer_map_idx, slot, inner_map_idx); + + key = add_data(gen, &slot, sizeof(slot)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, outer_map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), + blob_fd_array_off(gen, 
inner_map_idx)); + + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "populate_outer_map outer %d key %d inner %d", + outer_map_idx, slot, inner_map_idx); + emit_check_err(gen); +} + void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx) { int attr_size = offsetofend(union bpf_attr, map_fd); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 831c12e00813..1341ce539662 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4971,9 +4971,9 @@ static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) fd = bpf_map__fd(targ_map); if (obj->gen_loader) { - pr_warn("// TODO map_update_elem: idx %td key %d value==map_idx %td\n", - map - obj->maps, i, targ_map - obj->maps); - return -ENOTSUP; + bpf_gen__populate_outer_map(obj->gen_loader, + map - obj->maps, i, + targ_map - obj->maps); } else { err = bpf_map_update_elem(map->fd, &i, &fd, 0); } From 19250f5fc0c283892a61f3abf9d65e6325f63897 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:34 -0800 Subject: [PATCH 069/115] libbpf: Clean gen_loader's attach kind. The gen_loader has to clear attach_kind otherwise the programs without attach_btf_id will fail load if they follow programs with attach_btf_id. 
Fixes: 67234743736a ("libbpf: Generate loader program out of BPF ELF file.") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-12-alexei.starovoitov@gmail.com --- tools/lib/bpf/gen_loader.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index ed0e949790da..21dfb930f1d2 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -1005,9 +1005,11 @@ void bpf_gen__prog_load(struct bpf_gen *gen, debug_ret(gen, "prog_load %s insn_cnt %d", attr.prog_name, attr.insn_cnt); /* successful or not, close btf module FDs used in extern ksyms and attach_btf_obj_fd */ cleanup_relos(gen, insns_off); - if (gen->attach_kind) + if (gen->attach_kind) { emit_sys_close_blob(gen, attr_field(prog_load_attr, attach_btf_obj_fd)); + gen->attach_kind = 0; + } emit_check_err(gen); /* remember prog_fd in the stack, if successful */ emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, From bc5f75da977b2a4d9aa6827081e6c2ddd3347328 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:35 -0800 Subject: [PATCH 070/115] selftests/bpf: Add lskel version of kfunc test. Add light skeleton version of kfunc_call_test_subprog test. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-13-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/prog_tests/kfunc_call.c | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index a6c0e92c86a1..6046f86841cd 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -327,7 +327,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c # Generate both light skeleton and libbpf skeleton for these -LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c +LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c SKEL_BLACKLIST += $$(LSKELS) test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 5c9c0176991b..7d7445ccc141 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -4,6 +4,7 @@ #include #include "kfunc_call_test.lskel.h" #include "kfunc_call_test_subprog.skel.h" +#include "kfunc_call_test_subprog.lskel.h" static void test_main(void) { @@ -49,6 +50,26 @@ static void test_subprog(void) kfunc_call_test_subprog__destroy(skel); } +static void test_subprog_lskel(void) +{ + struct kfunc_call_test_subprog_lskel *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test_subprog_lskel__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = skel->progs.kfunc_call_test1.prog_fd; + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + 
ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 10, "test1-retval"); + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); + ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res"); + + kfunc_call_test_subprog_lskel__destroy(skel); +} + void test_kfunc_call(void) { if (test__start_subtest("main")) @@ -56,4 +77,7 @@ void test_kfunc_call(void) if (test__start_subtest("subprog")) test_subprog(); + + if (test__start_subtest("subprog_lskel")) + test_subprog_lskel(); } From d82fa9b708d7d8a9c275d86c4388d24ecc63206c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:36 -0800 Subject: [PATCH 071/115] selftests/bpf: Improve inner_map test coverage. Check that hash and array inner maps are properly initialized. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-14-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/progs/map_ptr_kern.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index b1b711d9b214..b64df94ec476 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -334,9 +334,11 @@ static inline int check_lpm_trie(void) return 1; } +#define INNER_MAX_ENTRIES 1234 + struct inner_map { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, INNER_MAX_ENTRIES); __type(key, __u32); __type(value, __u32); } inner_map SEC(".maps"); @@ -348,7 +350,7 @@ struct { __type(value, __u32); __array(values, struct { __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, INNER_MAX_ENTRIES); __type(key, __u32); __type(value, __u32); }); @@ -360,8 +362,13 @@ static inline int check_array_of_maps(void) { struct bpf_array *array_of_maps = (struct bpf_array *)&m_array_of_maps; struct 
bpf_map *map = (struct bpf_map *)&m_array_of_maps; + struct bpf_array *inner_map; + int key = 0; VERIFY(check_default(&array_of_maps->map, map)); + inner_map = bpf_map_lookup_elem(array_of_maps, &key); + VERIFY(inner_map != 0); + VERIFY(inner_map->map.max_entries == INNER_MAX_ENTRIES); return 1; } @@ -382,8 +389,13 @@ static inline int check_hash_of_maps(void) { struct bpf_htab *hash_of_maps = (struct bpf_htab *)&m_hash_of_maps; struct bpf_map *map = (struct bpf_map *)&m_hash_of_maps; + struct bpf_htab *inner_map; + int key = 2; VERIFY(check_default(&hash_of_maps->map, map)); + inner_map = bpf_map_lookup_elem(hash_of_maps, &key); + VERIFY(inner_map != 0); + VERIFY(inner_map->map.max_entries == INNER_MAX_ENTRIES); return 1; } From 650c9dbd101ba7d7180f4e77deb1c273f4ea5ca3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:37 -0800 Subject: [PATCH 072/115] selftests/bpf: Convert map_ptr_kern test to use light skeleton. To exercise CO-RE in the kernel further convert map_ptr_kern test to light skeleton. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-15-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/Makefile | 3 ++- tools/testing/selftests/bpf/prog_tests/map_ptr.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6046f86841cd..200ebcc73651 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -325,7 +325,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ - test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c + test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \ + map_ptr_kern.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c SKEL_BLACKLIST += $$(LSKELS) diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c b/tools/testing/selftests/bpf/prog_tests/map_ptr.c index 4972f92205c7..273725504f11 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_ptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c @@ -4,31 +4,29 @@ #include #include -#include "map_ptr_kern.skel.h" +#include "map_ptr_kern.lskel.h" void test_map_ptr(void) { - struct map_ptr_kern *skel; + struct map_ptr_kern_lskel *skel; __u32 duration = 0, retval; char buf[128]; int err; int page_size = getpagesize(); - skel = map_ptr_kern__open(); + skel = map_ptr_kern_lskel__open(); if (!ASSERT_OK_PTR(skel, "skel_open")) return; - err = bpf_map__set_max_entries(skel->maps.m_ringbuf, page_size); - if (!ASSERT_OK(err, "bpf_map__set_max_entries")) - goto cleanup; + skel->maps.m_ringbuf.max_entries = page_size; - err = map_ptr_kern__load(skel); + err = map_ptr_kern_lskel__load(skel); if (!ASSERT_OK(err, 
"skel_load")) goto cleanup; skel->bss->page_size = page_size; - err = bpf_prog_test_run(bpf_program__fd(skel->progs.cg_skb), 1, &pkt_v4, + err = bpf_prog_test_run(skel->progs.cg_skb.prog_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, NULL, &retval, NULL); if (CHECK(err, "test_run", "err=%d errno=%d\n", err, errno)) @@ -39,5 +37,5 @@ void test_map_ptr(void) goto cleanup; cleanup: - map_ptr_kern__destroy(skel); + map_ptr_kern_lskel__destroy(skel); } From 26b367e3663931f2fee5f0786a1eff712e67b0bf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:38 -0800 Subject: [PATCH 073/115] selftests/bpf: Additional test for CO-RE in the kernel. Add a test where randmap() function is appended to three different bpf programs. That action checks struct bpf_core_relo replication logic and offset adjustment in gen loader part of libbpf. Fourth bpf program has 360 CO-RE relocations from vmlinux, bpf_testmod, and non-existing type. It tests candidate cache logic. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-16-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/Makefile | 2 +- .../selftests/bpf/prog_tests/core_kern.c | 14 +++ tools/testing/selftests/bpf/progs/core_kern.c | 104 ++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/core_kern.c create mode 100644 tools/testing/selftests/bpf/progs/core_kern.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 200ebcc73651..8981369b071b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -326,7 +326,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \ - map_ptr_kern.c + map_ptr_kern.c core_kern.c # Generate both 
light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c SKEL_BLACKLIST += $$(LSKELS) diff --git a/tools/testing/selftests/bpf/prog_tests/core_kern.c b/tools/testing/selftests/bpf/prog_tests/core_kern.c new file mode 100644 index 000000000000..561c5185d886 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/core_kern.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include "test_progs.h" +#include "core_kern.lskel.h" + +void test_core_kern_lskel(void) +{ + struct core_kern_lskel *skel; + + skel = core_kern_lskel__open_and_load(); + ASSERT_OK_PTR(skel, "open_and_load"); + core_kern_lskel__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/core_kern.c b/tools/testing/selftests/bpf/progs/core_kern.c new file mode 100644 index 000000000000..13499cc15c7d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/core_kern.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" + +#include +#include +#include + +#define ATTR __always_inline +#include "test_jhash.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 256); +} array1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 256); +} array2 SEC(".maps"); + +static __noinline int randmap(int v, const struct net_device *dev) +{ + struct bpf_map *map = (struct bpf_map *)&array1; + int key = bpf_get_prandom_u32() & 0xff; + int *val; + + if (bpf_get_prandom_u32() & 1) + map = (struct bpf_map *)&array2; + + val = bpf_map_lookup_elem(map, &key); + if (val) + *val = bpf_get_prandom_u32() + v + dev->mtu; + + return 0; +} + +SEC("tp_btf/xdp_devmap_xmit") +int BPF_PROG(tp_xdp_devmap_xmit_multi, const struct net_device + *from_dev, const struct net_device *to_dev, int sent, int drops, + int err) +{ + return 
randmap(from_dev->ifindex, from_dev); +} + +SEC("fentry/eth_type_trans") +int BPF_PROG(fentry_eth_type_trans, struct sk_buff *skb, + struct net_device *dev, unsigned short protocol) +{ + return randmap(dev->ifindex + skb->len, dev); +} + +SEC("fexit/eth_type_trans") +int BPF_PROG(fexit_eth_type_trans, struct sk_buff *skb, + struct net_device *dev, unsigned short protocol) +{ + return randmap(dev->ifindex + skb->len, dev); +} + +volatile const int never; + +struct __sk_bUfF /* it will not exist in vmlinux */ { + int len; +} __attribute__((preserve_access_index)); + +struct bpf_testmod_test_read_ctx /* it exists in bpf_testmod */ { + size_t len; +} __attribute__((preserve_access_index)); + +SEC("tc") +int balancer_ingress(struct __sk_buff *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + void *ptr; + int ret = 0, nh_off, i = 0; + + nh_off = 14; + + /* pragma unroll doesn't work on large loops */ +#define C do { \ + ptr = data + i; \ + if (ptr + nh_off > data_end) \ + break; \ + ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \ + if (never) { \ + /* below is a dead code with unresolvable CO-RE relo */ \ + i += ((struct __sk_bUfF *)ctx)->len; \ + /* this CO-RE relo may or may not resolve + * depending on whether bpf_testmod is loaded. + */ \ + i += ((struct bpf_testmod_test_read_ctx *)ctx)->len; \ + } \ + } while (0); +#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C; + C30;C30;C30; /* 90 calls */ + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; From 3268f0316af629474ec4fa8d9b4e6f618cb96794 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:39 -0800 Subject: [PATCH 074/115] selftests/bpf: Revert CO-RE removal in test_ksyms_weak. The commit 087cba799ced ("selftests/bpf: Add weak/typeless ksym test for light skeleton") added test_ksyms_weak to light skeleton testing, but remove CO-RE access. 
Revert that part of the commit, since light skeleton can use CO-RE in the kernel. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-17-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/progs/test_ksyms_weak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c index 8eadbd4caf7a..5f8379aadb29 100644 --- a/tools/testing/selftests/bpf/progs/test_ksyms_weak.c +++ b/tools/testing/selftests/bpf/progs/test_ksyms_weak.c @@ -38,7 +38,7 @@ int pass_handler(const void *ctx) /* tests existing symbols. */ rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); if (rq) - out__existing_typed = 0; + out__existing_typed = rq->cpu; out__existing_typeless = (__u64)&bpf_prog_active; /* tests non-existent symbols. */ From 098dc5335a2083223c80d058ab4d23f6ce120b97 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:40 -0800 Subject: [PATCH 075/115] selftests/bpf: Add CO-RE relocations to verifier scale test. Add 182 CO-RE relocations to verifier scale test.
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-18-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/progs/test_verif_scale2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale2.c b/tools/testing/selftests/bpf/progs/test_verif_scale2.c index f024154c7be7..f90ffcafd1e8 100644 --- a/tools/testing/selftests/bpf/progs/test_verif_scale2.c +++ b/tools/testing/selftests/bpf/progs/test_verif_scale2.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include +#include "vmlinux.h" #include #define ATTR __always_inline #include "test_jhash.h" -SEC("scale90_inline") +SEC("tc") int balancer_ingress(struct __sk_buff *ctx) { void *data_end = (void *)(long)ctx->data_end; From eee9a6df0eed6481d5448a55b218a45868b41b5b Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 1 Dec 2021 14:51:02 +0000 Subject: [PATCH 076/115] selftests/bpf: Build testing_helpers.o out of tree Add $(OUTPUT) prefix to testing_helpers.o, so it can be built out of tree when necessary. At the moment, in addition to being built in-tree even when out-of-tree is required, testing_helpers.o is not built with the right recipe when cross-building. For consistency the other helpers, cgroup_helpers and trace_helpers, can also be passed as objects instead of source. Use *_HELPERS variable to keep the Makefile readable. 
Fixes: f87c1930ac29 ("selftests/bpf: Merge test_stub.c into testing_helpers.c") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201145101.823159-1-jean-philippe@linaro.org --- tools/testing/selftests/bpf/Makefile | 40 +++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8981369b071b..cd73dccaeb09 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -192,22 +192,26 @@ TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL) $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(BPFOBJ) -$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_sock: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_sock_addr: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_sockmap: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c testing_helpers.o -$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_sock_fields: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_sysctl: cgroup_helpers.c testing_helpers.o -$(OUTPUT)/test_tag: testing_helpers.o -$(OUTPUT)/test_lirc_mode2_user: testing_helpers.o -$(OUTPUT)/xdping: testing_helpers.o -$(OUTPUT)/flow_dissector_load: testing_helpers.o -$(OUTPUT)/test_maps: testing_helpers.o -$(OUTPUT)/test_verifier: testing_helpers.o +CGROUP_HELPERS := $(OUTPUT)/cgroup_helpers.o +TESTING_HELPERS := $(OUTPUT)/testing_helpers.o +TRACE_HELPERS := $(OUTPUT)/trace_helpers.o + +$(OUTPUT)/test_dev_cgroup: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_skb_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sock: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sock_addr: $(CGROUP_HELPERS) 
$(TESTING_HELPERS) +$(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) +$(OUTPUT)/get_cgroup_id_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_cgroup_storage: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) +$(OUTPUT)/test_tag: $(TESTING_HELPERS) +$(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS) +$(OUTPUT)/xdping: $(TESTING_HELPERS) +$(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) +$(OUTPUT)/test_maps: $(TESTING_HELPERS) +$(OUTPUT)/test_verifier: $(TESTING_HELPERS) BPFTOOL ?= $(DEFAULT_BPFTOOL) $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ @@ -536,8 +540,8 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ - $(OUTPUT)/testing_helpers.o \ - $(OUTPUT)/trace_helpers.o \ + $(TESTING_HELPERS) \ + $(TRACE_HELPERS) \ $(OUTPUT)/bench_count.o \ $(OUTPUT)/bench_rename.o \ $(OUTPUT)/bench_trigger.o \ From 8b4ff5f8bb126fa8ee6918f4854748277609cf68 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Nov 2021 16:50:30 -0800 Subject: [PATCH 077/115] selftests/bpf: Update test names for xchg and cmpxchg The test_cmpxchg() and test_xchg() functions say "test_run add". Therefore, make them say "test_run cmpxchg" and "test_run xchg", respectively. Signed-off-by: Paul E. 
McKenney Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201005030.GA3071525@paulmck-ThinkPad-P17-Gen-1 --- tools/testing/selftests/bpf/prog_tests/atomics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/atomics.c b/tools/testing/selftests/bpf/prog_tests/atomics.c index 0f9525293881..86b7d5d84eec 100644 --- a/tools/testing/selftests/bpf/prog_tests/atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/atomics.c @@ -167,7 +167,7 @@ static void test_cmpxchg(struct atomics_lskel *skel) prog_fd = skel->progs.cmpxchg.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - if (CHECK(err || retval, "test_run add", + if (CHECK(err || retval, "test_run cmpxchg", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) goto cleanup; @@ -196,7 +196,7 @@ static void test_xchg(struct atomics_lskel *skel) prog_fd = skel->progs.xchg.prog_fd; err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - if (CHECK(err || retval, "test_run add", + if (CHECK(err || retval, "test_run xchg", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration)) goto cleanup; From 74d9807023573ba2d82ec3f505f6aa0c7076918c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:16 -0800 Subject: [PATCH 078/115] libbpf: Use __u32 fields in bpf_map_create_opts Corresponding Linux UAPI struct uses __u32, not int, so keep it consistent. 
Fixes: 992c4225419a ("libbpf: Unify low-level map creation APIs w/ new bpf_map_create()") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-2-andrii@kernel.org --- tools/lib/bpf/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 70b6f44fc8b0..f79e5fbcf1c1 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -43,12 +43,12 @@ struct bpf_map_create_opts { __u32 btf_value_type_id; __u32 btf_vmlinux_value_type_id; - int inner_map_fd; - int map_flags; + __u32 inner_map_fd; + __u32 map_flags; __u64 map_extra; - int numa_node; - int map_ifindex; + __u32 numa_node; + __u32 map_ifindex; }; #define bpf_map_create_opts__last_field map_ifindex From dbdd2c7f8cec2d09ae0e1bd707ae6050fa1c105f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:17 -0800 Subject: [PATCH 079/115] libbpf: Add API to get/set log_level at per-program level Add bpf_program__set_log_level() and bpf_program__log_level() to fetch and adjust log_level sent during BPF_PROG_LOAD command. This allows to selectively request more or less verbose output in BPF verifier log. Also bump libbpf version to 0.7 and make these APIs the first in v0.7. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-3-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 14 ++++++++++++++ tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 6 ++++++ tools/lib/bpf/libbpf_version.h | 2 +- 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1341ce539662..de260c94e418 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8475,6 +8475,20 @@ int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) return 0; } +__u32 bpf_program__log_level(const struct bpf_program *prog) +{ + return prog->log_level; +} + +int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->log_level = log_level; + return 0; +} + #define SEC_DEF(sec_pfx, ptype, atype, flags, ...) { \ .sec = sec_pfx, \ .prog_type = BPF_PROG_TYPE_##ptype, \ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index d02139fec4ac..148fa85bab33 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -499,6 +499,8 @@ bpf_program__set_expected_attach_type(struct bpf_program *prog, LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags); +LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level); LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 623002b83b2b..715df3a27389 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -419,3 +419,9 @@ LIBBPF_0.6.0 { perf_buffer__new_raw; perf_buffer__new_raw_deprecated; } LIBBPF_0.5.0; + +LIBBPF_0.7.0 { + global: + bpf_program__log_level; + bpf_program__set_log_level; +}; diff 
--git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h index dd56d76f291c..0fefefc3500b 100644 --- a/tools/lib/bpf/libbpf_version.h +++ b/tools/lib/bpf/libbpf_version.h @@ -4,6 +4,6 @@ #define __LIBBPF_VERSION_H #define LIBBPF_MAJOR_VERSION 0 -#define LIBBPF_MINOR_VERSION 6 +#define LIBBPF_MINOR_VERSION 7 #endif /* __LIBBPF_VERSION_H */ From a15d408b839af421fba0a2ff6df193c13ef753d4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:18 -0800 Subject: [PATCH 080/115] bpftool: Migrate off of deprecated bpf_create_map_xattr() API Switch to bpf_map_create() API instead. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-4-andrii@kernel.org --- tools/bpf/bpftool/map.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 25b258804f11..cc530a229812 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -1261,7 +1261,10 @@ static int do_pin(int argc, char **argv) static int do_create(int argc, char **argv) { - struct bpf_create_map_attr attr = { NULL, }; + LIBBPF_OPTS(bpf_map_create_opts, attr); + enum bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC; + __u32 key_size = 0, value_size = 0, max_entries = 0; + const char *map_name = NULL; const char *pinfile; int err = -1, fd; @@ -1276,30 +1279,30 @@ static int do_create(int argc, char **argv) if (is_prefix(*argv, "type")) { NEXT_ARG(); - if (attr.map_type) { + if (map_type) { p_err("map type already specified"); goto exit; } - attr.map_type = map_type_from_str(*argv); - if ((int)attr.map_type < 0) { + map_type = map_type_from_str(*argv); + if ((int)map_type < 0) { p_err("unrecognized map type: %s", *argv); goto exit; } NEXT_ARG(); } else if (is_prefix(*argv, "name")) { NEXT_ARG(); - attr.name = GET_ARG(); + map_name = GET_ARG(); } else if (is_prefix(*argv, "key")) { - if (parse_u32_arg(&argc, &argv, 
&attr.key_size, + if (parse_u32_arg(&argc, &argv, &key_size, "key size")) goto exit; } else if (is_prefix(*argv, "value")) { - if (parse_u32_arg(&argc, &argv, &attr.value_size, + if (parse_u32_arg(&argc, &argv, &value_size, "value size")) goto exit; } else if (is_prefix(*argv, "entries")) { - if (parse_u32_arg(&argc, &argv, &attr.max_entries, + if (parse_u32_arg(&argc, &argv, &max_entries, "max entries")) goto exit; } else if (is_prefix(*argv, "flags")) { @@ -1340,14 +1343,14 @@ static int do_create(int argc, char **argv) } } - if (!attr.name) { + if (!map_name) { p_err("map name not specified"); goto exit; } set_max_rlimit(); - fd = bpf_create_map_xattr(&attr); + fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, &attr); if (fd < 0) { p_err("map create failed: %s", strerror(errno)); goto exit; From 045b233a29a2ea3a168296f000cd5b1c08c4a2f7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:19 -0800 Subject: [PATCH 081/115] selftests/bpf: Remove recently reintroduced legacy btf__dedup() use We've added one extra patch that added back the use of legacy btf__dedup() variant. Clean that up. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-5-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c index 94ff9757557a..878a864dae3b 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -364,7 +364,7 @@ static void test_split_dup_struct_in_cu() "\t'f2' type_id=1 bits_offset=32"); /* ..dedup them... 
*/ - err = btf__dedup(btf1, NULL, NULL); + err = btf__dedup(btf1, NULL); if (!ASSERT_OK(err, "btf_dedup")) goto cleanup; @@ -405,7 +405,7 @@ static void test_split_dup_struct_in_cu() "\t'f1' type_id=4 bits_offset=0\n" "\t'f2' type_id=4 bits_offset=32"); - err = btf__dedup(btf2, NULL, NULL); + err = btf__dedup(btf2, NULL); if (!ASSERT_OK(err, "btf_dedup")) goto cleanup; From 00872de6e1b004377f6036f95db43e2145606eb2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:20 -0800 Subject: [PATCH 082/115] selftests/bpf: Mute xdpxceiver.c's deprecation warnings xdpxceiver.c is using AF_XDP APIs that are deprecated starting from libbpf 0.7. Until we migrate the test to libxdp or solve this issue in some other way, mute deprecation warnings within xdpxceiver.c. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-6-andrii@kernel.org --- tools/testing/selftests/bpf/xdpxceiver.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 040164c7efc1..0a5d23da486d 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -100,6 +100,12 @@ #include "xdpxceiver.h" #include "../kselftest.h" +/* AF_XDP APIs were moved into libxdp and marked as deprecated in libbpf. 
+ * Until xdpxceiver is either moved or re-writed into libxdp, suppress + * deprecation warnings in this file + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; static const char *IP1 = "192.168.100.162"; From 186d1a86003ddcf0ec9e85e17ece868663106639 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:21 -0800 Subject: [PATCH 083/115] selftests/bpf: Remove all the uses of deprecated bpf_prog_load_xattr() Migrate all the selftests that were still using bpf_prog_load_xattr(). Few are converted to skeleton, others will use bpf_object__open_file() API. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-7-andrii@kernel.org --- .../bpf/prog_tests/bpf_verif_scale.c | 28 ++++++--- .../bpf/prog_tests/connect_force_port.c | 17 +++--- .../selftests/bpf/prog_tests/kfree_skb.c | 60 +++++++------------ .../bpf/prog_tests/sockopt_inherit.c | 12 ++-- .../selftests/bpf/prog_tests/sockopt_multi.c | 12 ++-- .../selftests/bpf/prog_tests/tcp_rtt.c | 21 +++---- .../bpf/prog_tests/test_global_funcs.c | 28 ++++++--- tools/testing/selftests/bpf/test_sock_addr.c | 37 ++++++++---- .../selftests/bpf/xdp_redirect_multi.c | 15 ++--- 9 files changed, 121 insertions(+), 109 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c index 1fb16f8dad56..ff6cce9fef06 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c @@ -19,16 +19,28 @@ extern int extra_prog_load_log_flags; static int check_load(const char *file, enum bpf_prog_type type) { - struct bpf_prog_load_attr attr; struct bpf_object *obj = NULL; - int err, prog_fd; + struct bpf_program *prog; + int err; - memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); - 
attr.file = file; - attr.prog_type = type; - attr.log_level = 4 | extra_prog_load_log_flags; - attr.prog_flags = BPF_F_TEST_RND_HI32; - err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + obj = bpf_object__open_file(file, NULL); + err = libbpf_get_error(obj); + if (err) + return err; + + prog = bpf_object__next_program(obj, NULL); + if (!prog) { + err = -ENOENT; + goto err_out; + } + + bpf_program__set_type(prog, type); + bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32); + bpf_program__set_log_level(prog, 4 | extra_prog_load_log_flags); + + err = bpf_object__load(obj); + +err_out: bpf_object__close(obj); return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c index 9229db2f5ca5..ca574e1e30e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -51,19 +51,20 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type) bool v4 = family == AF_INET; __u16 expected_local_port = v4 ? 22222 : 22223; __u16 expected_peer_port = 60000; - struct bpf_prog_load_attr attr = { - .file = v4 ? "./connect_force_port4.o" : - "./connect_force_port6.o", - }; struct bpf_program *prog; struct bpf_object *obj; - int xlate_fd, fd, err; + const char *obj_file = v4 ? "connect_force_port4.o" : "connect_force_port6.o"; + int fd, err; __u32 duration = 0; - err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd); - if (err) { - log_err("Failed to load BPF object"); + obj = bpf_object__open_file(obj_file, NULL); + if (!ASSERT_OK_PTR(obj, "bpf_obj_open")) return -1; + + err = bpf_object__load(obj); + if (!ASSERT_OK(err, "bpf_obj_load")) { + err = -EIO; + goto close_bpf_object; } prog = bpf_object__find_program_by_title(obj, v4 ? 
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c index 2a49f8fcde06..ce10d2fc3a6c 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c +++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include "kfree_skb.skel.h" struct meta { int ifindex; @@ -58,16 +59,11 @@ void serial_test_kfree_skb(void) .ctx_in = &skb, .ctx_size_in = sizeof(skb), }; - struct bpf_prog_load_attr attr = { - .file = "./kfree_skb.o", - }; - - struct bpf_link *link = NULL, *link_fentry = NULL, *link_fexit = NULL; - struct bpf_map *perf_buf_map, *global_data; - struct bpf_program *prog, *fentry, *fexit; - struct bpf_object *obj, *obj2 = NULL; + struct kfree_skb *skel = NULL; + struct bpf_link *link; + struct bpf_object *obj; struct perf_buffer *pb = NULL; - int err, kfree_skb_fd; + int err; bool passed = false; __u32 duration = 0; const int zero = 0; @@ -78,40 +74,27 @@ void serial_test_kfree_skb(void) if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) return; - err = bpf_prog_load_xattr(&attr, &obj2, &kfree_skb_fd); - if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) + skel = kfree_skb__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kfree_skb_skel")) goto close_prog; - prog = bpf_object__find_program_by_title(obj2, "tp_btf/kfree_skb"); - if (CHECK(!prog, "find_prog", "prog kfree_skb not found\n")) - goto close_prog; - fentry = bpf_object__find_program_by_title(obj2, "fentry/eth_type_trans"); - if (CHECK(!fentry, "find_prog", "prog eth_type_trans not found\n")) - goto close_prog; - fexit = bpf_object__find_program_by_title(obj2, "fexit/eth_type_trans"); - if (CHECK(!fexit, "find_prog", "prog eth_type_trans not found\n")) - goto close_prog; - - global_data = bpf_object__find_map_by_name(obj2, ".bss"); - if (CHECK(!global_data, "find global data", "not found\n")) - goto close_prog; - - link = 
bpf_program__attach_raw_tracepoint(prog, NULL); + link = bpf_program__attach_raw_tracepoint(skel->progs.trace_kfree_skb, NULL); if (!ASSERT_OK_PTR(link, "attach_raw_tp")) goto close_prog; - link_fentry = bpf_program__attach_trace(fentry); - if (!ASSERT_OK_PTR(link_fentry, "attach fentry")) - goto close_prog; - link_fexit = bpf_program__attach_trace(fexit); - if (!ASSERT_OK_PTR(link_fexit, "attach fexit")) - goto close_prog; + skel->links.trace_kfree_skb = link; - perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map"); - if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n")) + link = bpf_program__attach_trace(skel->progs.fentry_eth_type_trans); + if (!ASSERT_OK_PTR(link, "attach fentry")) goto close_prog; + skel->links.fentry_eth_type_trans = link; + + link = bpf_program__attach_trace(skel->progs.fexit_eth_type_trans); + if (!ASSERT_OK_PTR(link, "attach fexit")) + goto close_prog; + skel->links.fexit_eth_type_trans = link; /* set up perf buffer */ - pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, + pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1, on_sample, NULL, &passed, NULL); if (!ASSERT_OK_PTR(pb, "perf_buf__new")) goto close_prog; @@ -133,7 +116,7 @@ void serial_test_kfree_skb(void) */ ASSERT_TRUE(passed, "passed"); - err = bpf_map_lookup_elem(bpf_map__fd(global_data), &zero, test_ok); + err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.bss), &zero, test_ok); if (CHECK(err, "get_result", "failed to get output data: %d\n", err)) goto close_prog; @@ -141,9 +124,6 @@ void serial_test_kfree_skb(void) CHECK_FAIL(!test_ok[0] || !test_ok[1]); close_prog: perf_buffer__free(pb); - bpf_link__destroy(link); - bpf_link__destroy(link_fentry); - bpf_link__destroy(link_fexit); bpf_object__close(obj); - bpf_object__close(obj2); + kfree_skb__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c index 86f97681ad89..6a953f4adfdc 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c @@ -167,20 +167,20 @@ static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title) static void run_test(int cgroup_fd) { - struct bpf_prog_load_attr attr = { - .file = "./sockopt_inherit.o", - }; int server_fd = -1, client_fd; struct bpf_object *obj; void *server_err; pthread_t tid; - int ignored; int err; - err = bpf_prog_load_xattr(&attr, &obj, &ignored); - if (CHECK_FAIL(err)) + obj = bpf_object__open_file("sockopt_inherit.o", NULL); + if (!ASSERT_OK_PTR(obj, "obj_open")) return; + err = bpf_object__load(obj); + if (!ASSERT_OK(err, "obj_load")) + goto close_bpf_object; + err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt"); if (CHECK_FAIL(err)) goto close_bpf_object; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c index bc34f7773444..abce12ddcc37 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c @@ -297,14 +297,10 @@ static int run_setsockopt_test(struct bpf_object *obj, int cg_parent, void test_sockopt_multi(void) { - struct bpf_prog_load_attr attr = { - .file = "./sockopt_multi.o", - }; int cg_parent = -1, cg_child = -1; struct bpf_object *obj = NULL; int sock_fd = -1; int err = -1; - int ignored; cg_parent = test__join_cgroup("/parent"); if (CHECK_FAIL(cg_parent < 0)) @@ -314,8 +310,12 @@ void test_sockopt_multi(void) if (CHECK_FAIL(cg_child < 0)) goto out; - err = bpf_prog_load_xattr(&attr, &obj, &ignored); - if (CHECK_FAIL(err)) + obj = bpf_object__open_file("sockopt_multi.o", NULL); + if (!ASSERT_OK_PTR(obj, "obj_load")) + goto out; + + err = bpf_object__load(obj); + if (!ASSERT_OK(err, "obj_load")) goto out; sock_fd = socket(AF_INET, SOCK_STREAM, 0); diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c 
index 265b4fe33ec3..96ff2c20af81 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c @@ -2,6 +2,7 @@ #include #include "cgroup_helpers.h" #include "network_helpers.h" +#include "tcp_rtt.skel.h" struct tcp_rtt_storage { __u32 invoked; @@ -91,26 +92,18 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked, static int run_test(int cgroup_fd, int server_fd) { - struct bpf_prog_load_attr attr = { - .prog_type = BPF_PROG_TYPE_SOCK_OPS, - .file = "./tcp_rtt.o", - .expected_attach_type = BPF_CGROUP_SOCK_OPS, - }; - struct bpf_object *obj; - struct bpf_map *map; + struct tcp_rtt *skel; int client_fd; int prog_fd; int map_fd; int err; - err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); - if (err) { - log_err("Failed to load BPF object"); + skel = tcp_rtt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load")) return -1; - } - map = bpf_object__next_map(obj, NULL); - map_fd = bpf_map__fd(map); + map_fd = bpf_map__fd(skel->maps.socket_storage_map); + prog_fd = bpf_program__fd(skel->progs._sockops); err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0); if (err) { @@ -149,7 +142,7 @@ static int run_test(int cgroup_fd, int server_fd) close(client_fd); close_bpf_object: - bpf_object__close(obj); + tcp_rtt__destroy(skel); return err; } diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 7e13129f593a..509e21d5cb9d 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -30,17 +30,29 @@ extern int extra_prog_load_log_flags; static int check_load(const char *file) { - struct bpf_prog_load_attr attr; struct bpf_object *obj = NULL; - int err, prog_fd; + struct bpf_program *prog; + int err; - memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); - attr.file = file; - attr.prog_type = BPF_PROG_TYPE_UNSPEC; - 
attr.log_level = extra_prog_load_log_flags; - attr.prog_flags = BPF_F_TEST_RND_HI32; found = false; - err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + + obj = bpf_object__open_file(file, NULL); + err = libbpf_get_error(obj); + if (err) + return err; + + prog = bpf_object__next_program(obj, NULL); + if (!prog) { + err = -ENOENT; + goto err_out; + } + + bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32); + bpf_program__set_log_level(prog, extra_prog_load_log_flags); + + err = bpf_object__load(obj); + +err_out: bpf_object__close(obj); return err; } diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 05c9e4944c01..f0c8d05ba6d1 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -663,23 +663,36 @@ static int load_insns(const struct sock_addr_test *test, static int load_path(const struct sock_addr_test *test, const char *path) { - struct bpf_prog_load_attr attr; struct bpf_object *obj; - int prog_fd; + struct bpf_program *prog; + int err; - memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); - attr.file = path; - attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; - attr.expected_attach_type = test->expected_attach_type; - attr.prog_flags = BPF_F_TEST_RND_HI32; - - if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) { - if (test->expected_result != LOAD_REJECT) - log_err(">>> Loading program (%s) error.\n", path); + obj = bpf_object__open_file(path, NULL); + err = libbpf_get_error(obj); + if (err) { + log_err(">>> Opening BPF object (%s) error.\n", path); return -1; } - return prog_fd; + prog = bpf_object__next_program(obj, NULL); + if (!prog) + goto err_out; + + bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR); + bpf_program__set_expected_attach_type(prog, test->expected_attach_type); + bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32); + + err = bpf_object__load(obj); + if (err) { + if (test->expected_result != LOAD_REJECT) + log_err(">>> 
Loading program (%s) error.\n", path); + goto err_out; + } + + return bpf_program__fd(prog); +err_out: + bpf_object__close(obj); + return -1; } static int bind4_prog_load(const struct sock_addr_test *test) diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c index f5ffba341c17..51c8224b4ccc 100644 --- a/tools/testing/selftests/bpf/xdp_redirect_multi.c +++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c @@ -85,10 +85,7 @@ int main(int argc, char **argv) { int prog_fd, group_all, mac_map; struct bpf_program *ingress_prog, *egress_prog; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - int i, ret, opt, egress_prog_fd = 0; + int i, err, ret, opt, egress_prog_fd = 0; struct bpf_devmap_val devmap_val; bool attach_egress_prog = false; unsigned char mac_addr[6]; @@ -147,10 +144,14 @@ int main(int argc, char **argv) printf("\n"); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + err = libbpf_get_error(obj); + if (err) goto err_out; + err = bpf_object__load(obj); + if (err) + goto err_out; + prog_fd = bpf_program__fd(bpf_object__next_program(obj, NULL)); if (attach_egress_prog) group_all = bpf_object__find_map_fd_by_name(obj, "map_egress"); From 527024f7aeb683ce7ef49b07ef7ce9ecf015288d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:22 -0800 Subject: [PATCH 084/115] samples/bpf: Clean up samples/bpf build failures Remove xdp_sample_user.o rule redefinition which generates a Makefile warning and instead override TPROGS_CFLAGS. This seems to work fine when building inside selftests/bpf. That was one big head-scratcher before I found that generic Makefile.target hid this surprising specialization for xdp_sample_user.o. Main change is to use actual locally installed libbpf headers.
Also drop printk macro re-definition (not even used!). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-8-andrii@kernel.org --- samples/bpf/Makefile | 13 ++++++++++++- samples/bpf/Makefile.target | 11 ----------- samples/bpf/hbm_kern.h | 2 -- samples/bpf/lwt_len_hist_kern.c | 7 ------- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a886dff1ba89..6ae62b1dc938 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -328,7 +328,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) libbpf_hdrs: $(LIBBPF) -$(obj)/$(TRACE_HELPERS): | libbpf_hdrs +$(obj)/$(TRACE_HELPERS) $(obj)/$(CGROUP_HELPERS) $(obj)/$(XDP_SAMPLE): | libbpf_hdrs .PHONY: libbpf_hdrs @@ -343,6 +343,17 @@ $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +# Override includes for xdp_sample_user.o because $(srctree)/usr/include in +# TPROGS_CFLAGS causes conflicts +XDP_SAMPLE_CFLAGS += -Wall -O2 -lm \ + -I$(src)/../../tools/include \ + -I$(src)/../../tools/include/uapi \ + -I$(LIBBPF_INCLUDE) \ + -I$(src)/../../tools/testing/selftests/bpf + +$(obj)/$(XDP_SAMPLE): TPROGS_CFLAGS = $(XDP_SAMPLE_CFLAGS) +$(obj)/$(XDP_SAMPLE): $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h + -include $(BPF_SAMPLES_PATH)/Makefile.target VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ diff --git a/samples/bpf/Makefile.target b/samples/bpf/Makefile.target index 5a368affa038..7621f55e2947 100644 --- a/samples/bpf/Makefile.target +++ b/samples/bpf/Makefile.target @@ -73,14 +73,3 @@ quiet_cmd_tprog-cobjs = CC $@ cmd_tprog-cobjs = $(CC) $(tprogc_flags) -c -o $@ $< $(tprog-cobjs): $(obj)/%.o: $(src)/%.c FORCE $(call if_changed_dep,tprog-cobjs) - -# Override includes for xdp_sample_user.o because $(srctree)/usr/include in -# TPROGS_CFLAGS causes 
conflicts -XDP_SAMPLE_CFLAGS += -Wall -O2 -lm \ - -I./tools/include \ - -I./tools/include/uapi \ - -I./tools/lib \ - -I./tools/testing/selftests/bpf -$(obj)/xdp_sample_user.o: $(src)/xdp_sample_user.c \ - $(src)/xdp_sample_user.h $(src)/xdp_sample_shared.h - $(CC) $(XDP_SAMPLE_CFLAGS) -c -o $@ $< diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index 722b3fadb467..1752a46a2b05 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -9,8 +9,6 @@ * Include file for sample Host Bandwidth Manager (HBM) BPF programs */ #define KBUILD_MODNAME "foo" -#include -#include #include #include #include diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c index 9ed63e10e170..1fa14c54963a 100644 --- a/samples/bpf/lwt_len_hist_kern.c +++ b/samples/bpf/lwt_len_hist_kern.c @@ -16,13 +16,6 @@ #include #include -# define printk(fmt, ...) \ - ({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ - }) - struct bpf_elf_map { __u32 type; __u32 size_key; From c58f9815ba9735752d3735efb915e8878604684b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:23 -0800 Subject: [PATCH 085/115] samples/bpf: Get rid of deprecated libbpf API uses Replace deprecated APIs with new ones. Also mute source code using deprecated AF_XDP (xsk.h). Figuring out what to do with all the AF_XDP stuff is a separate problem that should be solved with its own set of changes. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-9-andrii@kernel.org --- samples/bpf/cookie_uid_helper_example.c | 14 +++++++++----- samples/bpf/fds_example.c | 24 +++++++++++++++--------- samples/bpf/map_perf_test_user.c | 15 +++++++++------ samples/bpf/sock_example.c | 12 ++++++++---- samples/bpf/sockex1_user.c | 15 ++++++++++++--- samples/bpf/sockex2_user.c | 16 ++++++++++++---- samples/bpf/test_cgrp2_array_pin.c | 4 ++-- samples/bpf/test_cgrp2_attach.c | 13 ++++++++----- samples/bpf/test_cgrp2_sock.c | 8 ++++++-- samples/bpf/test_lru_dist.c | 11 +++++++---- samples/bpf/trace_output_user.c | 4 +--- samples/bpf/xdp_sample_pkts_user.c | 22 +++++++++++----------- samples/bpf/xdpsock_ctrl_proc.c | 3 +++ samples/bpf/xdpsock_user.c | 3 +++ samples/bpf/xsk_fwd.c | 3 +++ 15 files changed, 109 insertions(+), 58 deletions(-) diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index 54958802c032..f0df3dda4b1f 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -67,8 +67,8 @@ static bool test_finish; static void maps_create(void) { - map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t), - sizeof(struct stats), 100, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), + sizeof(struct stats), 100, NULL); if (map_fd < 0) error(1, errno, "map create failed!\n"); } @@ -157,9 +157,13 @@ static void prog_load(void) offsetof(struct __sk_buff, len)), BPF_EXIT_INSN(), }; - prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, - ARRAY_SIZE(prog), "GPL", 0, - log_buf, sizeof(log_buf)); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = sizeof(log_buf), + ); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + prog, ARRAY_SIZE(prog), &opts); if (prog_fd < 0) error(1, errno, "failed to load prog\n%s\n", log_buf); } diff --git 
a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index 9a7c1fd7a4a8..16dbf49e0f19 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -54,16 +54,22 @@ static int bpf_prog_create(const char *object) }; size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); struct bpf_object *obj; - int prog_fd; + int err; if (object) { - assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC, - &obj, &prog_fd)); - return prog_fd; + obj = bpf_object__open_file(object, NULL); + assert(!libbpf_get_error(obj)); + err = bpf_object__load(obj); + assert(!err); + return bpf_program__fd(bpf_object__next_program(obj, NULL)); } else { - return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, - insns, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); + + return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + insns, insns_cnt, &opts); } } @@ -73,8 +79,8 @@ static int bpf_do_map(const char *file, uint32_t flags, uint32_t key, int fd, ret; if (flags & BPF_F_PIN) { - fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), - sizeof(uint32_t), 1024, 0); + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(uint32_t), + sizeof(uint32_t), 1024, NULL); printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); assert(fd > 0); diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 9db949290a78..319fd31522f3 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -134,19 +134,22 @@ static void do_test_lru(enum test_type test, int cpu) */ int outer_fd = map_fd[array_of_lru_hashs_idx]; unsigned int mycpu, mynode; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_NUMA_NODE, + ); assert(cpu < MAX_NR_CPUS); ret = syscall(__NR_getcpu, &mycpu, &mynode, NULL); assert(!ret); + opts.numa_node = mynode; inner_lru_map_fds[cpu] = - bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, - 
test_map_names[INNER_LRU_HASH_PREALLOC], - sizeof(uint32_t), - sizeof(long), - inner_lru_hash_size, 0, - mynode); + bpf_map_create(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], + sizeof(uint32_t), + sizeof(long), + inner_lru_hash_size, &opts); if (inner_lru_map_fds[cpu] == -1) { printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", strerror(errno), errno); diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 23d1930e1927..a88f69504c08 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -37,8 +37,8 @@ static int test_sock(void) int sock = -1, map_fd, prog_fd, i, key; long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), - 256, 0); + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(value), + 256, NULL); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); goto cleanup; @@ -59,9 +59,13 @@ static int test_sock(void) BPF_EXIT_INSN(), }; size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); - prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, insns_cnt, - "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + prog, insns_cnt, &opts); if (prog_fd < 0) { printf("failed to load prog '%s'\n", strerror(errno)); goto cleanup; diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 3c83722877dc..9e8d39e245c1 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -11,17 +11,26 @@ int main(int ac, char **argv) { struct bpf_object *obj; + struct bpf_program *prog; int map_fd, prog_fd; char filename[256]; - int i, sock; + int i, sock, err; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, - &obj, &prog_fd)) + obj = 
bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER); + + err = bpf_object__load(obj); + if (err) + return 1; + + prog_fd = bpf_program__fd(prog); map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); sock = open_raw_sock("lo"); diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index bafa567b840c..6a3fd369d3fc 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -16,18 +16,26 @@ struct pair { int main(int ac, char **argv) { + struct bpf_program *prog; struct bpf_object *obj; int map_fd, prog_fd; char filename[256]; - int i, sock; + int i, sock, err; FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, - &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_SOCKET_FILTER); + + err = bpf_object__load(obj); + if (err) + return 1; + + prog_fd = bpf_program__fd(prog); map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); sock = open_raw_sock("lo"); diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c index 6d564aa75447..05e88aa63009 100644 --- a/samples/bpf/test_cgrp2_array_pin.c +++ b/samples/bpf/test_cgrp2_array_pin.c @@ -64,9 +64,9 @@ int main(int argc, char **argv) } if (create_array) { - array_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY, + array_fd = bpf_map_create(BPF_MAP_TYPE_CGROUP_ARRAY, NULL, sizeof(uint32_t), sizeof(uint32_t), - 1, 0); + 1, NULL); if (array_fd < 0) { fprintf(stderr, "bpf_create_map(BPF_MAP_TYPE_CGROUP_ARRAY,...): %s(%d)\n", diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 390ff38d2ac6..6d90874b09c3 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ 
-71,10 +71,13 @@ static int prog_load(int map_fd, int verdict) BPF_EXIT_INSN(), }; size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); - return bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); + return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB, NULL, "GPL", + prog, insns_cnt, &opts); } static int usage(const char *argv0) @@ -90,9 +93,9 @@ static int attach_filter(int cg_fd, int type, int verdict) int prog_fd, map_fd, ret, key; long long pkt_cnt, byte_cnt; - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(key), sizeof(byte_cnt), - 256, 0); + 256, NULL); if (map_fd < 0) { printf("Failed to create map: '%s'\n", strerror(errno)); return EXIT_FAILURE; diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index b0811da5a00f..a0811df888f4 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -70,6 +70,10 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)), }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = bpf_log_buf, + .log_size = BPF_LOG_BUF_SIZE, + ); struct bpf_insn *prog; size_t insns_cnt; @@ -115,8 +119,8 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) insns_cnt /= sizeof(struct bpf_insn); - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, - "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", + prog, insns_cnt, &opts); free(prog); diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c index c92c5c06b965..75e877853596 100644 --- a/samples/bpf/test_lru_dist.c +++ b/samples/bpf/test_lru_dist.c @@ -105,10 +105,10 @@ struct pfect_lru { static void 
pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size, unsigned int nr_possible_elems) { - lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + lru->map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(unsigned long long), sizeof(struct pfect_lru_node *), - nr_possible_elems, 0); + nr_possible_elems, NULL); assert(lru->map_fd != -1); lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node)); @@ -207,10 +207,13 @@ static unsigned int read_keys(const char *dist_file, static int create_map(int map_type, int map_flags, unsigned int size) { + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = map_flags, + ); int map_fd; - map_fd = bpf_create_map(map_type, sizeof(unsigned long long), - sizeof(unsigned long long), size, map_flags); + map_fd = bpf_map_create(map_type, NULL, sizeof(unsigned long long), + sizeof(unsigned long long), size, &opts); if (map_fd == -1) perror("bpf_create_map"); diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index 364b98764d54..371732f9cf8e 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -43,7 +43,6 @@ static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) int main(int argc, char **argv) { - struct perf_buffer_opts pb_opts = {}; struct bpf_link *link = NULL; struct bpf_program *prog; struct perf_buffer *pb; @@ -84,8 +83,7 @@ int main(int argc, char **argv) goto cleanup; } - pb_opts.sample_cb = print_bpf_output; - pb = perf_buffer__new(map_fd, 8, &pb_opts); + pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL); ret = libbpf_get_error(pb); if (ret) { printf("failed to setup perf_buffer: %d\n", ret); diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index f4382ccdcbb1..587eacb49103 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -110,12 +110,9 @@ static void usage(const char *prog) int main(int argc, char **argv) { - struct bpf_prog_load_attr 
prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - struct perf_buffer_opts pb_opts = {}; const char *optstr = "FS"; int prog_fd, map_fd, opt; + struct bpf_program *prog; struct bpf_object *obj; struct bpf_map *map; char filename[256]; @@ -144,15 +141,19 @@ int main(int argc, char **argv) } snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - prog_load_attr.file = filename; - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + obj = bpf_object__open_file(filename, NULL); + if (libbpf_get_error(obj)) return 1; - if (!prog_fd) { - printf("bpf_prog_load_xattr: %s\n", strerror(errno)); + prog = bpf_object__next_program(obj, NULL); + bpf_program__set_type(prog, BPF_PROG_TYPE_XDP); + + err = bpf_object__load(obj); + if (err) return 1; - } + + prog_fd = bpf_program__fd(prog); map = bpf_object__next_map(obj, NULL); if (!map) { @@ -181,8 +182,7 @@ int main(int argc, char **argv) return 1; } - pb_opts.sample_cb = print_bpf_output; - pb = perf_buffer__new(map_fd, 8, &pb_opts); + pb = perf_buffer__new(map_fd, 8, print_bpf_output, NULL, NULL, NULL); err = libbpf_get_error(pb); if (err) { perror("perf_buffer setup failed"); diff --git a/samples/bpf/xdpsock_ctrl_proc.c b/samples/bpf/xdpsock_ctrl_proc.c index 384e62e3c6d6..cc4408797ab7 100644 --- a/samples/bpf/xdpsock_ctrl_proc.c +++ b/samples/bpf/xdpsock_ctrl_proc.c @@ -15,6 +15,9 @@ #include #include "xdpsock.h" +/* libbpf APIs for AF_XDP are deprecated starting from v0.7 */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + static const char *opt_if = ""; static struct option long_options[] = { diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 49d7a6ad7e39..616d663d55aa 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -36,6 +36,9 @@ #include #include "xdpsock.h" +/* libbpf APIs for AF_XDP are deprecated starting from v0.7 */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + #ifndef SOL_XDP #define SOL_XDP 283 #endif diff --git 
a/samples/bpf/xsk_fwd.c b/samples/bpf/xsk_fwd.c index 1cd97c84c337..52e7c4ffd228 100644 --- a/samples/bpf/xsk_fwd.c +++ b/samples/bpf/xsk_fwd.c @@ -27,6 +27,9 @@ #include #include +/* libbpf APIs for AF_XDP are deprecated starting from v0.7 */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) typedef __u64 u64; From c93faaaf2f67ba5396840316651cdc7640d9fa9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 1 Dec 2021 15:28:24 -0800 Subject: [PATCH 086/115] libbpf: Deprecate bpf_prog_load_xattr() API bpf_prog_load_xattr() is a high-level API that's named like a low-level BPF_PROG_LOAD wrapper API, but it actually operates on struct bpf_object. It's badly and confusingly misnamed as it will load all the progs inside bpf_object, returning prog_fd of the very first BPF program. It also has a bunch of ad-hoc things like log_level override, map_ifindex auto-setting, etc. All this can be expressed more explicitly and cleanly through existing libbpf APIs. This patch marks bpf_prog_load_xattr() for deprecation in libbpf v0.8 ([0]).
[0] Closes: https://github.com/libbpf/libbpf/issues/308 Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211201232824.3166325-10-andrii@kernel.org --- tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf_common.h | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 148fa85bab33..c0d62dd37c5d 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -682,6 +682,7 @@ struct bpf_prog_load_attr { int prog_flags; }; +LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open() and bpf_object__load() instead") LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, struct bpf_object **pobj, int *prog_fd); LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open() and bpf_object__load() instead") diff --git a/tools/lib/bpf/libbpf_common.h b/tools/lib/bpf/libbpf_common.h index b21cefc9c3b6..000e37798ff2 100644 --- a/tools/lib/bpf/libbpf_common.h +++ b/tools/lib/bpf/libbpf_common.h @@ -40,6 +40,11 @@ #else #define __LIBBPF_MARK_DEPRECATED_0_7(X) #endif +#if __LIBBPF_CURRENT_VERSION_GEQ(0, 8) +#define __LIBBPF_MARK_DEPRECATED_0_8(X) X +#else +#define __LIBBPF_MARK_DEPRECATED_0_8(X) +#endif /* This set of internal macros allows to do "function overloading" based on * number of arguments provided by used in backwards-compatible way during the From 0bf40542c05ef62997738cd45eea553415adb045 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Dec 2021 16:46:40 -0800 Subject: [PATCH 087/115] perf: Mute libbpf API deprecations temporarily Libbpf development version was bumped to 0.7 in c93faaaf2f67 ("libbpf: Deprecate bpf_prog_load_xattr() API"), activating a bunch of previously scheduled deprecations. Most APIs are pretty straightforward to replace with newer APIs, but perf has a complicated mixed setup with libbpf used both as static and shared configurations, which makes it non-trivial to migrate the APIs. 
Further, bpf_program__set_prep() needs more involved refactoring, which will require help from Arnaldo and/or Jiri. So for now, mute deprecation warnings and work on migrating perf off of deprecated APIs separately with the input from owners of the perf tool. Fixes: c93faaaf2f67 ("libbpf: Deprecate bpf_prog_load_xattr() API") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211203004640.2455717-1-andrii@kernel.org --- tools/perf/tests/bpf.c | 4 ++++ tools/perf/util/bpf-loader.c | 3 +++ 2 files changed, 7 insertions(+) diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 2bf146e49ce8..c52bf10f746e 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -312,9 +312,13 @@ static int check_env(void) return err; } +/* temporarily disable libbpf deprecation warnings */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" err = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns, sizeof(insns) / sizeof(insns[0]), license, kver_int, NULL, 0); +#pragma GCC diagnostic pop if (err < 0) { pr_err("Missing basic BPF support, skip this test: %s\n", strerror(errno)); diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index fbb3c4057c30..528aeb0ab79d 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -29,6 +29,9 @@ #include +/* temporarily disable libbpf deprecation warnings */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + static int libbpf_perf_print(enum libbpf_print_level level __attribute__((unused)), const char *fmt, va_list args) { From 78c1f8d0634cc35da613d844eda7c849fc50f643 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 3 Dec 2021 10:28:36 -0800 Subject: [PATCH 088/115] libbpf: Reduce bpf_core_apply_relo_insn() stack usage. Reduce bpf_core_apply_relo_insn() stack usage and bump BPF_CORE_SPEC_MAX_LEN limit back to 64. 
Fixes: 29db4bea1d10 ("bpf: Prepare relo_core.c for kernel duty.") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211203182836.16646-1-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 11 ++++++- tools/lib/bpf/libbpf.c | 4 ++- tools/lib/bpf/relo_core.c | 62 ++++++++++++--------------------------- tools/lib/bpf/relo_core.h | 30 ++++++++++++++++++- 4 files changed, 61 insertions(+), 46 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed4258cb0832..2a902a946f70 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6742,8 +6742,16 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; + struct bpf_core_spec *specs; int err; + /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" + * into arrays of btf_ids of struct fields and array indices. + */ + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + if (!specs) + return -ENOMEM; + if (need_cands) { struct bpf_cand_cache *cc; int i; @@ -6779,8 +6787,9 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, } err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, - relo, relo_idx, ctx->btf, &cands); + relo, relo_idx, ctx->btf, &cands, specs); out: + kfree(specs); if (need_cands) { kfree(cands.cands); mutex_unlock(&cand_cache_mutex); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index de260c94e418..6db0b5e8540e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5515,6 +5515,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct btf *local_btf, struct hashmap *cand_cache) { + struct bpf_core_spec specs_scratch[3] = {}; const void *type_key = u32_as_hash_key(relo->type_id); struct bpf_core_cand_list *cands = NULL; const char *prog_name = prog->name; @@ -5569,7 +5570,8 @@ static int bpf_core_apply_relo(struct bpf_program 
*prog, } } - return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, relo_idx, local_btf, cands); + return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, + relo_idx, local_btf, cands, specs_scratch); } static int diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index d194fb9306ed..32464f0ab4b1 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -68,33 +68,6 @@ enum libbpf_print_level { #include "libbpf_internal.h" #endif -#define BPF_CORE_SPEC_MAX_LEN 32 - -/* represents BPF CO-RE field or array element accessor */ -struct bpf_core_accessor { - __u32 type_id; /* struct/union type or array element type */ - __u32 idx; /* field index or array index */ - const char *name; /* field name or NULL for array accessor */ -}; - -struct bpf_core_spec { - const struct btf *btf; - /* high-level spec: named fields and array indices only */ - struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; - /* original unresolved (no skip_mods_or_typedefs) root type ID */ - __u32 root_type_id; - /* CO-RE relocation kind */ - enum bpf_core_relo_kind relo_kind; - /* high-level spec length */ - int len; - /* raw, low-level spec: 1-to-1 with accessor spec string */ - int raw_spec[BPF_CORE_SPEC_MAX_LEN]; - /* raw spec length */ - int raw_len; - /* field bit offset represented by spec */ - __u32 bit_offset; -}; - static bool is_flex_arr(const struct btf *btf, const struct bpf_core_accessor *acc, const struct btf_array *arr) @@ -1200,9 +1173,12 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - struct bpf_core_cand_list *cands) + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch) { - struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; + struct bpf_core_spec *local_spec = &specs_scratch[0]; + struct bpf_core_spec *cand_spec = &specs_scratch[1]; + struct bpf_core_spec *targ_spec = &specs_scratch[2]; struct 
bpf_core_relo_res cand_res, targ_res; const struct btf_type *local_type; const char *local_name; @@ -1221,7 +1197,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, return -EINVAL; err = bpf_core_parse_spec(prog_name, local_btf, local_id, spec_str, - relo->kind, &local_spec); + relo->kind, local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", prog_name, relo_idx, local_id, btf_kind_str(local_type), @@ -1232,15 +1208,15 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); - bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &local_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { targ_res.validate = true; targ_res.poison = false; - targ_res.orig_val = local_spec.root_type_id; - targ_res.new_val = local_spec.root_type_id; + targ_res.orig_val = local_spec->root_type_id; + targ_res.new_val = local_spec->root_type_id; goto patch_insn; } @@ -1253,38 +1229,38 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, for (i = 0, j = 0; i < cands->len; i++) { - err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, - cands->cands[i].id, &cand_spec); + err = bpf_core_spec_match(local_spec, cands->cands[i].btf, + cands->cands[i].id, cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", prog_name, relo_idx, i); - bpf_core_dump_spec(prog_name, LIBBPF_WARN, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_WARN, cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, relo_idx, err == 0 ? 
"non-matching" : "matching", i); - bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); if (err == 0) continue; - err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res); if (err) return err; if (j == 0) { targ_res = cand_res; - targ_spec = cand_spec; - } else if (cand_spec.bit_offset != targ_spec.bit_offset) { + *targ_spec = *cand_spec; + } else if (cand_spec->bit_offset != targ_spec->bit_offset) { /* if there are many field relo candidates, they * should all resolve to the same bit offset */ pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog_name, relo_idx, cand_spec.bit_offset, - targ_spec.bit_offset); + prog_name, relo_idx, cand_spec->bit_offset, + targ_spec->bit_offset); return -EINVAL; } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { /* all candidates should result in the same relocation @@ -1328,7 +1304,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, prog_name, relo_idx); /* calculate single target relo result explicitly */ - err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, &targ_res); if (err) return err; } diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 4f864b8e33b7..17799819ad7c 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -17,11 +17,39 @@ struct bpf_core_cand_list { int len; }; +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct 
bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, int insn_idx, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - struct bpf_core_cand_list *cands); + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); From da54ab14953c38d98cb3e34c564c06c3739394b2 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 30 Nov 2021 20:18:11 +0200 Subject: [PATCH 089/115] bpf: Fix the test_task_vma selftest to support output shorter than 1 kB The test for bpf_iter_task_vma assumes that the output will be longer than 1 kB, as the comment above the loop says. Due to this assumption, the loop becomes infinite if the output turns out to be shorter than 1 kB. The return value of read_fd_into_buffer is 0 once the end of file is reached, so len is not increased any more. This commit adds a break on EOF to handle short output correctly.
For the reference, this is the contents that I get when running test_progs under vmtest.sh, and it's shorter than 1 kB: 00400000-00401000 r--p 00000000 fe:00 25867 /root/bpf/test_progs 00401000-00674000 r-xp 00001000 fe:00 25867 /root/bpf/test_progs 00674000-0095f000 r--p 00274000 fe:00 25867 /root/bpf/test_progs 0095f000-00983000 r--p 0055e000 fe:00 25867 /root/bpf/test_progs 00983000-00a8a000 rw-p 00582000 fe:00 25867 /root/bpf/test_progs 00a8a000-0484e000 rw-p 00000000 00:00 0 7f6c64000000-7f6c64021000 rw-p 00000000 00:00 0 7f6c64021000-7f6c68000000 ---p 00000000 00:00 0 7f6c6ac8f000-7f6c6ac90000 r--s 00000000 00:0d 8032 anon_inode:bpf-map 7f6c6ac90000-7f6c6ac91000 ---p 00000000 00:00 0 7f6c6ac91000-7f6c6b491000 rw-p 00000000 00:00 0 7f6c6b491000-7f6c6b492000 r--s 00000000 00:0d 8032 anon_inode:bpf-map 7f6c6b492000-7f6c6b493000 rw-s 00000000 00:0d 8032 anon_inode:bpf-map 7ffc1e23d000-7ffc1e25e000 rw-p 00000000 00:00 0 7ffc1e3b8000-7ffc1e3bc000 r--p 00000000 00:00 0 7ffc1e3bc000-7ffc1e3bd000 r-xp 00000000 00:00 0 7fffffffe000-7ffffffff000 --xp 00000000 00:00 0 Fixes: e8168840e16c ("selftests/bpf: Add test for bpf_iter_task_vma") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211130181811.594220-1-maximmi@nvidia.com --- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 0b996be923b5..b84f859b1267 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -1206,13 +1206,14 @@ static void test_task_vma(void) goto out; /* Read CMP_BUFFER_SIZE (1kB) from bpf_iter. Read in small chunks - * to trigger seq_file corner cases. The expected output is much - * longer than 1kB, so the while loop will terminate. + * to trigger seq_file corner cases. 
*/ len = 0; while (len < CMP_BUFFER_SIZE) { err = read_fd_into_buffer(iter_fd, task_vma_output + len, min(read_size, CMP_BUFFER_SIZE - len)); + if (!err) + break; if (CHECK(err < 0, "read_iter_fd", "read_iter_fd failed\n")) goto out; len += err; From 866de407444398bc8140ea70de1dba5f91cc34ac Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 3 Dec 2021 13:30:01 +0800 Subject: [PATCH 090/115] bpf: Disallow BPF_LOG_KERNEL log level for bpf(BPF_BTF_LOAD) BPF_LOG_KERNEL is only used internally, so disallow bpf_btf_load() to set log level as BPF_LOG_KERNEL. The same checking has already been done in bpf_check(), so factor out a helper to check the validity of log attributes and use it in both places. Fixes: 8580ac9404f6 ("bpf: Process in-kernel BTF") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211203053001.740945-1-houtao1@huawei.com --- include/linux/bpf_verifier.h | 7 +++++++ kernel/bpf/btf.c | 3 +-- kernel/bpf/verifier.c | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c8a78e830fca..182b16a91084 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -396,6 +396,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } +static inline bool +bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ + return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && + log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); +} + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2a902a946f70..36a5cc0f53c6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4473,8 +4473,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, log->len_total = log_size; /* log attributes have to be sane */ - if (log->len_total < 128 
|| log->len_total > UINT_MAX >> 2 || - !log->level || !log->ubuf) { + if (!bpf_verifier_log_attr_valid(log)) { err = -EINVAL; goto errout; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6522ffdea487..1126b75fe650 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14050,11 +14050,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || - !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) + if (!bpf_verifier_log_attr_valid(log)) { + ret = -EINVAL; goto err_unlock; + } } if (IS_ERR(btf_vmlinux)) { From 942df4dc5ea159100466f198d8687a49c2359ca3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 4 Dec 2021 11:46:23 -0800 Subject: [PATCH 091/115] bpftool: Add debug mode for gen_loader. Make -d flag functional for gen_loader style program loading. For example: $ bpftool prog load -L -d test_d_path.o ... // will print: libbpf: loading ./test_d_path.o libbpf: elf: section(3) fentry/security_inode_getattr, size 280, link 0, flags 6, type=1 ... libbpf: prog 'prog_close': found data map 0 (test_d_p.bss, sec 7, off 0) for insn 30 libbpf: gen: load_btf: size 5376 libbpf: gen: map_create: test_d_p.bss idx 0 type 2 value_type_id 118 libbpf: map 'test_d_p.bss': created successfully, fd=0 libbpf: gen: map_update_elem: idx 0 libbpf: sec 'fentry/filp_close': found 1 CO-RE relocations libbpf: record_relo_core: prog 1 insn[15] struct file 0:1 final insn_idx 15 libbpf: gen: prog_load: type 26 insns_cnt 35 progi_idx 0 libbpf: gen: find_attach_tgt security_inode_getattr 12 libbpf: gen: prog_load: type 26 insns_cnt 37 progi_idx 1 libbpf: gen: find_attach_tgt filp_close 12 libbpf: gen: finish 0 ... 
// at this point libbpf finished generating loader program 0: (bf) r6 = r1 1: (bf) r1 = r10 2: (07) r1 += -136 3: (b7) r2 = 136 4: (b7) r3 = 0 5: (85) call bpf_probe_read_kernel#113 6: (05) goto pc+104 ... // this is the assembly dump of the loader program 390: (63) *(u32 *)(r6 +44) = r0 391: (18) r1 = map[idx:0]+5584 393: (61) r0 = *(u32 *)(r1 +0) 394: (63) *(u32 *)(r6 +24) = r0 395: (b7) r0 = 0 396: (95) exit err 0 // the loader program was loaded and executed successfully (null) func#0 @0 ... // CO-RE in the kernel logs: CO-RE relocating STRUCT file: found target candidate [500] prog '': relo #0: kind (0), spec is [8] STRUCT file.f_path (0:1 @ offset 16) prog '': relo #0: matching candidate #0 [500] STRUCT file.f_path (0:1 @ offset 16) prog '': relo #0: patched insn #15 (ALU/ALU64) imm 16 -> 16 vmlinux_cand_cache:[11]file(500), module_cand_cache: ... // verifier logs when it was checking test_d_path.o program: R1 type=ctx expected=fp 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 ; int BPF_PROG(prog_close, struct file *file, void *id) 0: (79) r6 = *(u64 *)(r1 +0) func 'filp_close' arg0 has btf_id 500 type STRUCT 'file' 1: R1=ctx(id=0,off=0,imm=0) R6_w=ptr_file(id=0,off=0,imm=0) R10=fp0 ; pid_t pid = bpf_get_current_pid_tgid() >> 32; 1: (85) call bpf_get_current_pid_tgid#14 ... // if there are multiple programs being loaded by the loader program ... // only the last program in the elf file will be printed, since ... // the same verifier log_buf is used for all PROG_LOAD commands. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211204194623.27779-1-alexei.starovoitov@gmail.com --- tools/bpf/bpftool/prog.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index e47e8b06cc3d..45ccc254e69f 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1774,17 +1774,19 @@ static int try_loader(struct gen_loader_opts *gen) sizeof(struct bpf_prog_desc)); int log_buf_sz = (1u << 24) - 1; int err, fds_before, fd_delta; - char *log_buf; + char *log_buf = NULL; ctx = alloca(ctx_sz); memset(ctx, 0, ctx_sz); ctx->sz = ctx_sz; - ctx->log_level = 1; - ctx->log_size = log_buf_sz; - log_buf = malloc(log_buf_sz); - if (!log_buf) - return -ENOMEM; - ctx->log_buf = (long) log_buf; + if (verifier_logs) { + ctx->log_level = 1 + 2 + 4; + ctx->log_size = log_buf_sz; + log_buf = malloc(log_buf_sz); + if (!log_buf) + return -ENOMEM; + ctx->log_buf = (long) log_buf; + } opts.ctx = ctx; opts.data = gen->data; opts.data_sz = gen->data_sz; @@ -1793,9 +1795,9 @@ static int try_loader(struct gen_loader_opts *gen) fds_before = count_open_fds(); err = bpf_load_and_run(&opts); fd_delta = count_open_fds() - fds_before; - if (err < 0) { + if (err < 0 || verifier_logs) { fprintf(stderr, "err %d\n%s\n%s", err, opts.errstr, log_buf); - if (fd_delta) + if (fd_delta && err < 0) fprintf(stderr, "loader prog leaked %d FDs\n", fd_delta); } From db52f57211b4e45f0ebb274e2c877b211dc18591 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 6 Dec 2021 13:03:15 +0530 Subject: [PATCH 092/115] bpf: Remove config check to enable bpf support for branch records Branch data available to BPF programs can be very useful to get stack traces out of userspace application. Commit fff7b64355ea ("bpf: Add bpf_read_branch_records() helper") added BPF support to capture branch records in x86. 
Enable this feature also for other architectures as well by removing checks specific to x86. If an architecture doesn't support branch records, bpf_read_branch_records() still has appropriate checks and it will return an -EINVAL in that scenario. Based on UAPI helper doc in include/uapi/linux/bpf.h, unsupported architectures should return -ENOENT in such case. Hence, update the appropriate check to return -ENOENT instead. Selftest 'perf_branches' result on power9 machine which has the branch stacks support: - Before this patch: [command]# ./test_progs -t perf_branches #88/1 perf_branches/perf_branches_hw:FAIL #88/2 perf_branches/perf_branches_no_hw:OK #88 perf_branches:FAIL Summary: 0/1 PASSED, 0 SKIPPED, 1 FAILED - After this patch: [command]# ./test_progs -t perf_branches #88/1 perf_branches/perf_branches_hw:OK #88/2 perf_branches/perf_branches_no_hw:OK #88 perf_branches:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Selftest 'perf_branches' result on power9 machine which doesn't have branch stack report: - After this patch: [command]# ./test_progs -t perf_branches #88/1 perf_branches/perf_branches_hw:SKIP #88/2 perf_branches/perf_branches_no_hw:OK #88 perf_branches:OK Summary: 1/1 PASSED, 1 SKIPPED, 0 FAILED Fixes: fff7b64355eac ("bpf: Add bpf_read_branch_records() helper") Suggested-by: Peter Zijlstra Signed-off-by: Kajol Jain Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211206073315.77432-1-kjain@linux.ibm.com --- kernel/trace/bpf_trace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 25ea521fb8f1..77f13de6f9f9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1404,9 +1404,6 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = 
sizeof(struct perf_branch_entry); struct perf_branch_stack *br_stack = ctx->data->br_stack; u32 to_copy; @@ -1415,7 +1412,7 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, return -EINVAL; if (unlikely(!br_stack)) - return -EINVAL; + return -ENOENT; if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) return br_stack->nr * br_entry_size; @@ -1427,7 +1424,6 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, memcpy(buf, br_stack->entries, to_copy); return to_copy; -#endif } static const struct bpf_func_proto bpf_read_branch_records_proto = { From 222c98c7979084fbefb4ce2ae377210c6e42011e Mon Sep 17 00:00:00 2001 From: huangxuesen Date: Mon, 6 Dec 2021 09:47:16 +0800 Subject: [PATCH 093/115] libbpf: Fix trivial typo Fix typo in comment from 'bpf_skeleton_map' to 'bpf_map_skeleton' and from 'bpf_skeleton_prog' to 'bpf_prog_skeleton'. Signed-off-by: huangxuesen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1638755236-3851199-1-git-send-email-hxseverything@gmail.com --- tools/lib/bpf/libbpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c0d62dd37c5d..2fa046a96142 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1038,11 +1038,11 @@ struct bpf_object_skeleton { struct bpf_object **obj; int map_cnt; - int map_skel_sz; /* sizeof(struct bpf_skeleton_map) */ + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ struct bpf_map_skeleton *maps; int prog_cnt; - int prog_skel_sz; /* sizeof(struct bpf_skeleton_prog) */ + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ struct bpf_prog_skeleton *progs; }; From d5284dedccdb9053988278dd30c834d46b8c866d Mon Sep 17 00:00:00 2001 From: Grant Seltzer Date: Mon, 6 Dec 2021 15:37:09 -0500 Subject: [PATCH 094/115] libbpf: Add doc comments in libbpf.h This adds comments above functions in libbpf.h which document their uses. 
These comments are of a format that doxygen and sphinx can pick up and render. These are rendered by libbpf.readthedocs.org These doc comments are for: - bpf_object__open_file() - bpf_object__open_mem() - bpf_program__attach_uprobe() - bpf_program__attach_uprobe_opts() Signed-off-by: Grant Seltzer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211206203709.332530-1-grantseltzer@gmail.com --- tools/lib/bpf/libbpf.h | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 2fa046a96142..4802c1e736c3 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -112,8 +112,30 @@ struct bpf_object_open_opts { #define bpf_object_open_opts__last_field btf_custom_path LIBBPF_API struct bpf_object *bpf_object__open(const char *path); + +/** + * @brief **bpf_object__open_file()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path + * @param opts options for how to load the bpf object, this parameter is + * optional and can be set to NULL + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ LIBBPF_API struct bpf_object * bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); + +/** + * @brief **bpf_object__open_mem()** creates a bpf_object by reading + * the BPF objects raw bytes from a memory buffer containing a valid + * BPF ELF object file. 
+ * @param obj_buf pointer to the buffer containing ELF file bytes + * @param obj_buf_sz number of bytes in the buffer + * @param opts options for how to load the bpf object + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); @@ -348,10 +370,41 @@ struct bpf_uprobe_opts { }; #define bpf_uprobe_opts__last_field retprobe +/** + * @brief **bpf_program__attach_uprobe()** attaches a BPF program + * to the userspace function which is found by binary path and + * offset. You can optionally specify a particular process to attach + * to. You can also optionally attach the program to the function + * exit instead of entry. + * + * @param prog BPF program to attach + * @param retprobe Attach to function exit + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ LIBBPF_API struct bpf_link * bpf_program__attach_uprobe(const struct bpf_program *prog, bool retprobe, pid_t pid, const char *binary_path, size_t func_offset); + +/** + * @brief **bpf_program__attach_uprobe_opts()** is just like + * bpf_program__attach_uprobe() except with an options struct + * for various configurations.
+ * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ LIBBPF_API struct bpf_link * bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, const char *binary_path, size_t func_offset, From 29f2e5bd9439445fe14ba8570b1c9a7ad682df84 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 6 Dec 2021 17:48:39 -0800 Subject: [PATCH 095/115] bpf: Silence purge_cand_cache build warning. When CONFIG_DEBUG_INFO_BTF_MODULES is not set the following warning can be seen: kernel/bpf/btf.c:6588:13: warning: 'purge_cand_cache' defined but not used [-Wunused-function] Fix it. Fixes: 1e89106da253 ("bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn().") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211207014839.6976-1-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 36a5cc0f53c6..01b47d4df3ab 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6560,6 +6560,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, return new_cands; } +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, int cache_size) { @@ -6598,6 +6599,7 @@ static void purge_cand_cache(struct btf *btf) __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); mutex_unlock(&cand_cache_mutex); } +#endif static struct bpf_cand_cache * bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, From 
e64fbcaa7a666f16329b1c67af15ea501bc84586 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Dec 2021 20:50:03 +0100 Subject: [PATCH 096/115] samples: bpf: Fix xdp_sample_user.o linking with Clang Clang (13) doesn't get the jokes about specifying libraries to link in ccflags of individual .o objects: clang-13: warning: -lm: 'linker' input unused [-Wunused-command-line-argument] [ ... ] LD samples/bpf/xdp_redirect_cpu LD samples/bpf/xdp_redirect_map_multi LD samples/bpf/xdp_redirect_map LD samples/bpf/xdp_redirect LD samples/bpf/xdp_monitor /usr/bin/ld: samples/bpf/xdp_sample_user.o: in function `sample_summary_print': xdp_sample_user.c:(.text+0x84c): undefined reference to `floor' /usr/bin/ld: xdp_sample_user.c:(.text+0x870): undefined reference to `ceil' /usr/bin/ld: xdp_sample_user.c:(.text+0x8cf): undefined reference to `floor' /usr/bin/ld: xdp_sample_user.c:(.text+0x8f3): undefined reference to `ceil' [ more ] Specify '-lm' as ldflags for all xdp_sample_user.o users in the main Makefile and remove it from ccflags of ^ in Makefile.target -- just like it's done for all other samples. This works with all compilers.
Fixes: 6e1051a54e31 ("samples: bpf: Convert xdp_monitor to XDP samples helper") Fixes: b926c55d856c ("samples: bpf: Convert xdp_redirect to XDP samples helper") Fixes: e531a220cc59 ("samples: bpf: Convert xdp_redirect_cpu to XDP samples helper") Fixes: bbe65865aa05 ("samples: bpf: Convert xdp_redirect_map to XDP samples helper") Fixes: 594a116b2aa1 ("samples: bpf: Convert xdp_redirect_map_multi to XDP samples helper") Signed-off-by: Alexander Lobakin Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20211203195004.5803-2-alexandr.lobakin@intel.com --- samples/bpf/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 6ae62b1dc938..38638845db9d 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -215,6 +215,11 @@ TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib endif TPROGS_LDLIBS += $(LIBBPF) -lelf -lz +TPROGLDLIBS_xdp_monitor += -lm +TPROGLDLIBS_xdp_redirect += -lm +TPROGLDLIBS_xdp_redirect_cpu += -lm +TPROGLDLIBS_xdp_redirect_map += -lm +TPROGLDLIBS_xdp_redirect_map_multi += -lm TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt @@ -345,7 +350,7 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # Override includes for xdp_sample_user.o because $(srctree)/usr/include in # TPROGS_CFLAGS causes conflicts -XDP_SAMPLE_CFLAGS += -Wall -O2 -lm \ +XDP_SAMPLE_CFLAGS += -Wall -O2 \ -I$(src)/../../tools/include \ -I$(src)/../../tools/include/uapi \ -I$(LIBBPF_INCLUDE) \ From 6f670d06e47c774bc065aaa84a527a4838f34bd8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Dec 2021 20:50:04 +0100 Subject: [PATCH 097/115] samples: bpf: Fix 'unknown warning group' build warning on Clang Clang doesn't have 'stringop-truncation' group like GCC does, and complains about it when building samples which use xdp_sample_user infra: samples/bpf/xdp_sample_user.h:48:32: warning: unknown warning group 
'-Wstringop-truncation', ignored [-Wunknown-warning-option] #pragma GCC diagnostic ignored "-Wstringop-truncation" ^ [ repeat ] Those are harmless, but avoidable when guarding it with ifdef. I could guard push/pop as well, but this would require one more ifdef cruft around a single line which I don't think is reasonable. Fixes: 156f886cf697 ("samples: bpf: Add basic infrastructure for XDP samples") Signed-off-by: Alexander Lobakin Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20211203195004.5803-3-alexandr.lobakin@intel.com --- samples/bpf/xdp_sample_user.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/samples/bpf/xdp_sample_user.h b/samples/bpf/xdp_sample_user.h index d97465ff8c62..5f44b877ecf5 100644 --- a/samples/bpf/xdp_sample_user.h +++ b/samples/bpf/xdp_sample_user.h @@ -45,7 +45,9 @@ const char *get_driver_name(int ifindex); int get_mac_addr(int ifindex, void *mac_addr); #pragma GCC diagnostic push +#ifndef __clang__ #pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif __attribute__((unused)) static inline char *safe_strncpy(char *dst, const char *src, size_t size) { From 8d0f9e73efe7f0ff4b1d0d013044e8ef91782689 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 7 Dec 2021 15:23:40 -0800 Subject: [PATCH 098/115] perf/bpf_counter: Use bpf_map_create instead of bpf_create_map bpf_create_map is deprecated. Replace it with bpf_map_create. Also add a __weak bpf_map_create() so that when older version of libbpf is linked as a shared library, it falls back to bpf_create_map(). 
Fixes: 992c4225419a ("libbpf: Unify low-level map creation APIs w/ new bpf_map_create()") Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211207232340.2561471-1-song@kernel.org --- tools/perf/util/bpf_counter.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index c17d4a43ce06..5a97fd7d0a71 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -307,6 +307,20 @@ static bool bperf_attr_map_compatible(int attr_map_fd) (map_info.value_size == sizeof(struct perf_event_attr_map_entry)); } +int __weak +bpf_map_create(enum bpf_map_type map_type, + const char *map_name __maybe_unused, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts __maybe_unused) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + return bpf_create_map(map_type, key_size, value_size, max_entries, 0); +#pragma GCC diagnostic pop +} + static int bperf_lock_attr_map(struct target *target) { char path[PATH_MAX]; @@ -320,10 +334,10 @@ static int bperf_lock_attr_map(struct target *target) } if (access(path, F_OK)) { - map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(struct perf_event_attr), sizeof(struct perf_event_attr_map_entry), - ATTR_MAP_SIZE, 0); + ATTR_MAP_SIZE, NULL); if (map_fd < 0) return -1; From b540358e6c4d86eb450f3539aea198653e656641 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 8 Dec 2021 21:04:03 -0800 Subject: [PATCH 099/115] selftests/bpf: Fix a compilation warning The following warning is triggered when I used clang compiler to build the selftest. 
/.../prog_tests/btf_dedup_split.c:368:6: warning: variable 'btf2' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (!ASSERT_OK(err, "btf_dedup")) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ /.../prog_tests/btf_dedup_split.c:424:12: note: uninitialized use occurs here btf__free(btf2); ^~~~ /.../prog_tests/btf_dedup_split.c:368:2: note: remove the 'if' if its condition is always false if (!ASSERT_OK(err, "btf_dedup")) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /.../prog_tests/btf_dedup_split.c:343:25: note: initialize the variable 'btf2' to silence this warning struct btf *btf1, *btf2; ^ = NULL Initialize local variable btf2 = NULL and the warning is gone. Fixes: 9a49afe6f5a5 ("selftests/bpf: Add btf_dedup case with duplicated structs within CU") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211209050403.1770836-1-yhs@fb.com --- tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c index 878a864dae3b..90aac437576d 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -340,7 +340,7 @@ static void btf_add_dup_struct_in_cu(struct btf *btf, int start_id) static void test_split_dup_struct_in_cu() { - struct btf *btf1, *btf2; + struct btf *btf1, *btf2 = NULL; int err; /* generate the base data.. */ From 73b6eae583f44e278e19489a411f9c1e22d530fc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Dec 2021 22:47:18 +0000 Subject: [PATCH 100/115] bpf: Remove redundant assignment to pointer t The pointer t is being initialized with a value that is never read. The pointer is re-assigned a value a little later on, hence the initialization is redundant and can be removed.
Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211207224718.59593-1-colin.i.king@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 01b47d4df3ab..27b7de538697 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -837,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show) const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; - const struct btf_type *t = show->state.type; + const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; From ac55b3f00c323cf09d59a191e14bcf39b691078c Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 9 Dec 2021 08:00:51 +0000 Subject: [PATCH 101/115] samples/bpf: Remove unneeded variable Return value directly instead of taking this in another redundant variable. 
Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211209080051.421844-1-chi.minghao@zte.com.cn --- samples/bpf/xdp_redirect_cpu.bpf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/bpf/xdp_redirect_cpu.bpf.c b/samples/bpf/xdp_redirect_cpu.bpf.c index f10fe3cf25f6..25e3a405375f 100644 --- a/samples/bpf/xdp_redirect_cpu.bpf.c +++ b/samples/bpf/xdp_redirect_cpu.bpf.c @@ -100,7 +100,6 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) void *data = (void *)(long)ctx->data; struct iphdr *iph = data + nh_off; struct udphdr *udph; - u16 dport; if (iph + 1 > data_end) return 0; @@ -111,8 +110,7 @@ u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) if (udph + 1 > data_end) return 0; - dport = bpf_ntohs(udph->dest); - return dport; + return bpf_ntohs(udph->dest); } static __always_inline From 4cf23a3c6359556a1cca489cf2b901e2b904c4b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:29 -0800 Subject: [PATCH 102/115] libbpf: Fix bpf_prog_load() log_buf logic for log_level 0 To unify libbpf APIs behavior w.r.t. log_buf and log_level, fix bpf_prog_load() to follow the same logic as bpf_btf_load() and high-level bpf_object__load() API will follow in the subsequent patches: - if log_level is 0 and non-NULL log_buf is provided by a user, attempt load operation initially with no log_buf and log_level set; - if successful, we are done, return new FD; - on error, retry the load operation with log_level bumped to 1 and log_buf set; this way verbose logging will be requested only when we are sure that there is a failure, but will be fast in the common/expected success case. Of course, user can still specify log_level > 0 from the very beginning to force log collection. 
Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-2-andrii@kernel.org --- tools/lib/bpf/bpf.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 4e7836e1a7b5..3dc86342f0a0 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -303,10 +303,6 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, if (log_level && !log_buf) return libbpf_err(-EINVAL); - attr.log_level = log_level; - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_size; - func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0); func_info = OPTS_GET(opts, func_info, NULL); attr.func_info_rec_size = func_info_rec_size; @@ -321,6 +317,12 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL)); + if (log_level) { + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = log_level; + } + fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); if (fd >= 0) return fd; @@ -366,16 +368,17 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, goto done; } - if (log_level || !log_buf) - goto done; + if (log_level == 0 && log_buf) { + /* log_level == 0 with non-NULL log_buf requires retrying on error + * with log_level == 1 and log_buf/log_buf_size set, to get details of + * failure + */ + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = 1; - /* Try again with log */ - log_buf[0] = 0; - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_size; - attr.log_level = 1; - - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + } done: /* free() doesn't affect errno, so we don't need to restore it */ free(finfo); From 0ed08d6725b5116aaad7a0082d721286e0a43dca Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: 
Thu, 9 Dec 2021 11:38:30 -0800 Subject: [PATCH 103/115] libbpf: Add OPTS-based bpf_btf_load() API Similar to previous bpf_prog_load() and bpf_map_create() APIs, add bpf_btf_load() API which is taking optional OPTS struct. Schedule bpf_load_btf() for deprecation in v0.8 ([0]). This makes naming consistent with BPF_BTF_LOAD command, sets up an API for extensibility in the future, moves options parameters (log-related fields) into optional options, and also allows to pass log_level directly. It also removes log buffer auto-allocation logic from low-level API (consistent with bpf_prog_load() behavior), but preserves a special treatment of log_level == 0 with non-NULL log_buf, which matches low-level bpf_prog_load() and high-level libbpf APIs for BTF and program loading behaviors. [0] Closes: https://github.com/libbpf/libbpf/issues/419 Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-3-andrii@kernel.org --- tools/lib/bpf/bpf.c | 59 +++++++++++++++++++++++++++++------ tools/lib/bpf/bpf.h | 19 +++++++++-- tools/lib/bpf/libbpf.map | 1 + tools/lib/bpf/libbpf_probes.c | 2 +- 4 files changed, 69 insertions(+), 12 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3dc86342f0a0..6b2407e12060 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -1047,24 +1047,65 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) return libbpf_err_errno(fd); } -int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, - bool do_log) +int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts) { - union bpf_attr attr = {}; + const size_t attr_sz = offsetofend(union bpf_attr, btf_log_level); + union bpf_attr attr; + char *log_buf; + size_t log_size; + __u32 log_level; int fd; - attr.btf = ptr_to_u64(btf); + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_btf_load_opts)) + return libbpf_err(-EINVAL); + + log_buf = 
OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (log_size > UINT_MAX) + return libbpf_err(-EINVAL); + if (log_size && !log_buf) + return libbpf_err(-EINVAL); + + attr.btf = ptr_to_u64(btf_data); attr.btf_size = btf_size; + /* log_level == 0 and log_buf != NULL means "try loading without + * log_buf, but retry with log_buf and log_level=1 on error", which is + * consistent across low-level and high-level BTF and program loading + * APIs within libbpf and provides a sensible behavior in practice + */ + if (log_level) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = log_level; + } + + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + if (fd < 0 && log_buf && log_level == 0) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = 1; + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + } + return libbpf_err_errno(fd); +} + +int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) +{ + LIBBPF_OPTS(bpf_btf_load_opts, opts); + int fd; retry: if (do_log && log_buf && log_buf_size) { - attr.btf_log_level = 1; - attr.btf_log_size = log_buf_size; - attr.btf_log_buf = ptr_to_u64(log_buf); + opts.log_buf = log_buf; + opts.log_size = log_buf_size; + opts.log_level = 1; } - fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, sizeof(attr)); - + fd = bpf_btf_load(btf, btf_size, &opts); if (fd < 0 && !do_log && log_buf && log_buf_size) { do_log = true; goto retry; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index f79e5fbcf1c1..5f7d9636643d 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -213,6 +213,23 @@ LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, char *log_buf, size_t log_buf_sz, int log_level); +struct bpf_btf_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* kernel log options */ + char *log_buf; 
+ __u32 log_level; + __u32 log_size; +}; +#define bpf_btf_load_opts__last_field log_size + +LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, + const struct bpf_btf_load_opts *opts); + +LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_btf_load() instead") +LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, + __u32 log_buf_size, bool do_log); + LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); @@ -340,8 +357,6 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); -LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, - __u32 log_buf_size, bool do_log); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 715df3a27389..08cdfe840436 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -422,6 +422,7 @@ LIBBPF_0.6.0 { LIBBPF_0.7.0 { global: + bpf_btf_load; bpf_program__log_level; bpf_program__set_log_level; }; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 41f2be47c2ea..4bdec69523a7 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -164,7 +164,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len); memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len); - btf_fd = bpf_load_btf(raw_btf, btf_len, NULL, 0, false); + btf_fd = bpf_btf_load(raw_btf, btf_len, NULL); free(raw_btf); return btf_fd; From 1a190d1e8eb9ff84354e38f7482dc77b626f3cc9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:31 -0800 Subject: [PATCH 104/115] libbpf: Allow passing 
preallocated log_buf when loading BTF into kernel Add libbpf-internal btf_load_into_kernel() that allows a preallocated log_buf and custom log_level to be passed into kernel during BPF_BTF_LOAD call. When custom log_buf is provided, btf_load_into_kernel() won't attempt a retry with an automatically allocated internal temporary buffer to capture BTF validation log. It's important to note the relation between log_buf and log_level, which slightly deviates from stricter kernel logic. From kernel's POV, if log_buf is specified, log_level has to be > 0, and vice versa. While kernel has good reasons to request such "sanity", this, in practice, is a bit inconvenient and restrictive for libbpf's high-level bpf_object APIs. So libbpf will allow setting non-NULL log_buf and log_level == 0. This is fine and means to attempt to load BTF without logging requested, but if it fails, retry the load with custom log_buf and log_level 1. Similar logic will be implemented for program loading. In practice this means that users can provide custom log buffer just in case error happens, but not really request slower verbose logging all the time. This is also consistent with libbpf behavior when custom log_buf is not set: libbpf first tries to load everything with log_level=0, and only if error happens allocates internal log buffer and retries with log_level=1. Also, while at it, make BTF validation log more obvious and follow the log pattern libbpf is using for dumping BPF verifier log during BPF_PROG_LOAD. BTF loading resulting in an error will look like this: libbpf: BTF loading error: -22 libbpf: -- BEGIN BTF LOAD LOG --- magic: 0xeb9f version: 1 flags: 0x0 hdr_len: 24 type_off: 0 type_len: 1040 str_off: 1040 str_len: 2063598257 btf_total_size: 1753 Total section length too long -- END BTF LOAD LOG -- libbpf: Error loading .BTF into kernel: -22. BTF is optional, ignoring. This makes it much easier to find relevant parts in libbpf log output. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-4-andrii@kernel.org --- tools/lib/bpf/btf.c | 82 +++++++++++++++++++++++---------- tools/lib/bpf/libbpf_internal.h | 1 + 2 files changed, 58 insertions(+), 25 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 0d7b16eab569..e171424192ae 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1124,54 +1124,86 @@ struct btf *btf__parse_split(const char *path, struct btf *base_btf) static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); -int btf__load_into_kernel(struct btf *btf) +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level) { - __u32 log_buf_size = 0, raw_size; - char *log_buf = NULL; + LIBBPF_OPTS(bpf_btf_load_opts, opts); + __u32 buf_sz = 0, raw_size; + char *buf = NULL, *tmp; void *raw_data; int err = 0; if (btf->fd >= 0) return libbpf_err(-EEXIST); + if (log_sz && !log_buf) + return libbpf_err(-EINVAL); -retry_load: - if (log_buf_size) { - log_buf = malloc(log_buf_size); - if (!log_buf) - return libbpf_err(-ENOMEM); - - *log_buf = 0; - } - + /* cache native raw data representation */ raw_data = btf_get_raw_data(btf, &raw_size, false); if (!raw_data) { err = -ENOMEM; goto done; } - /* cache native raw data representation */ btf->raw_size = raw_size; btf->raw_data = raw_data; - btf->fd = bpf_load_btf(raw_data, raw_size, log_buf, log_buf_size, false); - if (btf->fd < 0) { - if (!log_buf || errno == ENOSPC) { - log_buf_size = max((__u32)BPF_LOG_BUF_SIZE, - log_buf_size << 1); - free(log_buf); - goto retry_load; +retry_load: + /* if log_level is 0, we won't provide log_buf/log_size to the kernel, + * initially. Only if BTF loading fails, we bump log_level to 1 and + * retry, using either auto-allocated or custom log_buf. This way + * non-NULL custom log_buf provides a buffer just in case, but hopes + * for successful load and no need for log_buf. 
+ */ + if (log_level) { + /* if caller didn't provide custom log_buf, we'll keep + * allocating our own progressively bigger buffers for BTF + * verification log + */ + if (!log_buf) { + buf_sz = max((__u32)BPF_LOG_BUF_SIZE, buf_sz * 2); + tmp = realloc(buf, buf_sz); + if (!tmp) { + err = -ENOMEM; + goto done; + } + buf = tmp; + buf[0] = '\0'; } + opts.log_buf = log_buf ? log_buf : buf; + opts.log_size = log_buf ? log_sz : buf_sz; + opts.log_level = log_level; + } + + btf->fd = bpf_btf_load(raw_data, raw_size, &opts); + if (btf->fd < 0) { + /* time to turn on verbose mode and try again */ + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* only retry if caller didn't provide custom log_buf, but + * make sure we can never overflow buf_sz + */ + if (!log_buf && errno == ENOSPC && buf_sz <= UINT_MAX / 2) + goto retry_load; + err = -errno; - pr_warn("Error loading BTF: %s(%d)\n", strerror(errno), errno); - if (*log_buf) - pr_warn("%s\n", log_buf); - goto done; + pr_warn("BTF loading error: %d\n", err); + /* don't print out contents of custom log_buf */ + if (!log_buf && buf[0]) + pr_warn("-- BEGIN BTF LOAD LOG ---\n%s\n-- END BTF LOAD LOG --\n", buf); } done: - free(log_buf); + free(buf); return libbpf_err(err); } + +int btf__load_into_kernel(struct btf *btf) +{ + return btf_load_into_kernel(btf, NULL, 0, 0); +} + int btf__load(struct btf *) __attribute__((alias("btf__load_into_kernel"))); int btf__fd(const struct btf *btf) diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 6f143e9e810c..355c41019aed 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -277,6 +277,7 @@ int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, const char *str_sec, size_t str_len); +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 
log_level); struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, From e0e3ea888c69b4ea17133b8ac8dfd5066a759b5a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:32 -0800 Subject: [PATCH 105/115] libbpf: Allow passing user log setting through bpf_object_open_opts Allow users to provide their own custom log_buf, log_size, and log_level at bpf_object level through bpf_object_open_opts. This log_buf will be used during BTF loading. Subsequent patch will use same log_buf during BPF program loading, unless overridden at per-bpf_program level. When such custom log_buf is provided, libbpf won't retry loading of BTF to try to provide its own log buffer to capture kernel's error log output. User is responsible for providing a big enough buffer, otherwise they run a risk of getting -ENOSPC error from the bpf() syscall. See also comments in bpf_object_open_opts regarding log_level and log_buf interactions. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-5-andrii@kernel.org --- tools/lib/bpf/bpf.h | 3 ++- tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++++++- tools/lib/bpf/libbpf.h | 41 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 5f7d9636643d..94e553a0ff9d 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -195,8 +195,9 @@ struct bpf_load_program_attr { /* Flags to direct loading requirements */ #define MAPS_RELAX_COMPAT 0x01 -/* Recommend log buffer size */ +/* Recommended log buffer size */ #define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ + LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, char *log_buf, size_t log_buf_sz); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6db0b5e8540e..38999e9c08e0 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -573,6 +573,11 @@ struct bpf_object { size_t btf_module_cnt; size_t btf_module_cap; + /* optional log settings passed to BPF_BTF_LOAD and BPF_PROG_LOAD commands */ + char *log_buf; + size_t log_size; + __u32 log_level; + void *priv; bpf_object_clear_priv_t clear_priv; @@ -3017,7 +3022,9 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) */ btf__set_fd(kern_btf, 0); } else { - err = btf__load_into_kernel(kern_btf); + /* currently BPF_BTF_LOAD only supports log_level 1 */ + err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size, + obj->log_level ? 
1 : 0); } if (sanitize) { if (!err) { @@ -6932,6 +6939,9 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, struct bpf_object *obj; char tmp_name[64]; int err; + char *log_buf; + size_t log_size; + __u32 log_level; if (elf_version(EV_CURRENT) == EV_NONE) { pr_warn("failed to init libelf for %s\n", @@ -6954,10 +6964,22 @@ __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, pr_debug("loading object '%s' from buffer\n", obj_name); } + log_buf = OPTS_GET(opts, kernel_log_buf, NULL); + log_size = OPTS_GET(opts, kernel_log_size, 0); + log_level = OPTS_GET(opts, kernel_log_level, 0); + if (log_size > UINT_MAX) + return ERR_PTR(-EINVAL); + if (log_size && !log_buf) + return ERR_PTR(-EINVAL); + obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name); if (IS_ERR(obj)) return obj; + obj->log_buf = log_buf; + obj->log_size = log_size; + obj->log_level = log_level; + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); if (btf_tmp_path) { if (strlen(btf_tmp_path) >= PATH_MAX) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 4802c1e736c3..5c984c63859f 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -108,8 +108,47 @@ struct bpf_object_open_opts { * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux. */ const char *btf_custom_path; + /* Pointer to a buffer for storing kernel logs for applicable BPF + * commands. Valid kernel_log_size has to be specified as well and are + * passed-through to bpf() syscall. Keep in mind that kernel might + * fail operation with -ENOSPC error if provided buffer is too small + * to contain entire log output. + * See the comment below for kernel_log_level for interaction between + * log_buf and log_level settings. + * + * If specified, this log buffer will be passed for: + * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden + * with bpf_program__set_log() on per-program level, to get + * BPF verifier log output. 
+ * - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get + * BTF sanity checking log. + * + * Each BPF command (BPF_BTF_LOAD or BPF_PROG_LOAD) will overwrite + * previous contents, so if you need more fine-grained control, set + * per-program buffer with bpf_program__set_log_buf() to preserve each + * individual program's verification log. Keep using kernel_log_buf + * for BTF verification log, if necessary. + */ + char *kernel_log_buf; + size_t kernel_log_size; + /* + * Log level can be set independently from log buffer. Log_level=0 + * means that libbpf will attempt loading BTF or program without any + * logging requested, but will retry with either its own or custom log + * buffer, if provided, and log_level=1 on any error. + * And vice versa, setting log_level>0 will request BTF or prog + * loading with verbose log from the first attempt (and as such also + * for successfully loaded BTF or program), and the actual log buffer + * could be either libbpf's own auto-allocated log buffer, if + * kernel_log_buf is NULL, or user-provided custom kernel_log_buf. + * If user didn't provide custom log buffer, libbpf will emit captured + * logs through its print callback. + */ + __u32 kernel_log_level; + + size_t :0; }; -#define bpf_object_open_opts__last_field btf_custom_path +#define bpf_object_open_opts__last_field kernel_log_level LIBBPF_API struct bpf_object *bpf_object__open(const char *path); From ad9a7f96445b70c415d8e193f854321b110c890a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:33 -0800 Subject: [PATCH 106/115] libbpf: Improve logging around BPF program loading Add missing "prog '%s': " prefixes in few places and consistently use markers for beginning and end of program load logs. 
Here's an example of log output: libbpf: prog 'handler': BPF program load failed: Permission denied libbpf: -- BEGIN PROG LOAD LOG --- arg#0 reference type('UNKNOWN ') size cannot be determined: -22 ; out1 = in1; 0: (18) r1 = 0xffffc9000cdcc000 2: (61) r1 = *(u32 *)(r1 +0) ... 81: (63) *(u32 *)(r4 +0) = r5 R1_w=map_value(id=0,off=16,ks=4,vs=20,imm=0) R4=map_value(id=0,off=400,ks=4,vs=16,imm=0) invalid access to map value, value_size=16 off=400 size=4 R4 min value is outside of the allowed memory range processed 63 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 -- END PROG LOAD LOG -- libbpf: failed to load program 'handler' libbpf: failed to load object 'test_skeleton' The entire verifier log, including BEGIN and END markers, is now always output during a single print callback call. This should make it much easier to post-process or parse it, if necessary. It's not an explicit API guarantee, but it can be reasonably expected to stay like that. Also __bpf_object__open is renamed to bpf_object_open() as it's always an adventure to find the exact function that implements bpf_object's open phase, so drop the double underscores and use the internal libbpf naming convention. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-6-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 38 +++++++++---------- .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 6 ++- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 38999e9c08e0..f07ff39a9d20 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6662,8 +6662,10 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr); if (ret >= 0) { - if (log_buf && load_attr.log_level) - pr_debug("verifier log:\n%s", log_buf); + if (log_buf && load_attr.log_level) { + pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } if (obj->has_rodata && kernel_supports(obj, FEAT_PROG_BIND_MAP)) { struct bpf_map *map; @@ -6676,8 +6678,8 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) { cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); - pr_warn("prog '%s': failed to bind .rodata map: %s\n", - prog->name, cp); + pr_warn("prog '%s': failed to bind map '%s': %s\n", + prog->name, map->real_name, cp); /* Don't fail hard if can't bind rodata. */ } } @@ -6691,23 +6693,22 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog if (!log_buf || errno == ENOSPC) { log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, log_buf_size << 1); - free(log_buf); goto retry_load; } ret = errno ? 
-errno : -LIBBPF_ERRNO__LOAD; cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); - pr_warn("load bpf program failed: %s\n", cp); + pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp); pr_perm_msg(ret); if (log_buf && log_buf[0] != '\0') { ret = -LIBBPF_ERRNO__VERIFY; - pr_warn("-- BEGIN DUMP LOG ---\n"); - pr_warn("\n%s\n", log_buf); - pr_warn("-- END LOG --\n"); - } else if (insns_cnt >= BPF_MAXINSNS) { - pr_warn("Program too large (%d insns), at most %d insns\n", - insns_cnt, BPF_MAXINSNS); + pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + if (insns_cnt >= BPF_MAXINSNS) { + pr_warn("prog '%s': program too large (%d insns), at most %d insns\n", + prog->name, insns_cnt, BPF_MAXINSNS); ret = -LIBBPF_ERRNO__PROG2BIG; } else if (prog->type != BPF_PROG_TYPE_KPROBE) { /* Wrong program type? */ @@ -6931,9 +6932,8 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object return 0; } -static struct bpf_object * -__bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz, - const struct bpf_object_open_opts *opts) +static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) { const char *obj_name, *kconfig, *btf_tmp_path; struct bpf_object *obj; @@ -7033,7 +7033,7 @@ __bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags) return NULL; pr_debug("loading %s\n", attr->file); - return __bpf_object__open(attr->file, NULL, 0, &opts); + return bpf_object_open(attr->file, NULL, 0, &opts); } struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) @@ -7059,7 +7059,7 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) pr_debug("loading %s\n", path); - return libbpf_ptr(__bpf_object__open(path, NULL, 0, opts)); + return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); } struct bpf_object * @@ -7069,7 +7069,7 @@ 
bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, if (!obj_buf || obj_buf_sz == 0) return libbpf_err_ptr(-EINVAL); - return libbpf_ptr(__bpf_object__open(NULL, obj_buf, obj_buf_sz, opts)); + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); } struct bpf_object * @@ -7086,7 +7086,7 @@ bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, if (!obj_buf || obj_buf_sz == 0) return errno = EINVAL, NULL; - return libbpf_ptr(__bpf_object__open(NULL, obj_buf, obj_buf_sz, &opts)); + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, &opts)); } static int bpf_object_unload(struct bpf_object *obj) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 94e03df69d71..8daca0ac909f 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -217,14 +217,16 @@ static bool found; static int libbpf_debug_print(enum libbpf_print_level level, const char *format, va_list args) { - char *log_buf; + const char *log_buf; if (level != LIBBPF_WARN || - strcmp(format, "libbpf: \n%s\n")) { + !strstr(format, "-- BEGIN PROG LOAD LOG --")) { vprintf(format, args); return 0; } + /* skip prog_name */ + va_arg(args, char *); log_buf = va_arg(args, char *); if (!log_buf) goto out; From 2eda2145ebfc76569fd088f46356203fc0c785a1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:34 -0800 Subject: [PATCH 107/115] libbpf: Preserve kernel error code and remove kprobe prog type guessing Instead of rewriting error code returned by the kernel of prog load with libbpf-sepcific variants pass through the original error. There is now also no need to have a backup generic -LIBBPF_ERRNO__LOAD fallback error as bpf_prog_load() guarantees that errno will be properly set no matter what. Also drop a completely outdated and pretty useless BPF_PROG_TYPE_KPROBE guess logic. 
It's not necessary and neither it's helpful in modern BPF applications. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-7-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f07ff39a9d20..3fd4e3d5a11f 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6696,34 +6696,19 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog free(log_buf); goto retry_load; } - ret = errno ? -errno : -LIBBPF_ERRNO__LOAD; + + ret = -errno; cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp); pr_perm_msg(ret); if (log_buf && log_buf[0] != '\0') { - ret = -LIBBPF_ERRNO__VERIFY; pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", prog->name, log_buf); } if (insns_cnt >= BPF_MAXINSNS) { pr_warn("prog '%s': program too large (%d insns), at most %d insns\n", prog->name, insns_cnt, BPF_MAXINSNS); - ret = -LIBBPF_ERRNO__PROG2BIG; - } else if (prog->type != BPF_PROG_TYPE_KPROBE) { - /* Wrong program type? */ - int fd; - - load_attr.expected_attach_type = 0; - load_attr.log_buf = NULL; - load_attr.log_size = 0; - fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, prog_name, license, - insns, insns_cnt, &load_attr); - if (fd >= 0) { - close(fd); - ret = -LIBBPF_ERRNO__PROGTYPE; - goto out; - } } out: From b3ce907950350a58880b94fed2b6022f160b8b9a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:35 -0800 Subject: [PATCH 108/115] libbpf: Add per-program log buffer setter and getter Allow to set user-provided log buffer on a per-program basis ([0]). This gives great deal of flexibility in terms of which programs are loaded with logging enabled and where corresponding logs go. 
Log buffer set with bpf_program__set_log_buf() overrides kernel_log_buf and kernel_log_size settings set at bpf_object open time through bpf_object_open_opts, if any. Adjust bpf_object_load_prog_instance() logic to not perform own log buf allocation and load retry if custom log buffer is provided by the user. [0] Closes: https://github.com/libbpf/libbpf/issues/418 Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-8-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 92 ++++++++++++++++++++++++++++++++-------- tools/lib/bpf/libbpf.h | 7 +++ tools/lib/bpf/libbpf.map | 2 + 3 files changed, 84 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3fd4e3d5a11f..e3e56bebd014 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -331,7 +331,11 @@ struct bpf_program { struct reloc_desc *reloc_desc; int nr_reloc; - int log_level; + + /* BPF verifier log settings */ + char *log_buf; + size_t log_size; + __u32 log_level; struct { int nr; @@ -713,6 +717,9 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->instances.fds = NULL; prog->instances.nr = -1; + /* inherit object's log_level */ + prog->log_level = obj->log_level; + prog->sec_name = strdup(sec_name); if (!prog->sec_name) goto errout; @@ -6591,8 +6598,10 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog const char *prog_name = NULL; char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; - char *log_buf = NULL; + char *log_buf = NULL, *tmp; int btf_fd, ret, err; + bool own_log_buf = true; + __u32 log_level = prog->log_level; if (prog->type == BPF_PROG_TYPE_UNSPEC) { /* @@ -6627,7 +6636,7 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog load_attr.line_info_rec_size = prog->line_info_rec_size; load_attr.line_info_cnt = prog->line_info_cnt; } - load_attr.log_level = prog->log_level; + 
load_attr.log_level = log_level; load_attr.prog_flags = prog->prog_flags; load_attr.fd_array = obj->fd_array; @@ -6648,21 +6657,42 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog *prog_fd = -1; return 0; } -retry_load: - if (log_buf_size) { - log_buf = malloc(log_buf_size); - if (!log_buf) - return -ENOMEM; - *log_buf = 0; +retry_load: + /* if log_level is zero, we don't request logs initiallly even if + * custom log_buf is specified; if the program load fails, then we'll + * bump log_level to 1 and use either custom log_buf or we'll allocate + * our own and retry the load to get details on what failed + */ + if (log_level) { + if (prog->log_buf) { + log_buf = prog->log_buf; + log_buf_size = prog->log_size; + own_log_buf = false; + } else if (obj->log_buf) { + log_buf = obj->log_buf; + log_buf_size = obj->log_size; + own_log_buf = false; + } else { + log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, log_buf_size * 2); + tmp = realloc(log_buf, log_buf_size); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + log_buf = tmp; + log_buf[0] = '\0'; + own_log_buf = true; + } } load_attr.log_buf = log_buf; load_attr.log_size = log_buf_size; - ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr); + load_attr.log_level = log_level; + ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr); if (ret >= 0) { - if (log_buf && load_attr.log_level) { + if (log_level && own_log_buf) { pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", prog->name, log_buf); } @@ -6690,19 +6720,26 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog goto out; } - if (!log_buf || errno == ENOSPC) { - log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, - log_buf_size << 1); - free(log_buf); + if (log_level == 0) { + log_level = 1; goto retry_load; } + /* On ENOSPC, increase log buffer size and retry, unless custom + * log_buf is specified. 
+ * Be careful to not overflow u32, though. Kernel's log buf size limit + * isn't part of UAPI so it can always be bumped to full 4GB. So don't + * multiply by 2 unless we are sure we'll fit within 32 bits. + * Currently, we'll get -EINVAL when we reach (UINT_MAX >> 2). + */ + if (own_log_buf && errno == ENOSPC && log_buf_size <= UINT_MAX / 2) + goto retry_load; ret = -errno; cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp); pr_perm_msg(ret); - if (log_buf && log_buf[0] != '\0') { + if (own_log_buf && log_buf && log_buf[0] != '\0') { pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", prog->name, log_buf); } @@ -6712,7 +6749,8 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog } out: - free(log_buf); + if (own_log_buf) + free(log_buf); return ret; } @@ -8498,6 +8536,26 @@ int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) return 0; } +const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size) +{ + *log_size = prog->log_size; + return prog->log_buf; +} + +int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size) +{ + if (log_size && !log_buf) + return -EINVAL; + if (prog->log_size > UINT_MAX) + return -EINVAL; + if (prog->obj->loaded) + return -EBUSY; + + prog->log_buf = log_buf; + prog->log_size = log_size; + return 0; +} + #define SEC_DEF(sec_pfx, ptype, atype, flags, ...) 
{ \ .sec = sec_pfx, \ .prog_type = BPF_PROG_TYPE_##ptype, \ diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5c984c63859f..dacde55bebff 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -591,8 +591,15 @@ bpf_program__set_expected_attach_type(struct bpf_program *prog, LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags); + +/* Per-program log level and log buffer getters/setters. + * See bpf_object_open_opts comments regarding log_level and log_buf + * interactions. + */ LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level); +LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size); +LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size); LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 08cdfe840436..4d483af7dba6 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -423,6 +423,8 @@ LIBBPF_0.6.0 { LIBBPF_0.7.0 { global: bpf_btf_load; + bpf_program__log_buf; bpf_program__log_level; + bpf_program__set_log_buf; bpf_program__set_log_level; }; From e7b924ca715f0d1c0be62b205c36c4076b335421 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:36 -0800 Subject: [PATCH 109/115] libbpf: Deprecate bpf_object__load_xattr() Deprecate non-extensible bpf_object__load_xattr() in v0.8 ([0]). With log_level control through bpf_object_open_opts or bpf_program__set_log_level(), we are finally at the point where bpf_object__load_xattr() doesn't provide any functionality that can't be accessed through other (better) ways. The other feature, target_btf_path, is also controllable through bpf_object_open_opts. 
[0] Closes: https://github.com/libbpf/libbpf/issues/289 Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-9-andrii@kernel.org --- tools/lib/bpf/libbpf.c | 23 ++++++++++------------- tools/lib/bpf/libbpf.h | 1 + 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e3e56bebd014..18d95c6a89fe 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7462,14 +7462,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, return 0; } -int bpf_object__load_xattr(struct bpf_object_load_attr *attr) +static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) { - struct bpf_object *obj; int err, i; - if (!attr) - return libbpf_err(-EINVAL); - obj = attr->obj; if (!obj) return libbpf_err(-EINVAL); @@ -7479,7 +7475,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) } if (obj->gen_loader) - bpf_gen__init(obj->gen_loader, attr->log_level); + bpf_gen__init(obj->gen_loader, extra_log_level); err = bpf_object__probe_loading(obj); err = err ? : bpf_object__load_vmlinux_btf(obj, false); @@ -7488,8 +7484,8 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); err = err ? : bpf_object__create_maps(obj); - err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : attr->target_btf_path); - err = err ? : bpf_object__load_progs(obj, attr->log_level); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__load_progs(obj, extra_log_level); err = err ? 
: bpf_object_init_prog_arrays(obj); if (obj->gen_loader) { @@ -7534,13 +7530,14 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) return libbpf_err(err); } +int bpf_object__load_xattr(struct bpf_object_load_attr *attr) +{ + return bpf_object_load(attr->obj, attr->log_level, attr->target_btf_path); +} + int bpf_object__load(struct bpf_object *obj) { - struct bpf_object_load_attr attr = { - .obj = obj, - }; - - return bpf_object__load_xattr(&attr); + return bpf_object_load(obj, 0, NULL); } static int make_parent_dir(const char *path) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index dacde55bebff..a8b894dae633 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -214,6 +214,7 @@ struct bpf_object_load_attr { /* Load/unload object into/from kernel */ LIBBPF_API int bpf_object__load(struct bpf_object *obj); +LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__load() instead") LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); LIBBPF_DEPRECATED_SINCE(0, 6, "bpf_object__unload() is deprecated, use bpf_object__close() instead") LIBBPF_API int bpf_object__unload(struct bpf_object *obj); From dc94121b5ca17adaaabb7959c10d9c6ea504f7b1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:37 -0800 Subject: [PATCH 110/115] selftests/bpf: Replace all uses of bpf_load_btf() with bpf_btf_load() Switch all selftests uses of to-be-deprecated bpf_load_btf() with equivalent bpf_btf_load() calls. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-10-andrii@kernel.org --- .../selftests/bpf/map_tests/sk_storage_map.c | 2 +- tools/testing/selftests/bpf/prog_tests/btf.c | 50 +++++++++++-------- tools/testing/selftests/bpf/test_verifier.c | 2 +- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c index 8eea4ffeb092..099eb4dfd4f7 100644 --- a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c +++ b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c @@ -136,7 +136,7 @@ static int load_btf(void) memcpy(raw_btf + sizeof(btf_hdr) + sizeof(btf_raw_types), btf_str_sec, sizeof(btf_str_sec)); - return bpf_load_btf(raw_btf, sizeof(raw_btf), 0, 0, 0); + return bpf_btf_load(raw_btf, sizeof(raw_btf), NULL); } static int create_sk_storage_map(void) diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index cab810bab593..01b776a7beeb 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -4071,6 +4071,28 @@ static void *btf_raw_create(const struct btf_header *hdr, return raw_btf; } +static int load_raw_btf(const void *raw_data, size_t raw_size) +{ + LIBBPF_OPTS(bpf_btf_load_opts, opts); + int btf_fd; + + if (always_log) { + opts.log_buf = btf_log_buf, + opts.log_size = BTF_LOG_BUF_SIZE, + opts.log_level = 1; + } + + btf_fd = bpf_btf_load(raw_data, raw_size, &opts); + if (btf_fd < 0 && !always_log) { + opts.log_buf = btf_log_buf, + opts.log_size = BTF_LOG_BUF_SIZE, + opts.log_level = 1; + btf_fd = bpf_btf_load(raw_data, raw_size, &opts); + } + + return btf_fd; +} + static void do_test_raw(unsigned int test_num) { struct btf_raw_test *test = &raw_tests[test_num - 1]; @@ -4100,16 +4122,14 @@ static void do_test_raw(unsigned int test_num) hdr->str_len = (int)hdr->str_len + 
test->str_len_delta; *btf_log_buf = '\0'; - btf_fd = bpf_load_btf(raw_btf, raw_btf_size, - btf_log_buf, BTF_LOG_BUF_SIZE, - always_log); + btf_fd = load_raw_btf(raw_btf, raw_btf_size); free(raw_btf); err = ((btf_fd < 0) != test->btf_load_err); if (CHECK(err, "btf_fd:%d test->btf_load_err:%u", btf_fd, test->btf_load_err) || CHECK(test->err_str && !strstr(btf_log_buf, test->err_str), - "expected err_str:%s", test->err_str)) { + "expected err_str:%s\n", test->err_str)) { err = -1; goto done; } @@ -4227,9 +4247,7 @@ static int test_big_btf_info(unsigned int test_num) goto done; } - btf_fd = bpf_load_btf(raw_btf, raw_btf_size, - btf_log_buf, BTF_LOG_BUF_SIZE, - always_log); + btf_fd = load_raw_btf(raw_btf, raw_btf_size); if (CHECK(btf_fd < 0, "errno:%d", errno)) { err = -1; goto done; @@ -4315,9 +4333,7 @@ static int test_btf_id(unsigned int test_num) info[i].btf_size = raw_btf_size; } - btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size, - btf_log_buf, BTF_LOG_BUF_SIZE, - always_log); + btf_fd[0] = load_raw_btf(raw_btf, raw_btf_size); if (CHECK(btf_fd[0] < 0, "errno:%d", errno)) { err = -1; goto done; @@ -4447,9 +4463,7 @@ static void do_test_get_info(unsigned int test_num) goto done; } - btf_fd = bpf_load_btf(raw_btf, raw_btf_size, - btf_log_buf, BTF_LOG_BUF_SIZE, - always_log); + btf_fd = load_raw_btf(raw_btf, raw_btf_size); if (CHECK(btf_fd <= 0, "errno:%d", errno)) { err = -1; goto done; @@ -5169,12 +5183,10 @@ static void do_test_pprint(int test_num) return; *btf_log_buf = '\0'; - btf_fd = bpf_load_btf(raw_btf, raw_btf_size, - btf_log_buf, BTF_LOG_BUF_SIZE, - always_log); + btf_fd = load_raw_btf(raw_btf, raw_btf_size); free(raw_btf); - if (CHECK(btf_fd < 0, "errno:%d", errno)) { + if (CHECK(btf_fd < 0, "errno:%d\n", errno)) { err = -1; goto done; } @@ -6538,9 +6550,7 @@ static void do_test_info_raw(unsigned int test_num) return; *btf_log_buf = '\0'; - btf_fd = bpf_load_btf(raw_btf, raw_btf_size, - btf_log_buf, BTF_LOG_BUF_SIZE, - always_log); + btf_fd = 
load_raw_btf(raw_btf, raw_btf_size); free(raw_btf); if (CHECK(btf_fd < 0, "invalid btf_fd errno:%d", errno)) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 222cb063ddf4..07b88a8f504f 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -641,7 +641,7 @@ static int load_btf(void) memcpy(ptr, btf_str_sec, hdr.str_len); ptr += hdr.str_len; - btf_fd = bpf_load_btf(raw_btf, ptr - raw_btf, 0, 0, 0); + btf_fd = bpf_btf_load(raw_btf, ptr - raw_btf, NULL); free(raw_btf); if (btf_fd < 0) return -1; From 57e889269af3dd0609933e2550c4baee7a7eb84c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:38 -0800 Subject: [PATCH 111/115] selftests/bpf: Add test for libbpf's custom log_buf behavior Add a selftest that validates that per-program and per-object log_buf overrides work as expected. Also test same logic for low-level bpf_prog_load() and bpf_btf_load() APIs. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-11-andrii@kernel.org --- .../selftests/bpf/prog_tests/log_buf.c | 276 ++++++++++++++++++ .../selftests/bpf/progs/test_log_buf.c | 24 ++ 2 files changed, 300 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/log_buf.c create mode 100644 tools/testing/selftests/bpf/progs/test_log_buf.c diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c new file mode 100644 index 000000000000..e469b023962b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include +#include + +#include "test_log_buf.skel.h" + +static size_t libbpf_log_pos; +static char libbpf_log_buf[1024 * 1024]; +static bool libbpf_log_error; + +static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt, va_list args) +{ + int emitted_cnt; + size_t left_cnt; + + left_cnt = sizeof(libbpf_log_buf) - libbpf_log_pos; + emitted_cnt = vsnprintf(libbpf_log_buf + libbpf_log_pos, left_cnt, fmt, args); + + if (emitted_cnt < 0 || emitted_cnt + 1 > left_cnt) { + libbpf_log_error = true; + return 0; + } + + libbpf_log_pos += emitted_cnt; + return 0; +} + +static void obj_load_log_buf(void) +{ + libbpf_print_fn_t old_print_cb = libbpf_set_print(libbpf_print_cb); + LIBBPF_OPTS(bpf_object_open_opts, opts); + const size_t log_buf_sz = 1024 * 1024; + struct test_log_buf* skel; + char *obj_log_buf, *good_log_buf, *bad_log_buf; + int err; + + obj_log_buf = malloc(3 * log_buf_sz); + if (!ASSERT_OK_PTR(obj_log_buf, "obj_log_buf")) + return; + + good_log_buf = obj_log_buf + log_buf_sz; + bad_log_buf = obj_log_buf + 2 * log_buf_sz; + obj_log_buf[0] = good_log_buf[0] = bad_log_buf[0] = '\0'; + + opts.kernel_log_buf = obj_log_buf; + opts.kernel_log_size = log_buf_sz; + opts.kernel_log_level = 4; /* for BTF this 
will turn into 1 */ + + /* In the first round every prog has its own log_buf, so libbpf logs + * don't have program failure logs + */ + skel = test_log_buf__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + /* set very verbose level for good_prog so we always get detailed logs */ + bpf_program__set_log_buf(skel->progs.good_prog, good_log_buf, log_buf_sz); + bpf_program__set_log_level(skel->progs.good_prog, 2); + + bpf_program__set_log_buf(skel->progs.bad_prog, bad_log_buf, log_buf_sz); + /* log_level 0 with custom log_buf means that verbose logs are not + * requested if program load is successful, but libbpf should retry + * with log_level 1 on error and put program's verbose load log into + * custom log_buf + */ + bpf_program__set_log_level(skel->progs.bad_prog, 0); + + err = test_log_buf__load(skel); + if (!ASSERT_ERR(err, "unexpected_load_success")) + goto cleanup; + + ASSERT_FALSE(libbpf_log_error, "libbpf_log_error"); + + /* there should be no prog loading log because we specified per-prog log buf */ + ASSERT_NULL(strstr(libbpf_log_buf, "-- BEGIN PROG LOAD LOG --"), "unexp_libbpf_log"); + ASSERT_OK_PTR(strstr(libbpf_log_buf, "prog 'bad_prog': BPF program load failed"), + "libbpf_log_not_empty"); + ASSERT_OK_PTR(strstr(obj_log_buf, "DATASEC license"), "obj_log_not_empty"); + ASSERT_OK_PTR(strstr(good_log_buf, "0: R1=ctx(id=0,off=0,imm=0) R10=fp0"), + "good_log_verbose"); + ASSERT_OK_PTR(strstr(bad_log_buf, "invalid access to map value, value_size=16 off=16000 size=4"), + "bad_log_not_empty"); + + if (env.verbosity > VERBOSE_NONE) { + printf("LIBBPF LOG: \n=================\n%s=================\n", libbpf_log_buf); + printf("OBJ LOG: \n=================\n%s=================\n", obj_log_buf); + printf("GOOD_PROG LOG:\n=================\n%s=================\n", good_log_buf); + printf("BAD_PROG LOG:\n=================\n%s=================\n", bad_log_buf); + } + + /* reset everything */ + test_log_buf__destroy(skel); + obj_log_buf[0] = 
good_log_buf[0] = bad_log_buf[0] = '\0'; + libbpf_log_buf[0] = '\0'; + libbpf_log_pos = 0; + libbpf_log_error = false; + + /* In the second round we let bad_prog's failure be logged through print callback */ + opts.kernel_log_buf = NULL; /* let everything through into print callback */ + opts.kernel_log_size = 0; + opts.kernel_log_level = 1; + + skel = test_log_buf__open_opts(&opts); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + /* set normal verbose level for good_prog to check log_level is taken into account */ + bpf_program__set_log_buf(skel->progs.good_prog, good_log_buf, log_buf_sz); + bpf_program__set_log_level(skel->progs.good_prog, 1); + + err = test_log_buf__load(skel); + if (!ASSERT_ERR(err, "unexpected_load_success")) + goto cleanup; + + ASSERT_FALSE(libbpf_log_error, "libbpf_log_error"); + + /* this time prog loading error should be logged through print callback */ + ASSERT_OK_PTR(strstr(libbpf_log_buf, "libbpf: prog 'bad_prog': -- BEGIN PROG LOAD LOG --"), + "libbpf_log_correct"); + ASSERT_STREQ(obj_log_buf, "", "obj_log__empty"); + ASSERT_STREQ(good_log_buf, "processed 4 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0\n", + "good_log_ok"); + ASSERT_STREQ(bad_log_buf, "", "bad_log_empty"); + + if (env.verbosity > VERBOSE_NONE) { + printf("LIBBPF LOG: \n=================\n%s=================\n", libbpf_log_buf); + printf("OBJ LOG: \n=================\n%s=================\n", obj_log_buf); + printf("GOOD_PROG LOG:\n=================\n%s=================\n", good_log_buf); + printf("BAD_PROG LOG:\n=================\n%s=================\n", bad_log_buf); + } + +cleanup: + free(obj_log_buf); + test_log_buf__destroy(skel); + libbpf_set_print(old_print_cb); +} + +static void bpf_prog_load_log_buf(void) +{ + const struct bpf_insn good_prog_insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + const size_t good_prog_insn_cnt = sizeof(good_prog_insns) / sizeof(struct bpf_insn); + const struct 
bpf_insn bad_prog_insns[] = { + BPF_EXIT_INSN(), + }; + size_t bad_prog_insn_cnt = sizeof(bad_prog_insns) / sizeof(struct bpf_insn); + LIBBPF_OPTS(bpf_prog_load_opts, opts); + const size_t log_buf_sz = 1024 * 1024; + char *log_buf; + int fd = -1; + + log_buf = malloc(log_buf_sz); + if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc")) + return; + opts.log_buf = log_buf; + opts.log_size = log_buf_sz; + + /* with log_level == 0 log_buf shoud stay empty for good prog */ + log_buf[0] = '\0'; + opts.log_level = 0; + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL", + good_prog_insns, good_prog_insn_cnt, &opts); + ASSERT_STREQ(log_buf, "", "good_log_0"); + ASSERT_GE(fd, 0, "good_fd1"); + if (fd >= 0) + close(fd); + fd = -1; + + /* log_level == 2 should always fill log_buf, even for good prog */ + log_buf[0] = '\0'; + opts.log_level = 2; + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL", + good_prog_insns, good_prog_insn_cnt, &opts); + ASSERT_OK_PTR(strstr(log_buf, "0: R1=ctx(id=0,off=0,imm=0) R10=fp0"), "good_log_2"); + ASSERT_GE(fd, 0, "good_fd2"); + if (fd >= 0) + close(fd); + fd = -1; + + /* log_level == 0 should fill log_buf for bad prog */ + log_buf[0] = '\0'; + opts.log_level = 0; + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "bad_prog", "GPL", + bad_prog_insns, bad_prog_insn_cnt, &opts); + ASSERT_OK_PTR(strstr(log_buf, "R0 !read_ok"), "bad_log_0"); + ASSERT_LT(fd, 0, "bad_fd"); + if (fd >= 0) + close(fd); + fd = -1; + + free(log_buf); +} + +static void bpf_btf_load_log_buf(void) +{ + LIBBPF_OPTS(bpf_btf_load_opts, opts); + const size_t log_buf_sz = 1024 * 1024; + const void *raw_btf_data; + __u32 raw_btf_size; + struct btf *btf; + char *log_buf; + int fd = -1; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_btf")) + return; + + ASSERT_GT(btf__add_int(btf, "int", 4, 0), 0, "int_type"); + + raw_btf_data = btf__raw_data(btf, &raw_btf_size); + if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data_good")) + goto cleanup; + + 
log_buf = malloc(log_buf_sz); + if (!ASSERT_OK_PTR(log_buf, "log_buf_alloc")) + goto cleanup; + opts.log_buf = log_buf; + opts.log_size = log_buf_sz; + + /* with log_level == 0 log_buf shoud stay empty for good BTF */ + log_buf[0] = '\0'; + opts.log_level = 0; + fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts); + ASSERT_STREQ(log_buf, "", "good_log_0"); + ASSERT_GE(fd, 0, "good_fd1"); + if (fd >= 0) + close(fd); + fd = -1; + + /* log_level == 2 should always fill log_buf, even for good BTF */ + log_buf[0] = '\0'; + opts.log_level = 2; + fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts); + printf("LOG_BUF: %s\n", log_buf); + ASSERT_OK_PTR(strstr(log_buf, "magic: 0xeb9f"), "good_log_2"); + ASSERT_GE(fd, 0, "good_fd2"); + if (fd >= 0) + close(fd); + fd = -1; + + /* make BTF bad, add pointer pointing to non-existing type */ + ASSERT_GT(btf__add_ptr(btf, 100), 0, "bad_ptr_type"); + + raw_btf_data = btf__raw_data(btf, &raw_btf_size); + if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data_bad")) + goto cleanup; + + /* log_level == 0 should fill log_buf for bad BTF */ + log_buf[0] = '\0'; + opts.log_level = 0; + fd = bpf_btf_load(raw_btf_data, raw_btf_size, &opts); + printf("LOG_BUF: %s\n", log_buf); + ASSERT_OK_PTR(strstr(log_buf, "[2] PTR (anon) type_id=100 Invalid type_id"), "bad_log_0"); + ASSERT_LT(fd, 0, "bad_fd"); + if (fd >= 0) + close(fd); + fd = -1; + +cleanup: + free(log_buf); + btf__free(btf); +} + +void test_log_buf(void) +{ + if (test__start_subtest("obj_load_log_buf")) + obj_load_log_buf(); + if (test__start_subtest("bpf_prog_load_log_buf")) + bpf_prog_load_log_buf(); + if (test__start_subtest("bpf_btf_load_log_buf")) + bpf_btf_load_log_buf(); +} diff --git a/tools/testing/selftests/bpf/progs/test_log_buf.c b/tools/testing/selftests/bpf/progs/test_log_buf.c new file mode 100644 index 000000000000..199f459bd5ae --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_log_buf.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 
Facebook */ + +#include +#include + +int a[4]; +const volatile int off = 4000; + +SEC("raw_tp/sys_enter") +int good_prog(const void *ctx) +{ + a[0] = (int)(long)ctx; + return a[1]; +} + +SEC("raw_tp/sys_enter") +int bad_prog(const void *ctx) +{ + /* out of bounds access */ + return a[off]; +} + +char _license[] SEC("license") = "GPL"; From 3fc5fdcca144badbaf29b62aacbf7877f2f39a74 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:39 -0800 Subject: [PATCH 112/115] selftests/bpf: Remove the only use of deprecated bpf_object__load_xattr() Switch from bpf_object__load_xattr() to bpf_object__load() and kernel_log_level in bpf_object_open_opts. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-12-andrii@kernel.org --- tools/testing/selftests/bpf/testing_helpers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 0f1c37ac6f2c..795b6798ccee 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -88,13 +88,15 @@ int extra_prog_load_log_flags = 0; int bpf_prog_test_load(const char *file, enum bpf_prog_type type, struct bpf_object **pobj, int *prog_fd) { - struct bpf_object_load_attr attr = {}; + LIBBPF_OPTS(bpf_object_open_opts, opts, + .kernel_log_level = extra_prog_load_log_flags, + ); struct bpf_object *obj; struct bpf_program *prog; __u32 flags; int err; - obj = bpf_object__open(file); + obj = bpf_object__open_file(file, &opts); if (!obj) return -errno; @@ -110,9 +112,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type, flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32; bpf_program__set_flags(prog, flags); - attr.obj = obj; - attr.log_level = extra_prog_load_log_flags; - err = bpf_object__load_xattr(&attr); + err = bpf_object__load(obj); if (err) goto err_out; From 
b59e4ce8bcaab6445f4a0d37a96ca8953caaf5cf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Dec 2021 11:38:40 -0800 Subject: [PATCH 113/115] bpftool: Switch bpf_object__load_xattr() to bpf_object__load() Switch all the uses of to-be-deprecated bpf_object__load_xattr() to simple bpf_object__load() calls with optional log_level passed through open_opts.kernel_log_level, if -d option is specified. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211209193840.1248570-13-andrii@kernel.org --- tools/bpf/bpftool/gen.c | 11 ++++------- tools/bpf/bpftool/prog.c | 24 ++++++++++-------------- tools/bpf/bpftool/struct_ops.c | 15 +++++++-------- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 997a2865e04a..b4695df2ea3d 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -486,7 +486,6 @@ static void codegen_destroy(struct bpf_object *obj, const char *obj_name) static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *header_guard) { - struct bpf_object_load_attr load_attr = {}; DECLARE_LIBBPF_OPTS(gen_loader_opts, opts); struct bpf_map *map; char ident[256]; @@ -496,12 +495,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h if (err) return err; - load_attr.obj = obj; - if (verifier_logs) - /* log_level1 + log_level2 + stats, but not stable UAPI */ - load_attr.log_level = 1 + 2 + 4; - - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(obj); if (err) { p_err("failed to load object file"); goto out; @@ -719,6 +713,9 @@ static int do_skeleton(int argc, char **argv) if (obj_name[0] == '\0') get_obj_name(obj_name, file); opts.object_name = obj_name; + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + opts.kernel_log_level = 1 + 2 + 4; obj = bpf_object__open_mem(obj_data, file_sz, &opts); err = libbpf_get_error(obj); if (err) {
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 45ccc254e69f..f874896c4154 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1464,7 +1464,6 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts, .relaxed_maps = relaxed_maps, ); - struct bpf_object_load_attr load_attr = { 0 }; enum bpf_attach_type expected_attach_type; struct map_replace *map_replace = NULL; struct bpf_program *prog = NULL, *pos; @@ -1598,6 +1597,10 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) set_max_rlimit(); + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + open_opts.kernel_log_level = 1 + 2 + 4; + obj = bpf_object__open_file(file, &open_opts); if (libbpf_get_error(obj)) { p_err("failed to open object file"); @@ -1677,12 +1680,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) goto err_close_obj; } - load_attr.obj = obj; - if (verifier_logs) - /* log_level1 + log_level2 + stats, but not stable UAPI */ - load_attr.log_level = 1 + 2 + 4; - - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(obj); if (err) { p_err("failed to load object file"); goto err_close_obj; @@ -1809,7 +1807,6 @@ static int do_loader(int argc, char **argv) { DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); DECLARE_LIBBPF_OPTS(gen_loader_opts, gen); - struct bpf_object_load_attr load_attr = {}; struct bpf_object *obj; const char *file; int err = 0; @@ -1818,6 +1815,10 @@ static int do_loader(int argc, char **argv) return -1; file = GET_ARG(); + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + open_opts.kernel_log_level = 1 + 2 + 4; + obj = bpf_object__open_file(file, &open_opts); if (libbpf_get_error(obj)) { p_err("failed to open object file"); @@ -1828,12 +1829,7 @@ static int do_loader(int argc, char **argv) if (err) goto err_close_obj; - load_attr.obj = obj; - 
if (verifier_logs) - /* log_level1 + log_level2 + stats, but not stable UAPI */ - load_attr.log_level = 1 + 2 + 4; - - err = bpf_object__load_xattr(&load_attr); + err = bpf_object__load(obj); if (err) { p_err("failed to load object file"); goto err_close_obj; diff --git a/tools/bpf/bpftool/struct_ops.c b/tools/bpf/bpftool/struct_ops.c index cbdca37a53f0..2f693b082bdb 100644 --- a/tools/bpf/bpftool/struct_ops.c +++ b/tools/bpf/bpftool/struct_ops.c @@ -479,7 +479,7 @@ static int do_unregister(int argc, char **argv) static int do_register(int argc, char **argv) { - struct bpf_object_load_attr load_attr = {}; + LIBBPF_OPTS(bpf_object_open_opts, open_opts); const struct bpf_map_def *def; struct bpf_map_info info = {}; __u32 info_len = sizeof(info); @@ -494,18 +494,17 @@ static int do_register(int argc, char **argv) file = GET_ARG(); - obj = bpf_object__open(file); + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + open_opts.kernel_log_level = 1 + 2 + 4; + + obj = bpf_object__open_file(file, &open_opts); if (libbpf_get_error(obj)) return -1; set_max_rlimit(); - load_attr.obj = obj; - if (verifier_logs) - /* log_level1 + log_level2 + stats, but not stable UAPI */ - load_attr.log_level = 1 + 2 + 4; - - if (bpf_object__load_xattr(&load_attr)) { + if (bpf_object__load(obj)) { bpf_object__close(obj); return -1; } From b69c5c07a66ee569b8ccdc0cb567fe0622c89ea5 Mon Sep 17 00:00:00 2001 From: Vincent Minet Date: Fri, 10 Dec 2021 07:31:12 +0100 Subject: [PATCH 114/115] libbpf: Fix typo in btf__dedup@LIBBPF_0.0.2 definition The btf__dedup_deprecated name was misspelled in the definition of the compat symbol for btf__dedup. This leads it to be missing from the shared library. This fixes it. 
Fixes: 957d350a8b94 ("libbpf: Turn btf_dedup_opts into OPTS-based struct") Signed-off-by: Vincent Minet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211210063112.80047-1-vincent@vincent-minet.net --- tools/lib/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index e171424192ae..9aa19c89f758 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -3107,7 +3107,7 @@ int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts) return libbpf_err(err); } -COMPAT_VERSION(bpf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2) +COMPAT_VERSION(btf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2) int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *unused_opts) { LIBBPF_OPTS(btf_dedup_opts, opts, .btf_ext = btf_ext); From 229fae38d0fc0d6ff58d57cbeb1432da55e58d4f Mon Sep 17 00:00:00 2001 From: Shuyi Cheng Date: Fri, 10 Dec 2021 17:39:57 +0800 Subject: [PATCH 115/115] libbpf: Add "bool skipped" to struct bpf_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix error: "failed to pin map: Bad file descriptor, path: /sys/fs/bpf/_rodata_str1_1." On old kernels, the global data map is not created, see [0]. So we should skip pinning the global data map to avoid bpf_object__pin_maps returning an error. Therefore, when the map is not created, we mark "map->skipped" as true and then check it during relocation and during pinning.
Fixes: 16e0c35c6f7a ("libbpf: Load global data maps lazily on legacy kernels") Signed-off-by: Shuyi Cheng Signed-off-by: Andrii Nakryiko --- tools/lib/bpf/libbpf.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 18d95c6a89fe..d027e1d620fc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -431,6 +431,7 @@ struct bpf_map { char *pin_path; bool pinned; bool reused; + bool skipped; __u64 map_extra; }; @@ -5087,8 +5088,10 @@ bpf_object__create_maps(struct bpf_object *obj) * kernels. */ if (bpf_map__is_internal(map) && - !kernel_supports(obj, FEAT_GLOBAL_DATA)) + !kernel_supports(obj, FEAT_GLOBAL_DATA)) { + map->skipped = true; continue; + } retried = false; retry: @@ -5717,8 +5720,7 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } else { const struct bpf_map *map = &obj->maps[relo->map_idx]; - if (bpf_map__is_internal(map) && - !kernel_supports(obj, FEAT_GLOBAL_DATA)) { + if (map->skipped) { pr_warn("prog '%s': relo #%d: kernel doesn't support global data\n", prog->name, i); return -ENOTSUP; @@ -7926,6 +7928,9 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) char *pin_path = NULL; char buf[PATH_MAX]; + if (map->skipped) + continue; + if (path) { int len;