From 4bfe3bd3cc351efd1d51b3258b060e9445533888 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 27 Dec 2017 19:16:28 +0000
Subject: [PATCH 01/32] tools/bpftool: use version from the kernel source tree

Bpftool determines it's own version based on the kernel
version, which is picked from the linux/version.h header.

It's strange to use the version of the installed kernel
headers, and makes much more sense to use the version
of the actual source tree, where bpftool sources are.

Fix this by building kernelversion target and use
the resulting string as bpftool version.

Example:
before:

$ bpftool version
bpftool v4.14.6

after:
$ bpftool version
bpftool v4.15.0-rc3

$bpftool version --json
{"version":"4.15.0-rc3"}

Signed-off-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Makefile |  3 +++
 tools/bpf/bpftool/main.c   | 13 ++-----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 3f17ad317512..f8f31a8d9269 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -23,6 +23,8 @@ endif
 
 LIBBPF = $(BPF_PATH)libbpf.a
 
+BPFTOOL_VERSION=$(shell make --no-print-directory -sC ../../.. kernelversion)
+
 $(LIBBPF): FORCE
 	$(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) $(OUTPUT)libbpf.a FEATURES_DUMP=$(FEATURE_DUMP_EXPORT)
 
@@ -38,6 +40,7 @@ CC = gcc
 CFLAGS += -O2
 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow
 CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
+CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 
 INSTALL ?= install
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index ecd53ccf1239..3a0396d87c42 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -38,7 +38,6 @@
 #include <errno.h>
 #include <getopt.h>
 #include <linux/bpf.h>
-#include <linux/version.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -95,21 +94,13 @@ static int do_help(int argc, char **argv)
 
 static int do_version(int argc, char **argv)
 {
-	unsigned int version[3];
-
-	version[0] = LINUX_VERSION_CODE >> 16;
-	version[1] = LINUX_VERSION_CODE >> 8 & 0xf;
-	version[2] = LINUX_VERSION_CODE & 0xf;
-
 	if (json_output) {
 		jsonw_start_object(json_wtr);
 		jsonw_name(json_wtr, "version");
-		jsonw_printf(json_wtr, "\"%u.%u.%u\"",
-			     version[0], version[1], version[2]);
+		jsonw_printf(json_wtr, "\"%s\"", BPFTOOL_VERSION);
 		jsonw_end_object(json_wtr);
 	} else {
-		printf("%s v%u.%u.%u\n", bin_name,
-		       version[0], version[1], version[2]);
+		printf("%s v%s\n", bin_name, BPFTOOL_VERSION);
 	}
 	return 0;
 }

From fb982666e380c1632a74495b68b3c33a66e76430 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Wed, 27 Dec 2017 19:16:29 +0000
Subject: [PATCH 02/32] tools/bpftool: fix bpftool build with bintutils >= 2.9

Bpftool build is broken with binutils version 2.29 and later.
The cause is commit 003ca0fd2286 ("Refactor disassembler selection")
in the binutils repo, which changed the disassembler() function
signature.

Fix this by adding a new "feature" to the tools/build/features
infrastructure and make it responsible for decision which
disassembler() function signature to use.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/Makefile                            | 29 +++++++++++++++++++
 tools/bpf/bpf_jit_disasm.c                    |  7 +++++
 tools/bpf/bpftool/Makefile                    | 24 +++++++++++++++
 tools/bpf/bpftool/jit_disasm.c                |  7 +++++
 tools/build/feature/Makefile                  |  4 +++
 .../feature/test-disassembler-four-args.c     | 15 ++++++++++
 6 files changed, 86 insertions(+)
 create mode 100644 tools/build/feature/test-disassembler-four-args.c

diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index 07a6697466ef..c8ec0ae16bf0 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -9,6 +9,35 @@ MAKE = make
 CFLAGS += -Wall -O2
 CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include
 
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(CURDIR)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+endif
+
+FEATURE_USER = .bpf
+FEATURE_TESTS = libbfd disassembler-four-args
+FEATURE_DISPLAY = libbfd disassembler-four-args
+
+check_feat := 1
+NON_CHECK_FEAT_TARGETS := clean bpftool_clean
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),)
+  check_feat := 0
+endif
+endif
+
+ifeq ($(check_feat),1)
+ifeq ($(FEATURES_DUMP),)
+include $(srctree)/tools/build/Makefile.feature
+else
+include $(FEATURES_DUMP)
+endif
+endif
+
+ifeq ($(feature-disassembler-four-args), 1)
+CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
+endif
+
 %.yacc.c: %.y
 	$(YACC) -o $@ -d $<
 
diff --git a/tools/bpf/bpf_jit_disasm.c b/tools/bpf/bpf_jit_disasm.c
index 75bf526a0168..30044bc4f389 100644
--- a/tools/bpf/bpf_jit_disasm.c
+++ b/tools/bpf/bpf_jit_disasm.c
@@ -72,7 +72,14 @@ static void get_asm_insns(uint8_t *image, size_t len, int opcodes)
 
 	disassemble_init_for_target(&info);
 
+#ifdef DISASM_FOUR_ARGS_SIGNATURE
+	disassemble = disassembler(info.arch,
+				   bfd_big_endian(bfdf),
+				   info.mach,
+				   bfdf);
+#else
 	disassemble = disassembler(bfdf);
+#endif
 	assert(disassemble);
 
 	do {
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index f8f31a8d9269..2237bc43f71c 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -46,6 +46,30 @@ LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 INSTALL ?= install
 RM ?= rm -f
 
+FEATURE_USER = .bpftool
+FEATURE_TESTS = libbfd disassembler-four-args
+FEATURE_DISPLAY = libbfd disassembler-four-args
+
+check_feat := 1
+NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),)
+  check_feat := 0
+endif
+endif
+
+ifeq ($(check_feat),1)
+ifeq ($(FEATURES_DUMP),)
+include $(srctree)/tools/build/Makefile.feature
+else
+include $(FEATURES_DUMP)
+endif
+endif
+
+ifeq ($(feature-disassembler-four-args), 1)
+CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
+endif
+
 include $(wildcard *.d)
 
 all: $(OUTPUT)bpftool
diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
index 1551d3918d4c..57d32e8a1391 100644
--- a/tools/bpf/bpftool/jit_disasm.c
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -107,7 +107,14 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes)
 
 	disassemble_init_for_target(&info);
 
+#ifdef DISASM_FOUR_ARGS_SIGNATURE
+	disassemble = disassembler(info.arch,
+				   bfd_big_endian(bfdf),
+				   info.mach,
+				   bfdf);
+#else
 	disassemble = disassembler(bfdf);
+#endif
 	assert(disassemble);
 
 	if (json_output)
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 96982640fbf8..17f2c73fff8b 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -13,6 +13,7 @@ FILES=                                          \
          test-hello.bin                         \
          test-libaudit.bin                      \
          test-libbfd.bin                        \
+         test-disassembler-four-args.bin        \
          test-liberty.bin                       \
          test-liberty-z.bin                     \
          test-cplus-demangle.bin                \
@@ -188,6 +189,9 @@ $(OUTPUT)test-libpython-version.bin:
 $(OUTPUT)test-libbfd.bin:
 	$(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl
 
+$(OUTPUT)test-disassembler-four-args.bin:
+	$(BUILD) -lbfd -lopcodes
+
 $(OUTPUT)test-liberty.bin:
 	$(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty
 
diff --git a/tools/build/feature/test-disassembler-four-args.c b/tools/build/feature/test-disassembler-four-args.c
new file mode 100644
index 000000000000..45ce65cfddf0
--- /dev/null
+++ b/tools/build/feature/test-disassembler-four-args.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <bfd.h>
+#include <dis-asm.h>
+
+int main(void)
+{
+	bfd *abfd = bfd_openr(NULL, NULL);
+
+	disassembler(bfd_get_arch(abfd),
+		     bfd_big_endian(abfd),
+		     bfd_get_mach(abfd),
+		     abfd);
+
+	return 0;
+}

From e0d3974ac77b0d581b92affe5851fc40ad2f42a4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:03 -0800
Subject: [PATCH 03/32] bpf: offload: don't require rtnl for dev list
 manipulation

We don't need the RTNL lock for all operations on offload state.
We only need to hold it around ndo calls.  The device offload
initialization doesn't require it.  The soon-to-come querying
of the offload info will only need it partially.  We will also
be able to remove the waitqueue in following patches.

Use struct rw_semaphore because map offload will require sleeping
with the semaphore held for read.

Suggested-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/offload.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8455b89d1bbf..032079754d88 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -20,13 +20,16 @@
 #include <linux/netdevice.h>
 #include <linux/printk.h>
 #include <linux/rtnetlink.h>
+#include <linux/rwsem.h>
 
-/* protected by RTNL */
+/* Protects bpf_prog_offload_devs and offload members of all progs.
+ * RTNL lock cannot be taken when holding this lock.
+ */
+static DECLARE_RWSEM(bpf_devs_lock);
 static LIST_HEAD(bpf_prog_offload_devs);
 
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 {
-	struct net *net = current->nsproxy->net_ns;
 	struct bpf_dev_offload *offload;
 
 	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
@@ -43,19 +46,26 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	offload->prog = prog;
 	init_waitqueue_head(&offload->verifier_done);
 
-	rtnl_lock();
-	offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
-	if (!offload->netdev) {
-		rtnl_unlock();
-		kfree(offload);
-		return -EINVAL;
-	}
+	offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
+					   attr->prog_ifindex);
+	if (!offload->netdev)
+		goto err_free;
 
+	down_write(&bpf_devs_lock);
+	if (offload->netdev->reg_state != NETREG_REGISTERED)
+		goto err_unlock;
 	prog->aux->offload = offload;
 	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
-	rtnl_unlock();
+	dev_put(offload->netdev);
+	up_write(&bpf_devs_lock);
 
 	return 0;
+err_unlock:
+	up_write(&bpf_devs_lock);
+	dev_put(offload->netdev);
+err_free:
+	kfree(offload);
+	return -EINVAL;
 }
 
 static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
@@ -126,7 +136,9 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog)
 	wake_up(&offload->verifier_done);
 
 	rtnl_lock();
+	down_write(&bpf_devs_lock);
 	__bpf_prog_offload_destroy(prog);
+	up_write(&bpf_devs_lock);
 	rtnl_unlock();
 
 	kfree(offload);
@@ -181,11 +193,13 @@ static int bpf_offload_notification(struct notifier_block *notifier,
 		if (netdev->reg_state != NETREG_UNREGISTERING)
 			break;
 
+		down_write(&bpf_devs_lock);
 		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
 					 offloads) {
 			if (offload->netdev == netdev)
 				__bpf_prog_offload_destroy(offload->prog);
 		}
+		up_write(&bpf_devs_lock);
 		break;
 	default:
 		break;

From 9a18eedb145d080d542766af1d7513ebfccd1604 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:04 -0800
Subject: [PATCH 04/32] bpf: offload: don't use prog->aux->offload as boolean

We currently use aux->offload to indicate that program is bound
to a specific device.  This forces us to keep the offload structure
around even after the device is gone.  Add a bool member to
struct bpf_prog_aux to indicate if offload was requested.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  | 3 ++-
 kernel/bpf/syscall.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index da54ef644fcd..838eee10e979 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -201,6 +201,7 @@ struct bpf_prog_aux {
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt;
+	bool offload_requested;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct latch_tree_node ksym_tnode;
@@ -529,7 +530,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
 
 static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
-	return aux->offload;
+	return aux->offload_requested;
 }
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 007802c5ca7d..e0afc2e39fd5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1157,6 +1157,8 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (!prog)
 		return -ENOMEM;
 
+	prog->aux->offload_requested = !!attr->prog_ifindex;
+
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog_nouncharge;
@@ -1178,7 +1180,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
-	if (attr->prog_ifindex) {
+	if (bpf_prog_is_dev_bound(prog->aux)) {
 		err = bpf_prog_offload_init(prog, attr);
 		if (err)
 			goto free_prog;

From cae1927c0b4a93ae15de824faca1f6f611a44fcd Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:05 -0800
Subject: [PATCH 05/32] bpf: offload: allow netdev to disappear while verifier
 is running

To allow verifier instruction callbacks without any extra locking
NETDEV_UNREGISTER notification would wait on a waitqueue for verifier
to finish.  This design decision was made when rtnl lock was providing
all the locking.  Use the read/write lock instead and remove the
workqueue.

Verifier will now call into the offload code, so dev_ops are moved
to offload structure.  Since verifier calls are all under
bpf_prog_is_dev_bound() we no longer need static inline implementations
to please builds with CONFIG_NET=n.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  2 +-
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  2 +-
 drivers/net/netdevsim/bpf.c                   |  2 +-
 include/linux/bpf.h                           |  9 ++++--
 include/linux/bpf_verifier.h                  | 16 ++--------
 include/linux/netdevice.h                     |  4 +--
 kernel/bpf/offload.c                          | 30 ++++++++++---------
 kernel/bpf/verifier.c                         | 20 +++++--------
 8 files changed, 37 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index aae1be9ed056..89a9b6393882 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -238,7 +238,7 @@ struct nfp_bpf_vnic {
 
 int nfp_bpf_jit(struct nfp_prog *prog);
 
-extern const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops;
+extern const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops;
 
 struct netdev_bpf;
 struct nfp_app;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 9c2608445bd8..d8870c2f11f3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -260,6 +260,6 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	return 0;
 }
 
-const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops = {
+const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = {
 	.insn_hook = nfp_verify_insn,
 };
diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c
index a243fa7ae02f..5134d5c1306c 100644
--- a/drivers/net/netdevsim/bpf.c
+++ b/drivers/net/netdevsim/bpf.c
@@ -66,7 +66,7 @@ nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn)
 	return 0;
 }
 
-static const struct bpf_ext_analyzer_ops nsim_bpf_analyzer_ops = {
+static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = {
 	.insn_hook = nsim_bpf_verify_insn,
 };
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 838eee10e979..669549f7e3e8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -17,6 +17,7 @@
 #include <linux/numa.h>
 #include <linux/wait.h>
 
+struct bpf_verifier_env;
 struct perf_event;
 struct bpf_prog;
 struct bpf_map;
@@ -184,14 +185,18 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 };
 
+struct bpf_prog_offload_ops {
+	int (*insn_hook)(struct bpf_verifier_env *env,
+			 int insn_idx, int prev_insn_idx);
+};
+
 struct bpf_dev_offload {
 	struct bpf_prog		*prog;
 	struct net_device	*netdev;
 	void			*dev_priv;
 	struct list_head	offloads;
 	bool			dev_state;
-	bool			verifier_running;
-	wait_queue_head_t	verifier_done;
+	const struct bpf_prog_offload_ops *dev_ops;
 };
 
 struct bpf_prog_aux {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 883a35d50cd5..2feb218c001d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -166,12 +166,6 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
 	return log->len_used >= log->len_total - 1;
 }
 
-struct bpf_verifier_env;
-struct bpf_ext_analyzer_ops {
-	int (*insn_hook)(struct bpf_verifier_env *env,
-			 int insn_idx, int prev_insn_idx);
-};
-
 #define BPF_MAX_SUBPROGS 256
 
 /* single container for all structs
@@ -185,7 +179,6 @@ struct bpf_verifier_env {
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
-	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
@@ -206,13 +199,8 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 	return cur->frame[cur->curframe]->regs;
 }
 
-#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
-#else
-static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
-{
-	return -EOPNOTSUPP;
-}
-#endif
+int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
+				 int insn_idx, int prev_insn_idx);
 
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 352066e4eeef..49bfc6eec74c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -804,7 +804,7 @@ enum bpf_netdev_command {
 	BPF_OFFLOAD_DESTROY,
 };
 
-struct bpf_ext_analyzer_ops;
+struct bpf_prog_offload_ops;
 struct netlink_ext_ack;
 
 struct netdev_bpf {
@@ -826,7 +826,7 @@ struct netdev_bpf {
 		/* BPF_OFFLOAD_VERIFIER_PREP */
 		struct {
 			struct bpf_prog *prog;
-			const struct bpf_ext_analyzer_ops *ops; /* callee set */
+			const struct bpf_prog_offload_ops *ops; /* callee set */
 		} verifier;
 		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
 		struct {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 032079754d88..69ddc3899bab 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -44,7 +44,6 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 		return -ENOMEM;
 
 	offload->prog = prog;
-	init_waitqueue_head(&offload->verifier_done);
 
 	offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
 					   attr->prog_ifindex);
@@ -97,15 +96,28 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 	if (err)
 		goto exit_unlock;
 
-	env->dev_ops = data.verifier.ops;
-
+	env->prog->aux->offload->dev_ops = data.verifier.ops;
 	env->prog->aux->offload->dev_state = true;
-	env->prog->aux->offload->verifier_running = true;
 exit_unlock:
 	rtnl_unlock();
 	return err;
 }
 
+int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
+				 int insn_idx, int prev_insn_idx)
+{
+	struct bpf_dev_offload *offload;
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	offload = env->prog->aux->offload;
+	if (offload->netdev)
+		ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
 static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_dev_offload *offload = prog->aux->offload;
@@ -117,9 +129,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 
 	data.offload.prog = prog;
 
-	if (offload->verifier_running)
-		wait_event(offload->verifier_done, !offload->verifier_running);
-
 	if (offload->dev_state)
 		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
 
@@ -132,9 +141,6 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
 	struct bpf_dev_offload *offload = prog->aux->offload;
 
-	offload->verifier_running = false;
-	wake_up(&offload->verifier_done);
-
 	rtnl_lock();
 	down_write(&bpf_devs_lock);
 	__bpf_prog_offload_destroy(prog);
@@ -146,15 +152,11 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog)
 
 static int bpf_prog_offload_translate(struct bpf_prog *prog)
 {
-	struct bpf_dev_offload *offload = prog->aux->offload;
 	struct netdev_bpf data = {};
 	int ret;
 
 	data.offload.prog = prog;
 
-	offload->verifier_running = false;
-	wake_up(&offload->verifier_done);
-
 	rtnl_lock();
 	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
 	rtnl_unlock();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98d8637cf70d..a2b211262c25 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4438,15 +4438,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	return 0;
 }
 
-static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
-				  int insn_idx, int prev_insn_idx)
-{
-	if (env->dev_ops && env->dev_ops->insn_hook)
-		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
-
-	return 0;
-}
-
 static int do_check(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *state;
@@ -4531,9 +4522,12 @@ static int do_check(struct bpf_verifier_env *env)
 			print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
 		}
 
-		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
-		if (err)
-			return err;
+		if (bpf_prog_is_dev_bound(env->prog->aux)) {
+			err = bpf_prog_offload_verify_insn(env, insn_idx,
+							   prev_insn_idx);
+			if (err)
+				return err;
+		}
 
 		regs = cur_regs(env);
 		env->insn_aux_data[insn_idx].seen = true;
@@ -5463,7 +5457,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
-	if (env->prog->aux->offload) {
+	if (bpf_prog_is_dev_bound(env->prog->aux)) {
 		ret = bpf_prog_offload_verifier_prep(env);
 		if (ret)
 			goto err_unlock;

From ce3b9db4db0e0e2b9761c56d08615ea0159e4a1b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:06 -0800
Subject: [PATCH 06/32] bpf: offload: free prog->aux->offload when device
 disappears

All bpf offload operations should now be under bpf_devs_lock,
it's safe to free and clear the entire offload structure,
not only the netdev pointer.

__bpf_prog_offload_destroy() will no longer be called multiple
times.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/offload.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 69ddc3899bab..3126e1a842e6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -70,12 +70,14 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
 			     struct netdev_bpf *data)
 {
-	struct net_device *netdev = prog->aux->offload->netdev;
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct net_device *netdev;
 
 	ASSERT_RTNL();
 
-	if (!netdev)
+	if (!offload)
 		return -ENODEV;
+	netdev = offload->netdev;
 	if (!netdev->netdev_ops->ndo_bpf)
 		return -EOPNOTSUPP;
 
@@ -111,7 +113,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
 
 	down_read(&bpf_devs_lock);
 	offload = env->prog->aux->offload;
-	if (offload->netdev)
+	if (offload)
 		ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 	up_read(&bpf_devs_lock);
 
@@ -123,31 +125,24 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 	struct bpf_dev_offload *offload = prog->aux->offload;
 	struct netdev_bpf data = {};
 
-	/* Caution - if netdev is destroyed before the program, this function
-	 * will be called twice.
-	 */
-
 	data.offload.prog = prog;
 
 	if (offload->dev_state)
 		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
 
-	offload->dev_state = false;
 	list_del_init(&offload->offloads);
-	offload->netdev = NULL;
+	kfree(offload);
+	prog->aux->offload = NULL;
 }
 
 void bpf_prog_offload_destroy(struct bpf_prog *prog)
 {
-	struct bpf_dev_offload *offload = prog->aux->offload;
-
 	rtnl_lock();
 	down_write(&bpf_devs_lock);
-	__bpf_prog_offload_destroy(prog);
+	if (prog->aux->offload)
+		__bpf_prog_offload_destroy(prog);
 	up_write(&bpf_devs_lock);
 	rtnl_unlock();
-
-	kfree(offload);
 }
 
 static int bpf_prog_offload_translate(struct bpf_prog *prog)

From ad8ad79f4f6078f456792f7f8d344da2be9bc74f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:07 -0800
Subject: [PATCH 07/32] bpf: offload: free program id when device disappears

Bound programs are quite useless after their device disappears.
They are simply waiting for reference count to go to zero,
don't list them in BPF_PROG_GET_NEXT_ID by freeing their ID
early.

Note that orphaned offload programs will return -ENODEV on
BPF_OBJ_GET_INFO_BY_FD so user will never see ID 0.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  | 2 ++
 kernel/bpf/offload.c | 3 +++
 kernel/bpf/syscall.c | 9 +++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 669549f7e3e8..9a916ab34299 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -357,6 +357,8 @@ void bpf_prog_put(struct bpf_prog *prog);
 int __bpf_prog_charge(struct user_struct *user, u32 pages);
 void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
 
+void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 3126e1a842e6..e4f1668a021c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -130,6 +130,9 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 	if (offload->dev_state)
 		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
 
+	/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
+	bpf_prog_free_id(prog, true);
+
 	list_del_init(&offload->offloads);
 	kfree(offload);
 	prog->aux->offload = NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e0afc2e39fd5..e02dafa6f402 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -905,9 +905,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 {
-	/* cBPF to eBPF migrations are currently not in the idr store. */
+	/* cBPF to eBPF migrations are currently not in the idr store.
+	 * Offloaded programs are removed from the store when their device
+	 * disappears - even if someone grabs an fd to them they are unusable,
+	 * simply waiting for refcnt to drop to be freed.
+	 */
 	if (!prog->aux->id)
 		return;
 
@@ -917,6 +921,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 		__acquire(&prog_idr_lock);
 
 	idr_remove(&prog_idr, prog->aux->id);
+	prog->aux->id = 0;
 
 	if (do_idr_lock)
 		spin_unlock_bh(&prog_idr_lock);

From cdab6ba8668d68f031bfd5d237b4586ec4f8cd88 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:08 -0800
Subject: [PATCH 08/32] nsfs: generalize ns_get_path() for path resolution with
 a task

ns_get_path() takes struct task_struct and proc_ns_ops as its
parameters.  For path resolution directly from a namespace,
e.g. based on a networking device's net name space, we need
more flexibility.  Add a ns_get_path_cb() helper which will
allow callers to use any method of obtaining the name space
reference.  Convert ns_get_path() to use ns_get_path_cb().

Following patches will bring a networking user.

CC: Eric W. Biederman <ebiederm@xmission.com>
Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 fs/nsfs.c               | 29 ++++++++++++++++++++++++++---
 include/linux/proc_ns.h |  3 +++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 7c6f76d29f56..36b0772701a0 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -103,14 +103,14 @@ static void *__ns_get_path(struct path *path, struct ns_common *ns)
 	goto got_it;
 }
 
-void *ns_get_path(struct path *path, struct task_struct *task,
-			const struct proc_ns_operations *ns_ops)
+void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
+		     void *private_data)
 {
 	struct ns_common *ns;
 	void *ret;
 
 again:
-	ns = ns_ops->get(task);
+	ns = ns_get_cb(private_data);
 	if (!ns)
 		return ERR_PTR(-ENOENT);
 
@@ -120,6 +120,29 @@ void *ns_get_path(struct path *path, struct task_struct *task,
 	return ret;
 }
 
+struct ns_get_path_task_args {
+	const struct proc_ns_operations *ns_ops;
+	struct task_struct *task;
+};
+
+static struct ns_common *ns_get_path_task(void *private_data)
+{
+	struct ns_get_path_task_args *args = private_data;
+
+	return args->ns_ops->get(args->task);
+}
+
+void *ns_get_path(struct path *path, struct task_struct *task,
+		  const struct proc_ns_operations *ns_ops)
+{
+	struct ns_get_path_task_args args = {
+		.ns_ops	= ns_ops,
+		.task	= task,
+	};
+
+	return ns_get_path_cb(path, ns_get_path_task, &args);
+}
+
 int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns))
 {
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 2ff18c9840a7..d31cb6215905 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -78,6 +78,9 @@ extern struct file *proc_ns_fget(int fd);
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
 extern void *ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
+typedef struct ns_common *ns_get_path_helper_t(void *);
+extern void *ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb,
+			    void *private_data);
 
 extern int ns_get_name(char *buf, size_t size, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);

From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:09 -0800
Subject: [PATCH 09/32] bpf: offload: report device information for offloaded
 programs

Report to the user ifindex and namespace information of offloaded
programs.  If device has disappeared return -ENODEV.  Specify the
namespace using dev/inode combination.

CC: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h            |  2 ++
 include/uapi/linux/bpf.h       |  3 ++
 kernel/bpf/offload.c           | 59 ++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 ++++
 tools/include/uapi/linux/bpf.h |  3 ++
 5 files changed, 73 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9a916ab34299..7810ae57b357 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69eabfcb9bdb..f2f8b36e2ad4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index e4f1668a021c..040d4e0edf3f 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,9 +16,11 @@
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bug.h>
+#include <linux/kdev_t.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
+#include <linux/proc_ns.h>
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
@@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+struct ns_get_path_bpf_prog_args {
+	struct bpf_prog *prog;
+	struct bpf_prog_info *info;
+};
+
+static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
+{
+	struct ns_get_path_bpf_prog_args *args = private_data;
+	struct bpf_prog_aux *aux = args->prog->aux;
+	struct ns_common *ns;
+	struct net *net;
+
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	if (aux->offload) {
+		args->info->ifindex = aux->offload->netdev->ifindex;
+		net = dev_net(aux->offload->netdev);
+		get_net(net);
+		ns = &net->ns;
+	} else {
+		args->info->ifindex = 0;
+		ns = NULL;
+	}
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return ns;
+}
+
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog)
+{
+	struct ns_get_path_bpf_prog_args args = {
+		.prog	= prog,
+		.info	= info,
+	};
+	struct inode *ns_inode;
+	struct path ns_path;
+	void *res;
+
+	res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
+	if (IS_ERR(res)) {
+		if (!info->ifindex)
+			return -ENODEV;
+		return PTR_ERR(res);
+	}
+
+	ns_inode = ns_path.dentry->d_inode;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+	path_put(&ns_path);
+
+	return 0;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e02dafa6f402..ebf0fb23e237 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		err = bpf_prog_offload_info_fill(&info, prog);
+		if (err)
+			return err;
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db1b0923a308..4e8c60acfa32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {

From 522622104ebabbc3372d2fad706b4d30cee13319 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:10 -0800
Subject: [PATCH 10/32] tools: bpftool: report device information for offloaded
 programs

Print the just-exposed device information about device to which
program is bound.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/common.c | 52 ++++++++++++++++++++++++++++++++++++++
 tools/bpf/bpftool/main.h   |  2 ++
 tools/bpf/bpftool/prog.c   |  3 +++
 3 files changed, 57 insertions(+)

diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index b62c94e3997a..6601c95a9258 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -44,7 +44,9 @@
 #include <unistd.h>
 #include <linux/limits.h>
 #include <linux/magic.h>
+#include <net/if.h>
 #include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
 
@@ -412,3 +414,53 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
 		free(obj);
 	}
 }
+
+static char *
+ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf)
+{
+	struct stat st;
+	int err;
+
+	err = stat("/proc/self/ns/net", &st);
+	if (err) {
+		p_err("Can't stat /proc/self: %s", strerror(errno));
+		return NULL;
+	}
+
+	if (st.st_dev != ns_dev || st.st_ino != ns_ino)
+		return NULL;
+
+	return if_indextoname(ifindex, buf);
+}
+
+void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
+{
+	char name[IF_NAMESIZE];
+
+	if (!ifindex)
+		return;
+
+	printf(" dev ");
+	if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name))
+		printf("%s", name);
+	else
+		printf("ifindex %u ns_dev %llu ns_ino %llu",
+		       ifindex, ns_dev, ns_inode);
+}
+
+void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
+{
+	char name[IF_NAMESIZE];
+
+	if (!ifindex)
+		return;
+
+	jsonw_name(json_wtr, "dev");
+	jsonw_start_object(json_wtr);
+	jsonw_uint_field(json_wtr, "ifindex", ifindex);
+	jsonw_uint_field(json_wtr, "ns_dev", ns_dev);
+	jsonw_uint_field(json_wtr, "ns_inode", ns_inode);
+	if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name))
+		jsonw_string_field(json_wtr, "ifname", name);
+	jsonw_end_object(json_wtr);
+}
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 8f6d3cac0347..65b526fe6e7e 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -96,6 +96,8 @@ struct pinned_obj {
 int build_pinned_obj_table(struct pinned_obj_table *table,
 			   enum bpf_obj_type type);
 void delete_pinned_obj_table(struct pinned_obj_table *tab);
+void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode);
+void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode);
 
 struct cmd {
 	const char *cmd;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index fd0873178503..98f871ed53d6 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -230,6 +230,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
 		     info->tag[0], info->tag[1], info->tag[2], info->tag[3],
 		     info->tag[4], info->tag[5], info->tag[6], info->tag[7]);
 
+	print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
+
 	if (info->load_time) {
 		char buf[32];
 
@@ -287,6 +289,7 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
 
 	printf("tag ");
 	fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
+	print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino);
 	printf("\n");
 
 	if (info->load_time) {

From 752d7b4501c250bead233ab041738db84436b1af Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Dec 2017 18:39:11 -0800
Subject: [PATCH 11/32] selftests/bpf: test device info reporting for bound
 progs

Check if bound programs report correct device info.  Test
in local namespace, in remote one, back to the local ns,
remove the device and check that information is cleared.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_offload.py | 112 ++++++++++++++++++--
 1 file changed, 101 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index c940505c2978..e3c750f17cb8 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -18,6 +18,8 @@ import argparse
 import json
 import os
 import pprint
+import random
+import string
 import subprocess
 import time
 
@@ -27,6 +29,7 @@ bpf_test_dir = os.path.dirname(os.path.realpath(__file__))
 pp = pprint.PrettyPrinter()
 devs = [] # devices we created for clean up
 files = [] # files to be removed
+netns = [] # net namespaces to be removed
 
 def log_get_sec(level=0):
     return "*" * (log_level + level)
@@ -128,22 +131,25 @@ def rm(f):
     if f in files:
         files.remove(f)
 
-def tool(name, args, flags, JSON=True, fail=True):
+def tool(name, args, flags, JSON=True, ns="", fail=True):
     params = ""
     if JSON:
         params += "%s " % (flags["json"])
 
-    ret, out = cmd(name + " " + params + args, fail=fail)
+    if ns != "":
+        ns = "ip netns exec %s " % (ns)
+
+    ret, out = cmd(ns + name + " " + params + args, fail=fail)
     if JSON and len(out.strip()) != 0:
         return ret, json.loads(out)
     else:
         return ret, out
 
-def bpftool(args, JSON=True, fail=True):
-    return tool("bpftool", args, {"json":"-p"}, JSON=JSON, fail=fail)
+def bpftool(args, JSON=True, ns="", fail=True):
+    return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, fail=fail)
 
-def bpftool_prog_list(expected=None):
-    _, progs = bpftool("prog show", JSON=True, fail=True)
+def bpftool_prog_list(expected=None, ns=""):
+    _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True)
     if expected is not None:
         if len(progs) != expected:
             fail(True, "%d BPF programs loaded, expected %d" %
@@ -158,13 +164,13 @@ def bpftool_prog_list_wait(expected=0, n_retry=20):
         time.sleep(0.05)
     raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs))
 
-def ip(args, force=False, JSON=True, fail=True):
+def ip(args, force=False, JSON=True, ns="", fail=True):
     if force:
         args = "-force " + args
-    return tool("ip", args, {"json":"-j"}, JSON=JSON, fail=fail)
+    return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns, fail=fail)
 
-def tc(args, JSON=True, fail=True):
-    return tool("tc", args, {"json":"-p"}, JSON=JSON, fail=fail)
+def tc(args, JSON=True, ns="", fail=True):
+    return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns, fail=fail)
 
 def ethtool(dev, opt, args, fail=True):
     return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail)
@@ -178,6 +184,15 @@ def bpf_pinned(name):
 def bpf_bytecode(bytecode):
     return "bytecode \"%s\"" % (bytecode)
 
+def mknetns(n_retry=10):
+    for i in range(n_retry):
+        name = ''.join([random.choice(string.ascii_letters) for i in range(8)])
+        ret, _ = ip("netns add %s" % (name), fail=False)
+        if ret == 0:
+            netns.append(name)
+            return name
+    return None
+
 class DebugfsDir:
     """
     Class for accessing DebugFS directories as a dictionary.
@@ -237,6 +252,8 @@ class NetdevSim:
         self.dev = self._netdevsim_create()
         devs.append(self)
 
+        self.ns = ""
+
         self.dfs_dir = '/sys/kernel/debug/netdevsim/%s' % (self.dev['ifname'])
         self.dfs_refresh()
 
@@ -257,7 +274,7 @@ class NetdevSim:
 
     def remove(self):
         devs.remove(self)
-        ip("link del dev %s" % (self.dev["ifname"]))
+        ip("link del dev %s" % (self.dev["ifname"]), ns=self.ns)
 
     def dfs_refresh(self):
         self.dfs = DebugfsDir(self.dfs_dir)
@@ -285,6 +302,11 @@ class NetdevSim:
             time.sleep(0.05)
         raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs))
 
+    def set_ns(self, ns):
+        name = "1" if ns == "" else ns
+        ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns)
+        self.ns = ns
+
     def set_mtu(self, mtu, fail=True):
         return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu),
                   fail=fail)
@@ -372,6 +394,8 @@ def clean_up():
         dev.remove()
     for f in files:
         cmd("rm -f %s" % (f))
+    for ns in netns:
+        cmd("ip netns delete %s" % (ns))
 
 def pin_prog(file_name, idx=0):
     progs = bpftool_prog_list(expected=(idx + 1))
@@ -381,6 +405,35 @@ def pin_prog(file_name, idx=0):
 
     return file_name, bpf_pinned(file_name)
 
+def check_dev_info(other_ns, ns, pin_file=None, removed=False):
+    if removed:
+        bpftool_prog_list(expected=0)
+        ret, err = bpftool("prog show pin %s" % (pin_file), fail=False)
+        fail(ret == 0, "Showing prog with removed device did not fail")
+        fail(err["error"].find("No such device") == -1,
+             "Showing prog with removed device expected ENODEV, error is %s" %
+             (err["error"]))
+        return
+    progs = bpftool_prog_list(expected=int(not removed), ns=ns)
+    prog = progs[0]
+
+    fail("dev" not in prog.keys(), "Device parameters not reported")
+    dev = prog["dev"]
+    fail("ifindex" not in dev.keys(), "Device parameters not reported")
+    fail("ns_dev" not in dev.keys(), "Device parameters not reported")
+    fail("ns_inode" not in dev.keys(), "Device parameters not reported")
+
+    if not removed and not other_ns:
+        fail("ifname" not in dev.keys(), "Ifname not reported")
+        fail(dev["ifname"] != sim["ifname"],
+             "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"]))
+    else:
+        fail("ifname" in dev.keys(), "Ifname is reported for other ns")
+        if removed:
+            fail(dev["ifindex"] != 0, "Device perameters not zero on removed")
+            fail(dev["ns_dev"] != 0, "Device perameters not zero on removed")
+            fail(dev["ns_inode"] != 0, "Device perameters not zero on removed")
+
 # Parse command line
 parser = argparse.ArgumentParser()
 parser.add_argument("--log", help="output verbose log to given file")
@@ -417,6 +470,12 @@ for s in samples:
     skip(ret != 0, "sample %s/%s not found, please compile it" %
          (bpf_test_dir, s))
 
+# Check if net namespaces seem to work
+ns = mknetns()
+skip(ns is None, "Could not create a net namespace")
+cmd("ip netns delete %s" % (ns))
+netns = []
+
 try:
     obj = bpf_obj("sample_ret0.o")
     bytecode = bpf_bytecode("1,6 0 0 4294967295,")
@@ -549,6 +608,8 @@ try:
     progs = bpftool_prog_list(expected=1)
     fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"],
          "Loaded program has wrong ID")
+    fail("dev" in progs[0].keys(),
+         "Device parameters reported for non-offloaded program")
 
     start_test("Test XDP prog replace with bad flags...")
     ret, _ = sim.set_xdp(obj, "offload", force=True, fail=False)
@@ -673,6 +734,35 @@ try:
     fail(time_diff < delay_sec, "Removal process took %s, expected %s" %
          (time_diff, delay_sec))
 
+    # Remove all pinned files and reinstantiate the netdev
+    clean_up()
+    bpftool_prog_list_wait(expected=0)
+
+    sim = NetdevSim()
+    sim.set_ethtool_tc_offloads(True)
+    sim.set_xdp(obj, "offload")
+
+    start_test("Test bpftool bound info reporting (own ns)...")
+    check_dev_info(False, "")
+
+    start_test("Test bpftool bound info reporting (other ns)...")
+    ns = mknetns()
+    sim.set_ns(ns)
+    check_dev_info(True, "")
+
+    start_test("Test bpftool bound info reporting (remote ns)...")
+    check_dev_info(False, ns)
+
+    start_test("Test bpftool bound info reporting (back to own ns)...")
+    sim.set_ns("")
+    check_dev_info(False, "")
+
+    pin_file, _ = pin_prog("/sys/fs/bpf/tmp")
+    sim.remove()
+
+    start_test("Test bpftool bound info reporting (removed dev)...")
+    check_dev_info(True, "", pin_file=pin_file, removed=True)
+
     print("%s: OK" % (os.path.basename(__file__)))
 
 finally:

From 65b875bcc816335be41c336a595adbc10bd885cb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 2 Jan 2018 14:48:35 -0800
Subject: [PATCH 12/32] tools: bpftool: rename cgroup list -> show in the code

So far we have used "show" as a keyword for listing
programs and maps.  Use the word "show" in the code
for cgroups too, next commit will alias show and list.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/cgroup.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 34ca303d72bc..24091d87bee3 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -41,7 +41,7 @@ static enum bpf_attach_type parse_attach_type(const char *str)
 	return __MAX_BPF_ATTACH_TYPE;
 }
 
-static int list_bpf_prog(int id, const char *attach_type_str,
+static int show_bpf_prog(int id, const char *attach_type_str,
 			 const char *attach_flags_str)
 {
 	struct bpf_prog_info info = {};
@@ -77,7 +77,7 @@ static int list_bpf_prog(int id, const char *attach_type_str,
 	return 0;
 }
 
-static int list_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type)
+static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type)
 {
 	__u32 prog_ids[1024] = {0};
 	char *attach_flags_str;
@@ -111,23 +111,23 @@ static int list_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type)
 	}
 
 	for (iter = 0; iter < prog_cnt; iter++)
-		list_bpf_prog(prog_ids[iter], attach_type_strings[type],
+		show_bpf_prog(prog_ids[iter], attach_type_strings[type],
 			      attach_flags_str);
 
 	return 0;
 }
 
-static int do_list(int argc, char **argv)
+static int do_show(int argc, char **argv)
 {
 	enum bpf_attach_type type;
 	int cgroup_fd;
 	int ret = -1;
 
 	if (argc < 1) {
-		p_err("too few parameters for cgroup list\n");
+		p_err("too few parameters for cgroup show\n");
 		goto exit;
 	} else if (argc > 1) {
-		p_err("too many parameters for cgroup list\n");
+		p_err("too many parameters for cgroup show\n");
 		goto exit;
 	}
 
@@ -147,10 +147,10 @@ static int do_list(int argc, char **argv)
 		/*
 		 * Not all attach types may be supported, so it's expected,
 		 * that some requests will fail.
-		 * If we were able to get the list for at least one
+		 * If we were able to get the show for at least one
 		 * attach type, let's return 0.
 		 */
-		if (list_attached_bpf_progs(cgroup_fd, type) == 0)
+		if (show_attached_bpf_progs(cgroup_fd, type) == 0)
 			ret = 0;
 	}
 
@@ -294,7 +294,7 @@ static int do_help(int argc, char **argv)
 }
 
 static const struct cmd cmds[] = {
-	{ "list",	do_list },
+	{ "list",	do_show },
 	{ "attach",	do_attach },
 	{ "detach",	do_detach },
 	{ "help",	do_help },

From 6ebe6dbd6886af07b102aca42e44edbee94a22d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 2 Jan 2018 14:48:36 -0800
Subject: [PATCH 13/32] tools: bpftool: alias show and list commands

iproute2 seems to accept show and list as aliases.
Let's do the same thing, and by allowing both bring
cgroup syntax back in line with maps and progs.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 6 +++---
 tools/bpf/bpftool/Documentation/bpftool-map.rst    | 6 +++---
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   | 6 +++---
 tools/bpf/bpftool/Documentation/bpftool.rst        | 6 +++---
 tools/bpf/bpftool/bash-completion/bpftool          | 8 ++++----
 tools/bpf/bpftool/cgroup.c                         | 3 ++-
 tools/bpf/bpftool/map.c                            | 3 ++-
 tools/bpf/bpftool/prog.c                           | 3 ++-
 8 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 45c71b1f682b..2fe2a1bdbe3e 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -15,12 +15,12 @@ SYNOPSIS
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
 
 	*COMMANDS* :=
-	{ **list** | **attach** | **detach** | **help** }
+	{ **show** | **list** | **attach** | **detach** | **help** }
 
 MAP COMMANDS
 =============
 
-|	**bpftool** **cgroup list** *CGROUP*
+|	**bpftool** **cgroup { show | list }** *CGROUP*
 |	**bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*]
 |	**bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
 |	**bpftool** **cgroup help**
@@ -31,7 +31,7 @@ MAP COMMANDS
 
 DESCRIPTION
 ===========
-	**bpftool cgroup list** *CGROUP*
+	**bpftool cgroup { show | list }** *CGROUP*
 		  List all programs attached to the cgroup *CGROUP*.
 
 		  Output will start with program ID followed by attach type,
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 421cabc417e6..0ab32b312aec 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -15,13 +15,13 @@ SYNOPSIS
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
 
 	*COMMANDS* :=
-	{ **show** | **dump** | **update** | **lookup** | **getnext** | **delete**
+	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
 	| **pin** | **help** }
 
 MAP COMMANDS
 =============
 
-|	**bpftool** **map show**   [*MAP*]
+|	**bpftool** **map { show | list }**   [*MAP*]
 |	**bpftool** **map dump**    *MAP*
 |	**bpftool** **map update**  *MAP*  **key** *BYTES*   **value** *VALUE* [*UPDATE_FLAGS*]
 |	**bpftool** **map lookup**  *MAP*  **key** *BYTES*
@@ -36,7 +36,7 @@ MAP COMMANDS
 
 DESCRIPTION
 ===========
-	**bpftool map show**   [*MAP*]
+	**bpftool map { show | list }**   [*MAP*]
 		  Show information about loaded maps.  If *MAP* is specified
 		  show information only about given map, otherwise list all
 		  maps currently loaded on the system.
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 81c97c0e9b67..e4ceee7f2dff 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -15,12 +15,12 @@ SYNOPSIS
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
 
 	*COMMANDS* :=
-	{ **show** | **dump xlated** | **dump jited** | **pin** | **load** | **help** }
+	{ **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | **help** }
 
 MAP COMMANDS
 =============
 
-|	**bpftool** **prog show** [*PROG*]
+|	**bpftool** **prog { show | list }** [*PROG*]
 |	**bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes**}]
 |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
 |	**bpftool** **prog pin** *PROG* *FILE*
@@ -31,7 +31,7 @@ MAP COMMANDS
 
 DESCRIPTION
 ===========
-	**bpftool prog show** [*PROG*]
+	**bpftool prog { show | list }** [*PROG*]
 		  Show information about loaded programs.  If *PROG* is
 		  specified show information only about given program, otherwise
 		  list all programs currently loaded on the system.
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 6732a5a617e4..20689a321ffe 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -22,13 +22,13 @@ SYNOPSIS
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
 
 	*MAP-COMMANDS* :=
-	{ **show** | **dump** | **update** | **lookup** | **getnext** | **delete**
+	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
 	| **pin** | **help** }
 
-	*PROG-COMMANDS* := { **show** | **dump jited** | **dump xlated** | **pin**
+	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
 	| **load** | **help** }
 
-	*CGROUP-COMMANDS* := { **list** | **attach** | **detach** | **help** }
+	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 7febee05c8e7..0137866bb8f6 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -197,7 +197,7 @@ _bpftool()
 
             local PROG_TYPE='id pinned tag'
             case $command in
-                show)
+                show|list)
                     [[ $prev != "$command" ]] && return 0
                     COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) )
                     return 0
@@ -232,7 +232,7 @@ _bpftool()
                     ;;
                 *)
                     [[ $prev == $object ]] && \
-                        COMPREPLY=( $( compgen -W 'dump help pin show' -- \
+                        COMPREPLY=( $( compgen -W 'dump help pin show list' -- \
                             "$cur" ) )
                     ;;
             esac
@@ -240,7 +240,7 @@ _bpftool()
         map)
             local MAP_TYPE='id pinned'
             case $command in
-                show|dump)
+                show|list|dump)
                     case $prev in
                         $command)
                             COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
@@ -343,7 +343,7 @@ _bpftool()
                 *)
                     [[ $prev == $object ]] && \
                         COMPREPLY=( $( compgen -W 'delete dump getnext help \
-                            lookup pin show update' -- "$cur" ) )
+                            lookup pin show list update' -- "$cur" ) )
                     ;;
             esac
             ;;
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 24091d87bee3..35f5f003df28 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -277,7 +277,7 @@ static int do_help(int argc, char **argv)
 	}
 
 	fprintf(stderr,
-		"Usage: %s %s list CGROUP\n"
+		"Usage: %s %s { show | list } CGROUP\n"
 		"       %s %s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n"
 		"       %s %s detach CGROUP ATTACH_TYPE PROG\n"
 		"       %s %s help\n"
@@ -294,6 +294,7 @@ static int do_help(int argc, char **argv)
 }
 
 static const struct cmd cmds[] = {
+	{ "show",	do_show },
 	{ "list",	do_show },
 	{ "attach",	do_attach },
 	{ "detach",	do_detach },
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index a8c3a33dd185..8d7db9d6b9cd 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -861,7 +861,7 @@ static int do_help(int argc, char **argv)
 	}
 
 	fprintf(stderr,
-		"Usage: %s %s show   [MAP]\n"
+		"Usage: %s %s { show | list }   [MAP]\n"
 		"       %s %s dump    MAP\n"
 		"       %s %s update  MAP  key BYTES value VALUE [UPDATE_FLAGS]\n"
 		"       %s %s lookup  MAP  key BYTES\n"
@@ -885,6 +885,7 @@ static int do_help(int argc, char **argv)
 
 static const struct cmd cmds[] = {
 	{ "show",	do_show },
+	{ "list",	do_show },
 	{ "help",	do_help },
 	{ "dump",	do_dump },
 	{ "update",	do_update },
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 98f871ed53d6..5577960bffe4 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -836,7 +836,7 @@ static int do_help(int argc, char **argv)
 	}
 
 	fprintf(stderr,
-		"Usage: %s %s show [PROG]\n"
+		"Usage: %s %s { show | list } [PROG]\n"
 		"       %s %s dump xlated PROG [{ file FILE | opcodes }]\n"
 		"       %s %s dump jited  PROG [{ file FILE | opcodes }]\n"
 		"       %s %s pin   PROG FILE\n"
@@ -854,6 +854,7 @@ static int do_help(int argc, char **argv)
 
 static const struct cmd cmds[] = {
 	{ "show",	do_show },
+	{ "list",	do_show },
 	{ "help",	do_help },
 	{ "dump",	do_dump },
 	{ "pin",	do_pin },

From b4fac96d9a672def4e24c1858adb7e9351471815 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 2 Jan 2018 14:48:37 -0800
Subject: [PATCH 14/32] tools: bpftool: remove new lines from errors

It's a little bit unusual for kernel style, but we add the new line
character to error strings inside the p_err() function.  We do this
because new lines at the end of error strings will break JSON output.

Fix a few p_err("..\n") which snuck in recently.

Fixes: 5ccda64d38cc ("bpftool: implement cgroup bpf operations")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/cgroup.c | 18 +++++++++---------
 tools/bpf/bpftool/prog.c   |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 35f5f003df28..cae32a61cb18 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -124,16 +124,16 @@ static int do_show(int argc, char **argv)
 	int ret = -1;
 
 	if (argc < 1) {
-		p_err("too few parameters for cgroup show\n");
+		p_err("too few parameters for cgroup show");
 		goto exit;
 	} else if (argc > 1) {
-		p_err("too many parameters for cgroup show\n");
+		p_err("too many parameters for cgroup show");
 		goto exit;
 	}
 
 	cgroup_fd = open(argv[0], O_RDONLY);
 	if (cgroup_fd < 0) {
-		p_err("can't open cgroup %s\n", argv[1]);
+		p_err("can't open cgroup %s", argv[1]);
 		goto exit;
 	}
 
@@ -171,19 +171,19 @@ static int do_attach(int argc, char **argv)
 	int i;
 
 	if (argc < 4) {
-		p_err("too few parameters for cgroup attach\n");
+		p_err("too few parameters for cgroup attach");
 		goto exit;
 	}
 
 	cgroup_fd = open(argv[0], O_RDONLY);
 	if (cgroup_fd < 0) {
-		p_err("can't open cgroup %s\n", argv[1]);
+		p_err("can't open cgroup %s", argv[1]);
 		goto exit;
 	}
 
 	attach_type = parse_attach_type(argv[1]);
 	if (attach_type == __MAX_BPF_ATTACH_TYPE) {
-		p_err("invalid attach type\n");
+		p_err("invalid attach type");
 		goto exit_cgroup;
 	}
 
@@ -199,7 +199,7 @@ static int do_attach(int argc, char **argv)
 		} else if (is_prefix(argv[i], "override")) {
 			attach_flags |= BPF_F_ALLOW_OVERRIDE;
 		} else {
-			p_err("unknown option: %s\n", argv[i]);
+			p_err("unknown option: %s", argv[i]);
 			goto exit_cgroup;
 		}
 	}
@@ -229,13 +229,13 @@ static int do_detach(int argc, char **argv)
 	int ret = -1;
 
 	if (argc < 4) {
-		p_err("too few parameters for cgroup detach\n");
+		p_err("too few parameters for cgroup detach");
 		goto exit;
 	}
 
 	cgroup_fd = open(argv[0], O_RDONLY);
 	if (cgroup_fd < 0) {
-		p_err("can't open cgroup %s\n", argv[1]);
+		p_err("can't open cgroup %s", argv[1]);
 		goto exit;
 	}
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 5577960bffe4..c6a28be4665c 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -813,12 +813,12 @@ static int do_load(int argc, char **argv)
 		usage();
 
 	if (bpf_prog_load(argv[0], BPF_PROG_TYPE_UNSPEC, &obj, &prog_fd)) {
-		p_err("failed to load program\n");
+		p_err("failed to load program");
 		return -1;
 	}
 
 	if (do_pin_fd(prog_fd, argv[1])) {
-		p_err("failed to pin program\n");
+		p_err("failed to pin program");
 		return -1;
 	}
 

From c20a71a7a3841a6198721a0b4276f3298b38bbcf Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Jan 2018 17:57:39 -0800
Subject: [PATCH 15/32] bpf: sockmap remove unused function

This was added for some work that was eventually factored out but the
helper call was missed. Remove it now and add it back later if needed.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..3f662ee23a34 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
-/* compute the linear packet data range [data, data_end) for skb when
- * sk_skb type programs are in use.
- */
-static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
-{
-	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
-}
-
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,

From 5f103c5d4dbadec0f2cacd39b6429e1b8a8cf983 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Jan 2018 17:57:56 -0800
Subject: [PATCH 16/32] bpf: only build sockmap with CONFIG_INET

The sockmap infrastructure is only aware of TCP sockets at the
moment. In the future we plan to add UDP. In both cases CONFIG_NET
should be built-in.

So lets only build sockmap if CONFIG_INET is enabled.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h       | 2 +-
 include/linux/bpf_types.h | 2 +-
 kernel/bpf/Makefile       | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7810ae57b357..9e03046d1df2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -554,7 +554,7 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 }
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
+#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 978c1d9c9383..19b8349a3809 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -42,7 +42,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-#ifdef CONFIG_STREAM_PARSER
+#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e691da0b3bab..a713fd23ec88 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
+ifeq ($(CONFIG_INET),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
 endif
+endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif

From aecd67b60722dd24353b0bc50e78a55b30707dcd Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:13 +0100
Subject: [PATCH 17/32] xdp: base API for new XDP rx-queue info concept

This patch only introduce the core data structures and API functions.
All XDP enabled drivers must use the API before this info can used.

There is a need for XDP to know more about the RX-queue a given XDP
frames have arrived on.  For both the XDP bpf-prog and kernel side.

Instead of extending xdp_buff each time new info is needed, the patch
creates a separate read-mostly struct xdp_rxq_info, that contains this
info.  We stress this data/cache-line is for read-only info.  This is
NOT for dynamic per packet info, use the data_meta for such use-cases.

The performance advantage is this info can be setup at RX-ring init
time, instead of updating N-members in xdp_buff.  A possible (driver
level) micro optimization is that xdp_buff->rxq assignment could be
done once per XDP/NAPI loop.  The extra pointer deref only happens for
program needing access to this info (thus, no slowdown to existing
use-cases).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  2 ++
 include/net/xdp.h      | 47 +++++++++++++++++++++++++++++
 net/core/Makefile      |  2 +-
 net/core/xdp.c         | 67 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100644 include/net/xdp.h
 create mode 100644 net/core/xdp.c

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b0df2703671..425056c7f96c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,6 +20,7 @@
 #include <linux/set_memory.h>
 #include <linux/kallsyms.h>
 
+#include <net/xdp.h>
 #include <net/sch_generic.h>
 
 #include <uapi/linux/filter.h>
@@ -503,6 +504,7 @@ struct xdp_buff {
 	void *data_end;
 	void *data_meta;
 	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
 };
 
 /* Compute the linear packet data range [data, data_end) which
diff --git a/include/net/xdp.h b/include/net/xdp.h
new file mode 100644
index 000000000000..86c41631a908
--- /dev/null
+++ b/include/net/xdp.h
@@ -0,0 +1,47 @@
+/* include/net/xdp.h
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+#ifndef __LINUX_NET_XDP_H__
+#define __LINUX_NET_XDP_H__
+
+/**
+ * DOC: XDP RX-queue information
+ *
+ * The XDP RX-queue info (xdp_rxq_info) is associated with the driver
+ * level RX-ring queues.  It is information that is specific to how
+ * the driver have configured a given RX-ring queue.
+ *
+ * Each xdp_buff frame received in the driver carry a (pointer)
+ * reference to this xdp_rxq_info structure.  This provides the XDP
+ * data-path read-access to RX-info for both kernel and bpf-side
+ * (limited subset).
+ *
+ * For now, direct access is only safe while running in NAPI/softirq
+ * context.  Contents is read-mostly and must not be updated during
+ * driver NAPI/softirq poll.
+ *
+ * The driver usage API is a register and unregister API.
+ *
+ * The struct is not directly tied to the XDP prog.  A new XDP prog
+ * can be attached as long as it doesn't change the underlying
+ * RX-ring.  If the RX-ring does change significantly, the NIC driver
+ * naturally need to stop the RX-ring before purging and reallocating
+ * memory.  In that process the driver MUST call unregistor (which
+ * also apply for driver shutdown and unload).  The register API is
+ * also mandatory during RX-ring setup.
+ */
+
+struct xdp_rxq_info {
+	struct net_device *dev;
+	u32 queue_index;
+	u32 reg_state;
+} ____cacheline_aligned; /* perf critical, avoid false-sharing */
+
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index);
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
+
+#endif /* __LINUX_NET_XDP_H__ */
diff --git a/net/core/Makefile b/net/core/Makefile
index 1fd0a9c88b1b..6dbbba8c57ae 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
 			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
-			fib_notifier.o
+			fib_notifier.o xdp.o
 
 obj-y += net-sysfs.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
diff --git a/net/core/xdp.c b/net/core/xdp.c
new file mode 100644
index 000000000000..229bc5a0ee04
--- /dev/null
+++ b/net/core/xdp.c
@@ -0,0 +1,67 @@
+/* net/core/xdp.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+#include <linux/types.h>
+#include <linux/mm.h>
+
+#include <net/xdp.h>
+
+#define REG_STATE_NEW		0x0
+#define REG_STATE_REGISTERED	0x1
+#define REG_STATE_UNREGISTERED	0x2
+#define REG_STATE_UNUSED	0x3
+
+void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
+{
+	/* Simplify driver cleanup code paths, allow unreg "unused" */
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED)
+		return;
+
+	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
+
+	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
+	xdp_rxq->dev = NULL;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
+
+static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
+{
+	memset(xdp_rxq, 0, sizeof(*xdp_rxq));
+}
+
+/* Returns 0 on success, negative on failure */
+int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+		     struct net_device *dev, u32 queue_index)
+{
+	if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
+		WARN(1, "Driver promised not to register this");
+		return -EINVAL;
+	}
+
+	if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
+		WARN(1, "Missing unregister, handled but fix driver");
+		xdp_rxq_info_unreg(xdp_rxq);
+	}
+
+	if (!dev) {
+		WARN(1, "Missing net_device from driver");
+		return -ENODEV;
+	}
+
+	/* State either UNREGISTERED or NEW */
+	xdp_rxq_info_init(xdp_rxq);
+	xdp_rxq->dev = dev;
+	xdp_rxq->queue_index = queue_index;
+
+	xdp_rxq->reg_state = REG_STATE_REGISTERED;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+
+void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
+{
+	xdp_rxq->reg_state = REG_STATE_UNUSED;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);

From 0ddf543226acacfb9f521dafc6c817d5b04c7b1f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:18 +0100
Subject: [PATCH 18/32] xdp/mlx5: setup xdp_rxq_info

The mlx5 driver have a special drop-RQ queue (one per interface) that
simply drops all incoming traffic. It helps driver keep other HW
objects (flow steering) alive upon down/up operations.  It is
temporarily pointed by flow steering objects during the interface
setup, and when interface is down. It lacks many fields that are set
in a regular RQ (for example its state is never switched to
MLX5_RQC_STATE_RDY). (Thanks to Tariq Toukan for explanation).

The XDP RX-queue info for this drop-RQ marked as unused, which
allow us to use the same takedown/free code path as other RX-queues.

Driver hook points for xdp_rxq_info:
 * reg   : mlx5e_alloc_rq()
 * unused: mlx5e_alloc_drop_rq()
 * unreg : mlx5e_free_rq()

Tested on actual hardware with samples/bpf program

Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Matan Barak <matanb@mellanox.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 4 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 1 +
 3 files changed, 14 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 543060c305a0..5299310f2481 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -46,6 +46,7 @@
 #include <linux/mlx5/transobj.h>
 #include <linux/rhashtable.h>
 #include <net/switchdev.h>
+#include <net/xdp.h>
 #include "wq.h"
 #include "mlx5_core.h"
 #include "en_stats.h"
@@ -571,6 +572,9 @@ struct mlx5e_rq {
 	u32                    rqn;
 	struct mlx5_core_dev  *mdev;
 	struct mlx5_core_mkey  umr_mkey;
+
+	/* XDP read-mostly */
+	struct xdp_rxq_info    xdp_rxq;
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_channel {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3aa1c90e7c86..539bd1d24396 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -582,6 +582,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		goto err_rq_wq_destroy;
 	}
 
+	if (xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix) < 0)
+		goto err_rq_wq_destroy;
+
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 	rq->buff.headroom = params->rq_headroom;
 
@@ -687,6 +690,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 err_rq_wq_destroy:
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
+	xdp_rxq_info_unreg(&rq->xdp_rxq);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -699,6 +703,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 
+	xdp_rxq_info_unreg(&rq->xdp_rxq);
+
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		mlx5e_rq_free_mpwqe_info(rq);
@@ -2766,6 +2772,9 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 	if (err)
 		return err;
 
+	/* Mark as unused given "Drop-RQ" packets never reach XDP */
+	xdp_rxq_info_unused(&rq->xdp_rxq);
+
 	rq->mdev = mdev;
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5b499c7a698f..7b38480811d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -812,6 +812,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 	xdp.data_hard_start = va;
+	xdp.rxq = &rq->xdp_rxq;
 
 	act = bpf_prog_run_xdp(prog, &xdp);
 	switch (act) {

From 871288248de23d5c87433dcd94910ff813495588 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:23 +0100
Subject: [PATCH 19/32] i40e: setup xdp_rxq_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The i40e driver has a special "FDIR" RX-ring (I40E_VSI_FDIR) which is
a sideband channel for configuring/updating the flow director tables.
This (i40e_vsi_)type does not invoke XDP-ebpf code.

As suggested by Björn (V2): Instead of marking this I40E_VSI_FDIR RX-ring
a special case, reverse the logic and only select RX-rings of type
I40E_VSI_MAIN to register xdp_rxq_info's for.

Driver hook points for xdp_rxq_info:
 * reg  : i40e_setup_rx_descriptors (via i40e_vsi_setup_rx_resources)
 * unreg: i40e_free_rx_resources    (via i40e_vsi_free_rx_resources)

Tested on actual hardware with samples/bpf program.

V2: Fixed bug in i40e_set_ringparam (memset zero) + match on I40E_VSI_MAIN.
V4: Update patch desc that got out-of-sync with code.

Cc: intel-wired-lan@lists.osuosl.org
Cc: Björn Töpel <bjorn.topel@intel.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  2 ++
 drivers/net/ethernet/intel/i40e/i40e_txrx.c    | 18 ++++++++++++++++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.h    |  3 +++
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 5f6cf7212d4f..cfd788b4fd7a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1585,6 +1585,8 @@ static int i40e_set_ringparam(struct net_device *netdev,
 			 */
 			rx_rings[i].desc = NULL;
 			rx_rings[i].rx_bi = NULL;
+			/* Clear cloned XDP RX-queue info before setup call */
+			memset(&rx_rings[i].xdp_rxq, 0, sizeof(rx_rings[i].xdp_rxq));
 			/* this is to allow wr32 to have something to write to
 			 * during early allocation of Rx buffers
 			 */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 4566d66ffc7c..2a8a85e3ae8f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -27,6 +27,7 @@
 #include <linux/prefetch.h>
 #include <net/busy_poll.h>
 #include <linux/bpf_trace.h>
+#include <net/xdp.h>
 #include "i40e.h"
 #include "i40e_trace.h"
 #include "i40e_prototype.h"
@@ -1236,6 +1237,8 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
 void i40e_free_rx_resources(struct i40e_ring *rx_ring)
 {
 	i40e_clean_rx_ring(rx_ring);
+	if (rx_ring->vsi->type == I40E_VSI_MAIN)
+		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	rx_ring->xdp_prog = NULL;
 	kfree(rx_ring->rx_bi);
 	rx_ring->rx_bi = NULL;
@@ -1256,6 +1259,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
 int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
 {
 	struct device *dev = rx_ring->dev;
+	int err = -ENOMEM;
 	int bi_size;
 
 	/* warn if we are about to overwrite the pointer */
@@ -1283,13 +1287,21 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
+	/* XDP RX-queue info only needed for RX rings exposed to XDP */
+	if (rx_ring->vsi->type == I40E_VSI_MAIN) {
+		err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+				       rx_ring->queue_index);
+		if (err < 0)
+			goto err;
+	}
+
 	rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
 
 	return 0;
 err:
 	kfree(rx_ring->rx_bi);
 	rx_ring->rx_bi = NULL;
-	return -ENOMEM;
+	return err;
 }
 
 /**
@@ -2068,11 +2080,13 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 	struct sk_buff *skb = rx_ring->skb;
 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
 	bool failure = false, xdp_xmit = false;
+	struct xdp_buff xdp;
+
+	xdp.rxq = &rx_ring->xdp_rxq;
 
 	while (likely(total_rx_packets < (unsigned int)budget)) {
 		struct i40e_rx_buffer *rx_buffer;
 		union i40e_rx_desc *rx_desc;
-		struct xdp_buff xdp;
 		unsigned int size;
 		u16 vlan_tag;
 		u8 rx_ptype;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index fbae1182e2ea..2d08760fc4ce 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -27,6 +27,8 @@
 #ifndef _I40E_TXRX_H_
 #define _I40E_TXRX_H_
 
+#include <net/xdp.h>
+
 /* Interrupt Throttling and Rate Limiting Goodies */
 
 #define I40E_MAX_ITR               0x0FF0  /* reg uses 2 usec resolution */
@@ -428,6 +430,7 @@ struct i40e_ring {
 					 */
 
 	struct i40e_channel *ch;
+	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)

From 99ffc5ade4e8703c3bc56fa6bb8e25437da09ee9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:29 +0100
Subject: [PATCH 20/32] ixgbe: setup xdp_rxq_info

Driver hook points for xdp_rxq_info:
 * reg  : ixgbe_setup_rx_resources()
 * unreg: ixgbe_free_rx_resources()

Tested on actual hardware.

V2: Fix ixgbe_set_ringparam, clear xdp_rxq_info in temp_ring

Cc: intel-wired-lan@lists.osuosl.org
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |  2 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |  4 ++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    | 10 +++++++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 468c3555a629..8611763d6129 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -53,6 +53,7 @@
 #include <linux/dca.h>
 #endif
 
+#include <net/xdp.h>
 #include <net/busy_poll.h>
 
 /* common prefix used by pr_<> macros */
@@ -371,6 +372,7 @@ struct ixgbe_ring {
 		struct ixgbe_tx_queue_stats tx_stats;
 		struct ixgbe_rx_queue_stats rx_stats;
 	};
+	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_internodealigned_in_smp;
 
 enum ixgbe_ring_f_enum {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 0aad1c2a3667..0aaf70b3cfcd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1156,6 +1156,10 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
 			memcpy(&temp_ring[i], adapter->rx_ring[i],
 			       sizeof(struct ixgbe_ring));
 
+			/* Clear copied XDP RX-queue info */
+			memset(&temp_ring[i].xdp_rxq, 0,
+			       sizeof(temp_ring[i].xdp_rxq));
+
 			temp_ring[i].count = new_rx_count;
 			err = ixgbe_setup_rx_resources(adapter, &temp_ring[i]);
 			if (err) {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 7737a05c717c..95aba975b391 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2318,12 +2318,14 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 #endif /* IXGBE_FCOE */
 	u16 cleaned_count = ixgbe_desc_unused(rx_ring);
 	bool xdp_xmit = false;
+	struct xdp_buff xdp;
+
+	xdp.rxq = &rx_ring->xdp_rxq;
 
 	while (likely(total_rx_packets < budget)) {
 		union ixgbe_adv_rx_desc *rx_desc;
 		struct ixgbe_rx_buffer *rx_buffer;
 		struct sk_buff *skb;
-		struct xdp_buff xdp;
 		unsigned int size;
 
 		/* return some buffers to hardware, one at a time is too slow */
@@ -6444,6 +6446,11 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
+	/* XDP RX-queue info */
+	if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
+			     rx_ring->queue_index) < 0)
+		goto err;
+
 	rx_ring->xdp_prog = adapter->xdp_prog;
 
 	return 0;
@@ -6541,6 +6548,7 @@ void ixgbe_free_rx_resources(struct ixgbe_ring *rx_ring)
 	ixgbe_clean_rx_ring(rx_ring);
 
 	rx_ring->xdp_prog = NULL;
+	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	vfree(rx_ring->rx_buffer_info);
 	rx_ring->rx_buffer_info = NULL;
 

From c0124f327e5cabd844a10d7e1fc5aa2a81e796a9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:34 +0100
Subject: [PATCH 21/32] xdp/qede: setup xdp_rxq_info and intro
 xdp_rxq_info_is_reg

The driver code qede_free_fp_array() depend on kfree() can be called
with a NULL pointer. This stems from the qede_alloc_fp_array()
function which either (kz)alloc memory for fp->txq or fp->rxq.
This also simplifies error handling code in case of memory allocation
failures, but xdp_rxq_info_unreg need to know the difference.

Introduce xdp_rxq_info_is_reg() to handle if a memory allocation fails
and detect this is the failure path by seeing that xdp_rxq_info was
not registred yet, which first happens after successful alloaction in
qede_init_fp().

Driver hook points for xdp_rxq_info:
 * reg  : qede_init_fp
 * unreg: qede_free_fp_array

Tested on actual hardware with samples/bpf program.

V2: Driver have no proper error path for failed XDP RX-queue info reg, as
qede_init_fp() is a void function.

Cc: everest-linux-l2@cavium.com
Cc: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/qlogic/qede/qede.h      |  2 ++
 drivers/net/ethernet/qlogic/qede/qede_fp.c   |  1 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 10 ++++++++++
 include/net/xdp.h                            |  1 +
 net/core/xdp.c                               |  6 ++++++
 5 files changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 8a336517baac..8116cfd30fad 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -40,6 +40,7 @@
 #include <linux/kernel.h>
 #include <linux/mutex.h>
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/qed/qede_rdma.h>
 #include <linux/io.h>
 #ifdef CONFIG_RFS_ACCEL
@@ -345,6 +346,7 @@ struct qede_rx_queue {
 	u64 xdp_no_pass;
 
 	void *handle;
+	struct xdp_rxq_info xdp_rxq;
 };
 
 union db_prod {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 48ec4c56cddf..dafc079ab6b9 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1006,6 +1006,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 	xdp.data = xdp.data_hard_start + *data_offset;
 	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
+	xdp.rxq = &rxq->xdp_rxq;
 
 	/* Queues always have a full reset currently, so for the time
 	 * being until there's atomic program replace just mark read
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 90d79ae2a48f..9929b4370ce6 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -765,6 +765,12 @@ static void qede_free_fp_array(struct qede_dev *edev)
 			fp = &edev->fp_array[i];
 
 			kfree(fp->sb_info);
+			/* Handle mem alloc failure case where qede_init_fp
+			 * didn't register xdp_rxq_info yet.
+			 * Implicit only (fp->type & QEDE_FASTPATH_RX)
+			 */
+			if (fp->rxq && xdp_rxq_info_is_reg(&fp->rxq->xdp_rxq))
+				xdp_rxq_info_unreg(&fp->rxq->xdp_rxq);
 			kfree(fp->rxq);
 			kfree(fp->xdp_tx);
 			kfree(fp->txq);
@@ -1493,6 +1499,10 @@ static void qede_init_fp(struct qede_dev *edev)
 			else
 				fp->rxq->data_direction = DMA_FROM_DEVICE;
 			fp->rxq->dev = &edev->pdev->dev;
+
+			/* Driver have no error path from here */
+			WARN_ON(xdp_rxq_info_reg(&fp->rxq->xdp_rxq, edev->ndev,
+						 fp->rxq->rxq_id) < 0);
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 86c41631a908..b2362ddfa694 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -43,5 +43,6 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
 
 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 229bc5a0ee04..097a0f74e004 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -65,3 +65,9 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
 	xdp_rxq->reg_state = REG_STATE_UNUSED;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);
+
+bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
+{
+	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);

From ae75415de1110613aa437ddf1196af6d33f1c90a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:39 +0100
Subject: [PATCH 22/32] mlx4: setup xdp_rxq_info

Driver hook points for xdp_rxq_info:
 * reg  : mlx4_en_create_rx_ring
 * unreg: mlx4_en_destroy_rx_ring

Tested on actual hardware.

Cc: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 13 ++++++++++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  4 +++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 21bc17fa3854..8fc51bc29003 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2172,8 +2172,9 @@ static int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
 
 		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
 					   prof->rx_ring_size, priv->stride,
-					   node))
+					   node, i))
 			goto err;
+
 	}
 
 #ifdef CONFIG_RFS_ACCEL
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 5f9dbc9a7f5b..b4d144e67514 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -262,7 +262,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
 
 int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_rx_ring **pring,
-			   u32 size, u16 stride, int node)
+			   u32 size, u16 stride, int node, int queue_index)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring;
@@ -286,6 +286,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 	ring->log_stride = ffs(ring->stride) - 1;
 	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
 
+	if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0)
+		goto err_ring;
+
 	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
 					sizeof(struct mlx4_en_rx_alloc));
 	ring->rx_info = vzalloc_node(tmp, node);
@@ -293,7 +296,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 		ring->rx_info = vzalloc(tmp);
 		if (!ring->rx_info) {
 			err = -ENOMEM;
-			goto err_ring;
+			goto err_xdp_info;
 		}
 	}
 
@@ -317,6 +320,8 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 err_info:
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
+err_xdp_info:
+	xdp_rxq_info_unreg(&ring->xdp_rxq);
 err_ring:
 	kfree(ring);
 	*pring = NULL;
@@ -440,6 +445,7 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 					lockdep_is_held(&mdev->state_lock));
 	if (old_prog)
 		bpf_prog_put(old_prog);
+	xdp_rxq_info_unreg(&ring->xdp_rxq);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
 	vfree(ring->rx_info);
 	ring->rx_info = NULL;
@@ -652,6 +658,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	int cq_ring = cq->ring;
 	bool doorbell_pending;
 	struct mlx4_cqe *cqe;
+	struct xdp_buff xdp;
 	int polled = 0;
 	int index;
 
@@ -666,6 +673,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	/* Protect accesses to: ring->xdp_prog, priv->mac_hash list */
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(ring->xdp_prog);
+	xdp.rxq = &ring->xdp_rxq;
 	doorbell_pending = 0;
 
 	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
@@ -750,7 +758,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		 * read bytes but not past the end of the frag.
 		 */
 		if (xdp_prog) {
-			struct xdp_buff xdp;
 			dma_addr_t dma;
 			void *orig_data;
 			u32 act;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 7db3d0d9bfce..f470ae37d937 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -46,6 +46,7 @@
 #endif
 #include <linux/cpu_rmap.h>
 #include <linux/ptp_clock_kernel.h>
+#include <net/xdp.h>
 
 #include <linux/mlx4/device.h>
 #include <linux/mlx4/qp.h>
@@ -356,6 +357,7 @@ struct mlx4_en_rx_ring {
 	unsigned long dropped;
 	int hwtstamp_rx_filter;
 	cpumask_var_t affinity_mask;
+	struct xdp_rxq_info xdp_rxq;
 };
 
 struct mlx4_en_cq {
@@ -720,7 +722,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev);
 void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv);
 int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_rx_ring **pring,
-			   u32 size, u16 stride, int node);
+			   u32 size, u16 stride, int node, int queue_index);
 void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_rx_ring **pring,
 			     u32 size, u16 stride);

From 96a8604f95fa216b9ddfd15c687eed42a2f56901 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:44 +0100
Subject: [PATCH 23/32] bnxt_en: setup xdp_rxq_info

Driver hook points for xdp_rxq_info:
 * reg  : bnxt_alloc_rx_rings
 * unreg: bnxt_free_rx_rings

This driver should be updated to re-register when changing
allocation mode of RX rings.

Tested on actual hardware.

Cc: Andy Gospodarek <andy@greyhouse.net>
Cc: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 10 ++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  2 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c |  1 +
 3 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 9efbdc6f1fcb..89c3c8760a78 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2247,6 +2247,9 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
 		if (rxr->xdp_prog)
 			bpf_prog_put(rxr->xdp_prog);
 
+		if (xdp_rxq_info_is_reg(&rxr->xdp_rxq))
+			xdp_rxq_info_unreg(&rxr->xdp_rxq);
+
 		kfree(rxr->rx_tpa);
 		rxr->rx_tpa = NULL;
 
@@ -2280,6 +2283,10 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
 
 		ring = &rxr->rx_ring_struct;
 
+		rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i);
+		if (rc < 0)
+			return rc;
+
 		rc = bnxt_alloc_ring(bp, ring);
 		if (rc)
 			return rc;
@@ -2834,6 +2841,9 @@ void bnxt_set_ring_params(struct bnxt *bp)
 	bp->cp_ring_mask = bp->cp_bit - 1;
 }
 
+/* Changing allocation mode of RX rings.
+ * TODO: Update when extending xdp_rxq_info to support allocation modes.
+ */
 int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode)
 {
 	if (page_mode) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 5359a1f0045f..2d268fc26f5e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -23,6 +23,7 @@
 #include <net/devlink.h>
 #include <net/dst_metadata.h>
 #include <net/switchdev.h>
+#include <net/xdp.h>
 
 struct tx_bd {
 	__le32 tx_bd_len_flags_type;
@@ -664,6 +665,7 @@ struct bnxt_rx_ring_info {
 
 	struct bnxt_ring_struct	rx_ring_struct;
 	struct bnxt_ring_struct	rx_agg_ring_struct;
+	struct xdp_rxq_info	xdp_rxq;
 };
 
 struct bnxt_cp_ring_info {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 261e5847557a..1389ab5e05df 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -96,6 +96,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 	xdp.data = *data_ptr;
 	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
+	xdp.rxq = &rxr->xdp_rxq;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
 

From 7f1c684a8966d19177a0440113a39007f95af38d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:49 +0100
Subject: [PATCH 24/32] nfp: setup xdp_rxq_info

Driver hook points for xdp_rxq_info:
 * reg  : nfp_net_rx_ring_alloc
 * unreg: nfp_net_rx_ring_free

In struct nfp_net_rx_ring moved member @size into a hole on 64-bit.
Thus, the size remaines the same after adding member @xdp_rxq.

Cc: oss-drivers@netronome.com
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h        |  5 ++++-
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 ++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 3801c52098d5..0e564cfabe7e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -47,6 +47,7 @@
 #include <linux/netdevice.h>
 #include <linux/pci.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
+#include <net/xdp.h>
 
 #include "nfp_net_ctrl.h"
 
@@ -350,6 +351,7 @@ struct nfp_net_rx_buf {
  * @rxds:       Virtual address of FL/RX ring in host memory
  * @dma:        DMA address of the FL/RX ring
  * @size:       Size, in bytes, of the FL/RX ring (needed to free)
+ * @xdp_rxq:    RX-ring info avail for XDP
  */
 struct nfp_net_rx_ring {
 	struct nfp_net_r_vector *r_vec;
@@ -361,13 +363,14 @@ struct nfp_net_rx_ring {
 	u32 idx;
 
 	int fl_qcidx;
+	unsigned int size;
 	u8 __iomem *qcp_fl;
 
 	struct nfp_net_rx_buf *rxbufs;
 	struct nfp_net_rx_desc *rxds;
 
 	dma_addr_t dma;
-	unsigned int size;
+	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_aligned;
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 0add4870ce2e..45b8cae937be 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1608,11 +1608,13 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 	unsigned int true_bufsz;
 	struct sk_buff *skb;
 	int pkts_polled = 0;
+	struct xdp_buff xdp;
 	int idx;
 
 	rcu_read_lock();
 	xdp_prog = READ_ONCE(dp->xdp_prog);
 	true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz;
+	xdp.rxq = &rx_ring->xdp_rxq;
 	tx_ring = r_vec->xdp_ring;
 
 	while (pkts_polled < budget) {
@@ -1703,7 +1705,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 				  dp->bpf_offload_xdp) && !meta.portid) {
 			void *orig_data = rxbuf->frag + pkt_off;
 			unsigned int dma_off;
-			struct xdp_buff xdp;
 			int act;
 
 			xdp.data_hard_start = rxbuf->frag + NFP_NET_RX_BUF_HEADROOM;
@@ -2252,6 +2253,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
 	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
 
+	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 	kfree(rx_ring->rxbufs);
 
 	if (rx_ring->rxds)
@@ -2275,7 +2277,11 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 static int
 nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
 {
-	int sz;
+	int sz, err;
+
+	err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev, rx_ring->idx);
+	if (err < 0)
+		return err;
 
 	rx_ring->cnt = dp->rxd_cnt;
 	rx_ring->size = sizeof(*rx_ring->rxds) * rx_ring->cnt;

From 27e95e3648910c81a0840aa10dde77323795519e Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:54 +0100
Subject: [PATCH 25/32] thunderx: setup xdp_rxq_info

This driver uses a bool scheme for "enable"/"disable" when setting up
different resources.  Thus, the hook points for xdp_rxq_info is done
in the same function call nicvf_rcv_queue_config().  This is activated
through enable/disable via nicvf_config_data_transfer(), which is tied
into nicvf_stop()/nicvf_open().

Extending driver packet handler call-path nicvf_rcv_pkt_handler() with
a pointer to the given struct rcv_queue, in-order to access the
xdp_rxq_info data area (in nicvf_xdp_rx()).

V2: Driver have no proper error path for failed XDP RX-queue info reg,
as nicvf_rcv_queue_config is a void function.

Cc: linux-arm-kernel@lists.infradead.org
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Robert Richter <rric@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 11 +++++++----
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  4 ++++
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |  2 ++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 52b3a6044f85..21618d0d694f 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -521,7 +521,7 @@ static void nicvf_unmap_page(struct nicvf *nic, struct page *page, u64 dma_addr)
 
 static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 				struct cqe_rx_t *cqe_rx, struct snd_queue *sq,
-				struct sk_buff **skb)
+				struct rcv_queue *rq, struct sk_buff **skb)
 {
 	struct xdp_buff xdp;
 	struct page *page;
@@ -545,6 +545,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 	xdp.data = (void *)cpu_addr;
 	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
+	xdp.rxq = &rq->xdp_rxq;
 	orig_data = xdp.data;
 
 	rcu_read_lock();
@@ -698,7 +699,8 @@ static inline void nicvf_set_rxhash(struct net_device *netdev,
 
 static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 				  struct napi_struct *napi,
-				  struct cqe_rx_t *cqe_rx, struct snd_queue *sq)
+				  struct cqe_rx_t *cqe_rx,
+				  struct snd_queue *sq, struct rcv_queue *rq)
 {
 	struct sk_buff *skb = NULL;
 	struct nicvf *nic = netdev_priv(netdev);
@@ -724,7 +726,7 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 	/* For XDP, ignore pkts spanning multiple pages */
 	if (nic->xdp_prog && (cqe_rx->rb_cnt == 1)) {
 		/* Packet consumed by XDP */
-		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq, &skb))
+		if (nicvf_xdp_rx(snic, nic->xdp_prog, cqe_rx, sq, rq, &skb))
 			return;
 	} else {
 		skb = nicvf_get_rcv_skb(snic, cqe_rx,
@@ -781,6 +783,7 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 	struct cqe_rx_t *cq_desc;
 	struct netdev_queue *txq;
 	struct snd_queue *sq = &qs->sq[cq_idx];
+	struct rcv_queue *rq = &qs->rq[cq_idx];
 	unsigned int tx_pkts = 0, tx_bytes = 0, txq_idx;
 
 	spin_lock_bh(&cq->lock);
@@ -811,7 +814,7 @@ static int nicvf_cq_intr_handler(struct net_device *netdev, u8 cq_idx,
 
 		switch (cq_desc->cqe_type) {
 		case CQE_TYPE_RX:
-			nicvf_rcv_pkt_handler(netdev, napi, cq_desc, sq);
+			nicvf_rcv_pkt_handler(netdev, napi, cq_desc, sq, rq);
 			work_done++;
 		break;
 		case CQE_TYPE_SEND:
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index f38ea349aa00..14e62c6ac342 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -760,6 +760,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
 
 	if (!rq->enable) {
 		nicvf_reclaim_rcv_queue(nic, qs, qidx);
+		xdp_rxq_info_unreg(&rq->xdp_rxq);
 		return;
 	}
 
@@ -772,6 +773,9 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs,
 	/* all writes of RBDR data to be loaded into L2 Cache as well*/
 	rq->caching = 1;
 
+	/* Driver have no proper error path for failed XDP RX-queue info reg */
+	WARN_ON(xdp_rxq_info_reg(&rq->xdp_rxq, nic->netdev, qidx) < 0);
+
 	/* Send a mailbox msg to PF to config RQ */
 	mbx.rq.msg = NIC_MBOX_MSG_RQ_CFG;
 	mbx.rq.qs_num = qs->vnic_id;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index 178ab6e8e3c5..7d1e4e2aaad0 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -12,6 +12,7 @@
 #include <linux/netdevice.h>
 #include <linux/iommu.h>
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include "q_struct.h"
 
 #define MAX_QUEUE_SET			128
@@ -255,6 +256,7 @@ struct rcv_queue {
 	u8		start_qs_rbdr_idx; /* RBDR idx in the above QS */
 	u8		caching;
 	struct		rx_tx_queue_stats stats;
+	struct xdp_rxq_info xdp_rxq;
 } ____cacheline_aligned_in_smp;
 
 struct cmp_queue {

From 8bf5c4ee1889308ccd396fdfd40ac94129ee419f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:25:59 +0100
Subject: [PATCH 26/32] tun: setup xdp_rxq_info

Driver hook points for xdp_rxq_info:
 * reg  : tun_attach
 * unreg: __tun_detach

I've done some manual testing of this tun driver, but I would
appriciate good review and someone else running their use-case tests,
as I'm not 100% sure I understand the tfile->detached semantics.

V2: Removed the skb_array_cleanup() call from V1 by request from Jason Wang.

Cc: Jason Wang <jasowang@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/tun.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e367d6310353..e7c5f4b2a9a6 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -180,6 +180,7 @@ struct tun_file {
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
+	struct xdp_rxq_info xdp_rxq;
 };
 
 struct tun_flow_entry {
@@ -687,8 +688,10 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 			    tun->dev->reg_state == NETREG_REGISTERED)
 				unregister_netdevice(tun->dev);
 		}
-		if (tun)
+		if (tun) {
 			skb_array_cleanup(&tfile->tx_array);
+			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+		}
 		sock_put(&tfile->sk);
 	}
 }
@@ -728,11 +731,13 @@ static void tun_detach_all(struct net_device *dev)
 		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
 	}
 	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 		tun_enable_queue(tfile);
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
 	}
 	BUG_ON(tun->numdisabled != 0);
@@ -784,6 +789,22 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 
 	tfile->queue_index = tun->numqueues;
 	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
+
+	if (tfile->detached) {
+		/* Re-attach detached tfile, updating XDP queue_index */
+		WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
+
+		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
+			tfile->xdp_rxq.queue_index = tfile->queue_index;
+	} else {
+		/* Setup XDP RX-queue info, for new tfile getting attached */
+		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
+				       tun->dev, tfile->queue_index);
+		if (err < 0)
+			goto out;
+		err = 0;
+	}
+
 	rcu_assign_pointer(tfile->tun, tun);
 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 	tun->numqueues++;
@@ -1508,6 +1529,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		xdp.data = buf + pad;
 		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
+		xdp.rxq = &tfile->xdp_rxq;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 

From 754b8a21a96d5f11712245aef907149606b323ae Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:04 +0100
Subject: [PATCH 27/32] virtio_net: setup xdp_rxq_info

The virtio_net driver doesn't dynamically change the RX-ring queue
layout and backing pages, but instead reject XDP setup if all the
conditions for XDP is not meet.  Thus, the xdp_rxq_info also remains
fairly static.  This allow us to simply add the reg/unreg to
net_device open/close functions.

Driver hook points for xdp_rxq_info:
 * reg  : virtnet_open
 * unreg: virtnet_close

V3:
 - bugfix, also setup xdp.rxq in receive_mergeable()
 - Tested bpf-sample prog inside guest on a virtio_net device

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/virtio_net.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6fb7b658a6cc..ed8299343728 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -31,6 +31,7 @@
 #include <linux/average.h>
 #include <linux/filter.h>
 #include <net/route.h>
+#include <net/xdp.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -115,6 +116,8 @@ struct receive_queue {
 
 	/* Name of this receive queue: input.$index */
 	char name[40];
+
+	struct xdp_rxq_info xdp_rxq;
 };
 
 struct virtnet_info {
@@ -559,6 +562,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		xdp.data = xdp.data_hard_start + xdp_headroom;
 		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
+		xdp.rxq = &rq->xdp_rxq;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
@@ -692,6 +696,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		xdp.data = data + vi->hdr_len;
 		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
+		xdp.rxq = &rq->xdp_rxq;
+
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
 		if (act != XDP_PASS)
@@ -1225,13 +1231,18 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
 static int virtnet_open(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int i;
+	int i, err;
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		if (i < vi->curr_queue_pairs)
 			/* Make sure we have some buffers: if oom use wq. */
 			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
 				schedule_delayed_work(&vi->refill, 0);
+
+		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
+		if (err < 0)
+			return err;
+
 		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
 	}
@@ -1560,6 +1571,7 @@ static int virtnet_close(struct net_device *dev)
 	cancel_delayed_work_sync(&vi->refill);
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
+		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
 		napi_disable(&vi->rq[i].napi);
 		virtnet_napi_tx_disable(&vi->sq[i].napi);
 	}

From e817f85652c14d78f170b18797e4c477c78949e0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:09 +0100
Subject: [PATCH 28/32] xdp: generic XDP handling of xdp_rxq_info

Hook points for xdp_rxq_info:
 * reg  : netif_alloc_rx_queues
 * unreg: netif_free_rx_queues

The net_device have some members (num_rx_queues + real_num_rx_queues)
and data-area (dev->_rx with struct netdev_rx_queue's) that were
primarily used for exporting information about RPS (CONFIG_RPS) queues
to sysfs (CONFIG_SYSFS).

For generic XDP extend struct netdev_rx_queue with the xdp_rxq_info,
and remove some of the CONFIG_SYSFS ifdefs.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 69 +++++++++++++++++++++++++++++++++------
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 49bfc6eec74c..440b000f07f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -44,6 +44,7 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
+#include <net/xdp.h>
 
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
@@ -686,6 +687,7 @@ struct netdev_rx_queue {
 #endif
 	struct kobject			kobj;
 	struct net_device		*dev;
+	struct xdp_rxq_info		xdp_rxq;
 } ____cacheline_aligned_in_smp;
 
 /*
diff --git a/net/core/dev.c b/net/core/dev.c
index 2eb66c0d9cdb..d7925ef8743d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3906,9 +3906,33 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	return NET_RX_DROP;
 }
 
+static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct netdev_rx_queue *rxqueue;
+
+	rxqueue = dev->_rx;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+
+		if (unlikely(index >= dev->real_num_rx_queues)) {
+			WARN_ONCE(dev->real_num_rx_queues > 1,
+				  "%s received packet on queue %u, but number "
+				  "of RX queues is %u\n",
+				  dev->name, index, dev->real_num_rx_queues);
+
+			return rxqueue; /* Return first rxqueue */
+		}
+		rxqueue += index;
+	}
+	return rxqueue;
+}
+
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	struct netdev_rx_queue *rxqueue;
 	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
 	void *orig_data;
@@ -3952,6 +3976,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
 
+	rxqueue = netif_get_rxqueue(skb);
+	xdp.rxq = &rxqueue->xdp_rxq;
+
 	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
 	off = xdp.data - orig_data;
@@ -7589,12 +7616,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
-#ifdef CONFIG_SYSFS
 static int netif_alloc_rx_queues(struct net_device *dev)
 {
 	unsigned int i, count = dev->num_rx_queues;
 	struct netdev_rx_queue *rx;
 	size_t sz = count * sizeof(*rx);
+	int err = 0;
 
 	BUG_ON(count < 1);
 
@@ -7604,11 +7631,39 @@ static int netif_alloc_rx_queues(struct net_device *dev)
 
 	dev->_rx = rx;
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		rx[i].dev = dev;
+
+		/* XDP RX-queue setup */
+		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+		if (err < 0)
+			goto err_rxq_info;
+	}
 	return 0;
+
+err_rxq_info:
+	/* Rollback successful reg's and free other resources */
+	while (i--)
+		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
+	kfree(dev->_rx);
+	dev->_rx = NULL;
+	return err;
+}
+
+static void netif_free_rx_queues(struct net_device *dev)
+{
+	unsigned int i, count = dev->num_rx_queues;
+	struct netdev_rx_queue *rx;
+
+	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
+	if (!dev->_rx)
+		return;
+
+	rx = dev->_rx;
+
+	for (i = 0; i < count; i++)
+		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
 }
-#endif
 
 static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue, void *_unused)
@@ -8169,12 +8224,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-#ifdef CONFIG_SYSFS
 	if (rxqs < 1) {
 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
 		return NULL;
 	}
-#endif
 
 	alloc_size = sizeof(struct net_device);
 	if (sizeof_priv) {
@@ -8231,12 +8284,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	if (netif_alloc_netdev_queues(dev))
 		goto free_all;
 
-#ifdef CONFIG_SYSFS
 	dev->num_rx_queues = rxqs;
 	dev->real_num_rx_queues = rxqs;
 	if (netif_alloc_rx_queues(dev))
 		goto free_all;
-#endif
 
 	strcpy(dev->name, name);
 	dev->name_assign_type = name_assign_type;
@@ -8275,9 +8326,7 @@ void free_netdev(struct net_device *dev)
 
 	might_sleep();
 	netif_free_tx_queues(dev);
-#ifdef CONFIG_SYSFS
-	kvfree(dev->_rx);
-#endif
+	netif_free_rx_queues(dev);
 
 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
 

From 02dd3291b2f095bbc88e1d2628fd5bf2e92de69b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:14 +0100
Subject: [PATCH 29/32] bpf: finally expose xdp_rxq_info to XDP bpf-programs

Now all XDP driver have been updated to setup xdp_rxq_info and assign
this to xdp_buff->rxq.  Thus, it is now safe to enable access to some
of the xdp_rxq_info struct members.

This patch extend xdp_md and expose UAPI to userspace for
ingress_ifindex and rx_queue_index.  Access happens via bpf
instruction rewrite, that load data directly from struct xdp_rxq_info.

* ingress_ifindex map to xdp_rxq_info->dev->ifindex
* rx_queue_index  map to xdp_rxq_info->queue_index

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c        | 19 +++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f2f8b36e2ad4..405317f9c064 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -899,6 +899,9 @@ struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
+	/* Below access go though struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
 enum sk_action {
diff --git a/net/core/filter.c b/net/core/filter.c
index 130b842c3a15..acdb94c0e97f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4304,6 +4304,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data_end));
 		break;
+	case offsetof(struct xdp_md, ingress_ifindex):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+				      si->dst_reg, si->dst_reg,
+				      offsetof(struct xdp_rxq_info, dev));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct net_device,
+						     ifindex, 4, target_size));
+		break;
+	case offsetof(struct xdp_md, rx_queue_index):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      bpf_target_off(struct xdp_rxq_info,
+						queue_index, 4, target_size));
+		break;
 	}
 
 	return insn - insn_buf;

From 0fca931a6f21c11f675363b92b5a4fe86da59f30 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 3 Jan 2018 11:26:19 +0100
Subject: [PATCH 30/32] samples/bpf: program demonstrating access to
 xdp_rxq_info

This sample program can be used for monitoring and reporting how many
packets per sec (pps) are received per NIC RX queue index and which
CPU processed the packet. In itself it is a useful tool for quickly
identifying RSS imbalance issues, see below.

The default XDP action is XDP_PASS in-order to provide a monitor
mode. For benchmarking purposes it is possible to specify other XDP
actions on the cmdline --action.

Output below shows an imbalance RSS case where most RXQ's deliver to
CPU-0 while CPU-2 only get packets from a single RXQ.  Looking at
things from a CPU level the two CPUs are processing approx the same
amount, BUT looking at the rx_queue_index levels it is clear that
RXQ-2 receive much better service, than other RXQs which all share CPU-0.

Running XDP on dev:i40e1 (ifindex:3) action:XDP_PASS
XDP stats       CPU     pps         issue-pps
XDP-RX CPU      0       900,473     0
XDP-RX CPU      2       906,921     0
XDP-RX CPU      total   1,807,395

RXQ stats       RXQ:CPU pps         issue-pps
rx_queue_index    0:0   180,098     0
rx_queue_index    0:sum 180,098
rx_queue_index    1:0   180,098     0
rx_queue_index    1:sum 180,098
rx_queue_index    2:2   906,921     0
rx_queue_index    2:sum 906,921
rx_queue_index    3:0   180,098     0
rx_queue_index    3:sum 180,098
rx_queue_index    4:0   180,082     0
rx_queue_index    4:sum 180,082
rx_queue_index    5:0   180,093     0
rx_queue_index    5:sum 180,093

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile            |   4 +
 samples/bpf/xdp_rxq_info_kern.c |  96 ++++++
 samples/bpf/xdp_rxq_info_user.c | 531 ++++++++++++++++++++++++++++++++
 3 files changed, 631 insertions(+)
 create mode 100644 samples/bpf/xdp_rxq_info_kern.c
 create mode 100644 samples/bpf/xdp_rxq_info_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4fb944a7ecf8..3ff7a05bea9a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -41,6 +41,7 @@ hostprogs-y += xdp_redirect
 hostprogs-y += xdp_redirect_map
 hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
+hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 
 # Libbpf dependencies
@@ -90,6 +91,7 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
 xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
+xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 
 # Tell kbuild to always build the programs
@@ -139,6 +141,7 @@ always += xdp_redirect_kern.o
 always += xdp_redirect_map_kern.o
 always += xdp_redirect_cpu_kern.o
 always += xdp_monitor_kern.o
+always += xdp_rxq_info_kern.o
 always += syscall_tp_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
@@ -182,6 +185,7 @@ HOSTLOADLIBES_xdp_redirect += -lelf
 HOSTLOADLIBES_xdp_redirect_map += -lelf
 HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
+HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c
new file mode 100644
index 000000000000..3fd209291653
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_kern.c
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ *  Example howto extract XDP RX-queue info
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Config setup from with userspace
+ *
+ * User-side setup ifindex in config_map, to verify that
+ * ctx->ingress_ifindex is correct (against configured ifindex)
+ */
+struct config {
+	__u32 action;
+	int ifindex;
+};
+struct bpf_map_def SEC("maps") config_map = {
+	.type		= BPF_MAP_TYPE_ARRAY,
+	.key_size	= sizeof(int),
+	.value_size	= sizeof(struct config),
+	.max_entries	= 1,
+};
+
+/* Common stats data record (shared with userspace) */
+struct datarec {
+	__u64 processed;
+	__u64 issue;
+};
+
+struct bpf_map_def SEC("maps") stats_global_map = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+#define MAX_RXQs 64
+
+/* Stats per rx_queue_index (per CPU) */
+struct bpf_map_def SEC("maps") rx_queue_index_map = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= MAX_RXQs + 1,
+};
+
+SEC("xdp_prog0")
+int  xdp_prognum0(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data     = (void *)(long)ctx->data;
+	struct datarec *rec, *rxq_rec;
+	int ingress_ifindex;
+	struct config *config;
+	u32 key = 0;
+
+	/* Global stats record */
+	rec = bpf_map_lookup_elem(&stats_global_map, &key);
+	if (!rec)
+		return XDP_ABORTED;
+	rec->processed++;
+
+	/* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF
+	 * instructions inside kernel to access xdp_rxq->dev->ifindex
+	 */
+	ingress_ifindex = ctx->ingress_ifindex;
+
+	config = bpf_map_lookup_elem(&config_map, &key);
+	if (!config)
+		return XDP_ABORTED;
+
+	/* Simple test: check ctx provided ifindex is as expected */
+	if (ingress_ifindex != config->ifindex) {
+		/* count this error case */
+		rec->issue++;
+		return XDP_ABORTED;
+	}
+
+	/* Update stats per rx_queue_index. Handle if rx_queue_index
+	 * is larger than stats map can contain info for.
+	 */
+	key = ctx->rx_queue_index;
+	if (key >= MAX_RXQs)
+		key = MAX_RXQs;
+	rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key);
+	if (!rxq_rec)
+		return XDP_ABORTED;
+	rxq_rec->processed++;
+	if (key == MAX_RXQs)
+		rxq_rec->issue++;
+
+	return config->action;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
new file mode 100644
index 000000000000..32430e8b3a6a
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -0,0 +1,531 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+static const char *__doc__ = " XDP RX-queue info extract example\n\n"
+	"Monitor how many packets per sec (pps) are received\n"
+	"per NIC RX queue index and which CPU processed the packet\n"
+	;
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK		0
+#define EXIT_FAIL		1
+#define EXIT_FAIL_OPTION	2
+#define EXIT_FAIL_XDP		3
+#define EXIT_FAIL_BPF		4
+#define EXIT_FAIL_MEM		5
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"dev",		required_argument,	NULL, 'd' },
+	{"skb-mode",	no_argument,		NULL, 'S' },
+	{"sec",		required_argument,	NULL, 's' },
+	{"no-separators", no_argument,		NULL, 'z' },
+	{"action",	required_argument,	NULL, 'a' },
+	{0, 0, NULL,  0 }
+};
+
+static void int_exit(int sig)
+{
+	fprintf(stderr,
+		"Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+		ifindex, ifname);
+	if (ifindex > -1)
+		set_link_xdp_fd(ifindex, -1, xdp_flags);
+	exit(EXIT_OK);
+}
+
+struct config {
+	__u32 action;
+	int ifindex;
+};
+#define XDP_ACTION_MAX (XDP_TX + 1)
+#define XDP_ACTION_MAX_STRLEN 11
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+	[XDP_ABORTED]	= "XDP_ABORTED",
+	[XDP_DROP]	= "XDP_DROP",
+	[XDP_PASS]	= "XDP_PASS",
+	[XDP_TX]	= "XDP_TX",
+};
+
+static const char *action2str(int action)
+{
+	if (action < XDP_ACTION_MAX)
+		return xdp_action_names[action];
+	return NULL;
+}
+
+static int parse_xdp_action(char *action_str)
+{
+	size_t maxlen;
+	__u64 action = -1;
+	int i;
+
+	for (i = 0; i < XDP_ACTION_MAX; i++) {
+		maxlen = XDP_ACTION_MAX_STRLEN;
+		if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) {
+			action = i;
+			break;
+		}
+	}
+	return action;
+}
+
+static void list_xdp_actions(void)
+{
+	int i;
+
+	printf("Available XDP --action <options>\n");
+	for (i = 0; i < XDP_ACTION_MAX; i++)
+		printf("\t%s\n", xdp_action_names[i]);
+	printf("\n");
+}
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf("\nDOCUMENTATION:\n%s\n", __doc__);
+	printf(" Usage: %s (options-see-below)\n", argv[0]);
+	printf(" Listing options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		if (long_options[i].flag != NULL)
+			printf(" flag (internal value:%d)",
+				*long_options[i].flag);
+		else
+			printf(" short-option: -%c",
+				long_options[i].val);
+		printf("\n");
+	}
+	printf("\n");
+	list_xdp_actions();
+}
+
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+	struct timespec t;
+	int res;
+
+	res = clock_gettime(CLOCK_MONOTONIC, &t);
+	if (res < 0) {
+		fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+		exit(EXIT_FAIL);
+	}
+	return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+	__u64 processed;
+	__u64 issue;
+};
+struct record {
+	__u64 timestamp;
+	struct datarec total;
+	struct datarec *cpu;
+};
+struct stats_record {
+	struct record stats;
+	struct record *rxq;
+};
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec *array;
+	size_t size;
+
+	size = sizeof(struct datarec) * nr_cpus;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct record *alloc_record_per_rxq(void)
+{
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	struct record *array;
+	size_t size;
+
+	size = sizeof(struct record) * nr_rxqs;
+	array = malloc(size);
+	memset(array, 0, size);
+	if (!array) {
+		fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
+		exit(EXIT_FAIL_MEM);
+	}
+	return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	struct stats_record *rec;
+	int i;
+
+	rec = malloc(sizeof(*rec));
+	memset(rec, 0, sizeof(*rec));
+	if (!rec) {
+		fprintf(stderr, "Mem alloc error\n");
+		exit(EXIT_FAIL_MEM);
+	}
+	rec->rxq = alloc_record_per_rxq();
+	for (i = 0; i < nr_rxqs; i++)
+		rec->rxq[i].cpu = alloc_record_per_cpu();
+
+	rec->stats.cpu = alloc_record_per_cpu();
+	return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	int i;
+
+	for (i = 0; i < nr_rxqs; i++)
+		free(r->rxq[i].cpu);
+
+	free(r->rxq);
+	free(r->stats.cpu);
+	free(r);
+}
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+	/* For percpu maps, userspace gets a value per possible CPU */
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct datarec values[nr_cpus];
+	__u64 sum_processed = 0;
+	__u64 sum_issue = 0;
+	int i;
+
+	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+		fprintf(stderr,
+			"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+		return false;
+	}
+	/* Get time as close as possible to reading map contents */
+	rec->timestamp = gettime();
+
+	/* Record and sum values from each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		rec->cpu[i].processed = values[i].processed;
+		sum_processed        += values[i].processed;
+		rec->cpu[i].issue = values[i].issue;
+		sum_issue        += values[i].issue;
+	}
+	rec->total.processed = sum_processed;
+	rec->total.issue     = sum_issue;
+	return true;
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+	int fd, i, max_rxqs;
+
+	fd = map_data[1].fd; /* map: stats_global_map */
+	map_collect_percpu(fd, 0, &rec->stats);
+
+	fd = map_data[2].fd; /* map: rx_queue_index_map */
+	max_rxqs = map_data[2].def.max_entries;
+	for (i = 0; i < max_rxqs; i++)
+		map_collect_percpu(fd, i, &rec->rxq[i]);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+	double period_ = 0;
+	__u64 period = 0;
+
+	period = r->timestamp - p->timestamp;
+	if (period > 0)
+		period_ = ((double) period / NANOSEC_PER_SEC);
+
+	return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->processed - p->processed;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+			    struct datarec *p, double period_)
+{
+	__u64 packets = 0;
+	__u64 pps = 0;
+
+	if (period_ > 0) {
+		packets = r->issue - p->issue;
+		pps = packets / period_;
+	}
+	return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+			struct stats_record *stats_prev,
+			int action)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	double pps = 0, err = 0;
+	struct record *rec, *prev;
+	double t;
+	int rxq;
+	int i;
+
+	/* Header */
+	printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n",
+	       ifname, ifindex, action2str(action));
+
+	/* stats_global_map */
+	{
+		char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n";
+		char *fm2_rx = "%-15s %-7s %'-11.0f\n";
+		char *errstr = "";
+
+		printf("%-15s %-7s %-11s %-11s\n",
+		       "XDP stats", "CPU", "pps", "issue-pps");
+
+		rec  =  &stats_rec->stats;
+		prev = &stats_prev->stats;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps = calc_pps     (r, p, t);
+			err = calc_errs_pps(r, p, t);
+			if (err > 0)
+				errstr = "invalid-ifindex";
+			if (pps > 0)
+				printf(fmt_rx, "XDP-RX CPU",
+					i, pps, err, errstr);
+		}
+		pps  = calc_pps     (&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		printf(fm2_rx, "XDP-RX CPU", "total", pps, err);
+	}
+
+	/* rx_queue_index_map */
+	printf("\n%-15s %-7s %-11s %-11s\n",
+	       "RXQ stats", "RXQ:CPU", "pps", "issue-pps");
+
+	for (rxq = 0; rxq < nr_rxqs; rxq++) {
+		char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n";
+		char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n";
+		char *errstr = "";
+		int rxq_ = rxq;
+
+		/* Last RXQ in map catch overflows */
+		if (rxq_ == nr_rxqs - 1)
+			rxq_ = -1;
+
+		rec  =  &stats_rec->rxq[rxq];
+		prev = &stats_prev->rxq[rxq];
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps = calc_pps     (r, p, t);
+			err = calc_errs_pps(r, p, t);
+			if (err > 0) {
+				if (rxq_ == -1)
+					errstr = "map-overflow-RXQ";
+				else
+					errstr = "err";
+			}
+			if (pps > 0)
+				printf(fmt_rx, "rx_queue_index",
+				       rxq_, i, pps, err, errstr);
+		}
+		pps  = calc_pps     (&rec->total, &prev->total, t);
+		err  = calc_errs_pps(&rec->total, &prev->total, t);
+		if (pps || err)
+			printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err);
+	}
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+	struct stats_record *tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+static void stats_poll(int interval, int action)
+{
+	struct stats_record *record, *prev;
+
+	record = alloc_stats_record();
+	prev   = alloc_stats_record();
+	stats_collect(record);
+
+	while (1) {
+		swap(&prev, &record);
+		stats_collect(record);
+		stats_print(record, prev, action);
+		sleep(interval);
+	}
+
+	free_stats_record(record);
+	free_stats_record(prev);
+}
+
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	bool use_separators = true;
+	struct config cfg = { 0 };
+	char filename[256];
+	int longindex = 0;
+	int interval = 2;
+	__u32 key = 0;
+	int opt, err;
+
+	char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
+	int action = XDP_PASS; /* Default action */
+	char *action_str = NULL;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+		return EXIT_FAIL;
+	}
+
+	if (!prog_fd[0]) {
+		fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+		return EXIT_FAIL;
+	}
+
+	/* Parse commands line args */
+	while ((opt = getopt_long(argc, argv, "hSd:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'd':
+			if (strlen(optarg) >= IF_NAMESIZE) {
+				fprintf(stderr, "ERR: --dev name too long\n");
+				goto error;
+			}
+			ifname = (char *)&ifname_buf;
+			strncpy(ifname, optarg, IF_NAMESIZE);
+			ifindex = if_nametoindex(ifname);
+			if (ifindex == 0) {
+				fprintf(stderr,
+					"ERR: --dev name unknown err(%d):%s\n",
+					errno, strerror(errno));
+				goto error;
+			}
+			break;
+		case 's':
+			interval = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'z':
+			use_separators = false;
+			break;
+		case 'a':
+			action_str = (char *)&action_str_buf;
+			strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN);
+			break;
+		case 'h':
+		error:
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	/* Required option */
+	if (ifindex == -1) {
+		fprintf(stderr, "ERR: required option --dev missing\n");
+		usage(argv);
+		return EXIT_FAIL_OPTION;
+	}
+	cfg.ifindex = ifindex;
+
+	/* Parse action string */
+	if (action_str) {
+		action = parse_xdp_action(action_str);
+		if (action < 0) {
+			fprintf(stderr, "ERR: Invalid XDP --action: %s\n",
+				action_str);
+			list_xdp_actions();
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	cfg.action = action;
+
+	/* Trick to pretty printf with thousands separators use %' */
+	if (use_separators)
+		setlocale(LC_NUMERIC, "en_US");
+
+	/* User-side setup ifindex in config_map */
+	err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0);
+	if (err) {
+		fprintf(stderr, "Store config failed (err:%d)\n", err);
+		exit(EXIT_FAIL_BPF);
+	}
+
+	/* Remove XDP program when program is interrupted */
+	signal(SIGINT, int_exit);
+
+	if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+		fprintf(stderr, "link set xdp fd failed\n");
+		return EXIT_FAIL_XDP;
+	}
+
+	stats_poll(interval, action);
+	return EXIT_OK;
+}

From 16f07c551e3ac433874031433231140d38e69ccd Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 4 Jan 2018 13:55:03 -0800
Subject: [PATCH 31/32] bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for
 stacktrace map

Currently, bpf syscall command BPF_MAP_GET_NEXT_KEY is not
supported for stacktrace map. However, there are use cases where
user space wants to enumerate all stacktrace map entries where
BPF_MAP_GET_NEXT_KEY command will be really helpful.
In addition, if user space wants to delete all map entries
in order to save memory and does not want to close the
map file descriptor, BPF_MAP_GET_NEXT_KEY may help improve
performance if map entries are sparsely populated.

The implementation has similar behavior for
BPF_MAP_GET_NEXT_KEY implementation in hashtab. If user provides
a NULL key pointer or an invalid key, the first key is returned.
Otherwise, the first valid key after the input parameter "key"
is returned, or -ENOENT if no valid key can be found.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/stackmap.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a15bc636cc98..6c63c2222ea8 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -226,9 +226,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	return 0;
 }
 
-static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int stack_map_get_next_key(struct bpf_map *map, void *key,
+				  void *next_key)
 {
-	return -EINVAL;
+	struct bpf_stack_map *smap = container_of(map,
+						  struct bpf_stack_map, map);
+	u32 id;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (!key) {
+		id = 0;
+	} else {
+		id = *(u32 *)key;
+		if (id >= smap->n_buckets || !smap->buckets[id])
+			id = 0;
+		else
+			id++;
+	}
+
+	while (id < smap->n_buckets && !smap->buckets[id])
+		id++;
+
+	if (id >= smap->n_buckets)
+		return -ENOENT;
+
+	*(u32 *)next_key = id;
+	return 0;
 }
 
 static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,

From 3ced9b600234e6bb13cb25654ce44c4896485cf2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 4 Jan 2018 13:55:04 -0800
Subject: [PATCH 32/32] tools/bpf: add a bpf selftest for stacktrace

Added a bpf selftest in test_progs at tools directory for stacktrace.
The test will populate a hashtable map and a stacktrace map
at the same time with the same key, stackid.
The user space will compare both maps, using BPF_MAP_LOOKUP_ELEM
command and BPF_MAP_GET_NEXT_KEY command, to ensure that both have
the same set of keys.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 tools/testing/selftests/bpf/test_progs.c      | 127 ++++++++++++++++++
 .../selftests/bpf/test_stacktrace_map.c       |  62 +++++++++
 3 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_stacktrace_map.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1304753d29ea..a8aa7e251c8e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -19,7 +19,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
-	test_l4lb_noinline.o test_xdp_noinline.o
+	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 09087ab12293..b549308abd19 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -837,6 +837,132 @@ static void test_tp_attach_query(void)
 	free(query);
 }
 
+static int compare_map_keys(int map1_fd, int map2_fd)
+{
+	__u32 key, next_key;
+	char val_buf[PERF_MAX_STACK_DEPTH * sizeof(__u64)];
+	int err;
+
+	err = bpf_map_get_next_key(map1_fd, NULL, &key);
+	if (err)
+		return err;
+	err = bpf_map_lookup_elem(map2_fd, &key, val_buf);
+	if (err)
+		return err;
+
+	while (bpf_map_get_next_key(map1_fd, &key, &next_key) == 0) {
+		err = bpf_map_lookup_elem(map2_fd, &next_key, val_buf);
+		if (err)
+			return err;
+
+		key = next_key;
+	}
+	if (errno != ENOENT)
+		return -1;
+
+	return 0;
+}
+
+static void test_stacktrace_map()
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	const char *file = "./test_stacktrace_map.o";
+	int bytes, efd, err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {};
+	__u32 key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* Get the ID for the sched/sched_switch tracepoint */
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/sched/sched_switch/id");
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
+		  "read", "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	/* Open the perf event and attach bpf progrram */
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* give some time for bpf program run */
+	sleep(1);
+
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		; /* fall through */
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+
+out:
+	return;
+}
+
 int main(void)
 {
 	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
@@ -852,6 +978,7 @@ int main(void)
 	test_pkt_md_access();
 	test_obj_name();
 	test_tp_attach_query();
+	test_stacktrace_map();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
new file mode 100644
index 000000000000..76d85c5d08bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH         127
+#endif
+
+struct bpf_map_def SEC("maps") control_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") stackid_hmap = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 10000,
+};
+
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args {
+	unsigned long long pad;
+	char prev_comm[16];
+	int prev_pid;
+	int prev_prio;
+	long long prev_state;
+	char next_comm[16];
+	int next_pid;
+	int next_prio;
+};
+
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+	__u32 key = 0, val = 0, *value_p;
+
+	value_p = bpf_map_lookup_elem(&control_map, &key);
+	if (value_p && *value_p)
+		return 0; /* skip if non-zero *value_p */
+
+	/* The size of stackmap and stackid_hmap should be the same */
+	key = bpf_get_stackid(ctx, &stackmap, 0);
+	if ((int)key >= 0)
+		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */