From 58990d1ff3f7896ee341030e9a7c2e4002570683 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 7 Jun 2018 17:40:03 +0200
Subject: [PATCH 01/23] bpf: reject passing modified ctx to helper functions

As commit 28e33f9d78ee ("bpf: disallow arithmetic operations on
context pointer") already describes, f1174f77b50c ("bpf/verifier:
rework value tracking") removed the specific white-listed cases
we had previously where we would allow for pointer arithmetic in
order to further generalize it, and allow e.g. context access via
modified registers. While the dereferencing of modified context
pointers had been forbidden through 28e33f9d78ee, syzkaller did
recently manage to trigger several KASAN splats for slab out of
bounds access and use after frees by simply passing a modified
context pointer to a helper function which would then do the bad
access since verifier allowed it in adjust_ptr_min_max_vals().

Rejecting arithmetic on ctx pointer in adjust_ptr_min_max_vals()
generally could break existing programs as there's a valid use
case in tracing in combination with passing the ctx to helpers as
bpf_probe_read(), where the register then becomes unknown at
verification time due to adding a non-constant offset to it. An
access sequence may look like the following:

  offset = args->filename;  /* field __data_loc filename */
  bpf_probe_read(&dst, len, (char *)args + offset); // args is ctx

There are two options: i) we could special case the ctx and as
soon as we add a constant or bounded offset to it (hence ctx type
wouldn't change) we could turn the ctx into an unknown scalar, or
ii) we generalize the sanity test for ctx member access into a
small helper and assert it on the ctx register that was passed
as a function argument. Fwiw, latter is more obvious and less
complex at the same time, and one case that may potentially be
legitimate in future for ctx member access at least would be for
ctx to carry a const offset. Therefore, fix follows approach
from ii) and adds test cases to BPF kselftests.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Reported-by: syzbot+3d0b2441dbb71751615e@syzkaller.appspotmail.com
Reported-by: syzbot+c8504affd4fdd0c1b626@syzkaller.appspotmail.com
Reported-by: syzbot+e5190cb881d8660fb1a3@syzkaller.appspotmail.com
Reported-by: syzbot+efae31b384d5badbd620@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       | 48 +++++++++++------
 tools/testing/selftests/bpf/test_verifier.c | 58 ++++++++++++++++++++-
 2 files changed, 88 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6403b5166f4..cced0c1e63e2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1617,6 +1617,30 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 }
 #endif
 
+static int check_ctx_reg(struct bpf_verifier_env *env,
+			 const struct bpf_reg_state *reg, int regno)
+{
+	/* ctx may only be accessed or passed to a helper unmodified:
+	 * reject a non-zero reg->off or a var_off other than constant 0.
+	 */
+
+	if (reg->off) {
+		verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n",
+			regno, reg->off);
+		return -EACCES;
+	}
+
+	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+		char tn_buf[48];
+
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 /* truncate register to smaller size (in bytes)
  * must be called with size < BPF_REG_SIZE
  */
@@ -1686,24 +1710,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			verbose(env, "R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
-		/* ctx accesses must be at a fixed offset, so that we can
-		 * determine what type of data were returned.
-		 */
-		if (reg->off) {
-			verbose(env,
-				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
-				regno, reg->off, off - reg->off);
-			return -EACCES;
-		}
-		if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
-			char tn_buf[48];
 
-			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env,
-				"variable ctx access var_off=%s off=%d size=%d",
-				tn_buf, off, size);
-			return -EACCES;
-		}
+		err = check_ctx_reg(env, reg, regno);
+		if (err < 0)
+			return err;
+
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
@@ -1984,6 +1995,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
+		err = check_ctx_reg(env, reg, regno);
+		if (err < 0)
+			return err;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 7cb1d74057ce..2ecd27b670d7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -8647,7 +8647,7 @@ static struct bpf_test tests[] = {
 				    offsetof(struct __sk_buff, mark)),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "dereference of modified ctx ptr R1 off=68+8, ctx+const is allowed, ctx+const+const is not",
+		.errstr = "dereference of modified ctx ptr",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -12258,6 +12258,62 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.retval = 5,
 	},
+	{
+		"pass unmodified ctx pointer to helper",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_csum_update),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"pass modified ctx pointer to helper, 1",
+		.insns = {
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_csum_update),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "dereference of modified ctx ptr",
+	},
+	{
+		"pass modified ctx pointer to helper, 2",
+		.insns = {
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_socket_cookie),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result_unpriv = REJECT,
+		.result = REJECT,
+		.errstr_unpriv = "dereference of modified ctx ptr",
+		.errstr = "dereference of modified ctx ptr",
+	},
+	{
+		"pass modified ctx pointer to helper, 3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_csum_update),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "variable ctx access var_off=(0x0; 0x4)",
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)

From 7eced5ab5a7366ee7ca5360b3eca9d220c2b2887 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 3 Jun 2018 12:06:57 +0200
Subject: [PATCH 02/23] netfilter: nf_tables: add NFT_LOGLEVEL_* enumeration
 and use it

This is internal, not exposed through uapi, and although it maps with
userspace LOG_*, with the introduction of LOGLEVEL_AUDIT we are
causing namespace pollution.

This patch adds the NFT_LOGLEVEL_ enumeration and uses it from nft_log.

Fixes: 1a893b44de45 ("netfilter: nf_tables: Add audit support to log statement")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netfilter/nf_tables.h | 26 ++++++++++++++++++++++--
 net/netfilter/nft_log.c                  | 10 ++++-----
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ae00a3c49b8a..c9bf74b94f37 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1099,9 +1099,31 @@ enum nft_log_attributes {
 #define NFTA_LOG_MAX		(__NFTA_LOG_MAX - 1)
 
 /**
- * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging
+ * enum nft_log_level - nf_tables log levels
+ *
+ * @NFT_LOGLEVEL_EMERG: system is unusable
+ * @NFT_LOGLEVEL_ALERT: action must be taken immediately
+ * @NFT_LOGLEVEL_CRIT: critical conditions
+ * @NFT_LOGLEVEL_ERR: error conditions
+ * @NFT_LOGLEVEL_WARNING: warning conditions
+ * @NFT_LOGLEVEL_NOTICE: normal but significant condition
+ * @NFT_LOGLEVEL_INFO: informational
+ * @NFT_LOGLEVEL_DEBUG: debug-level messages
+ * @NFT_LOGLEVEL_AUDIT: enabling audit logging
  */
-#define LOGLEVEL_AUDIT		8
+enum nft_log_level {
+	NFT_LOGLEVEL_EMERG,
+	NFT_LOGLEVEL_ALERT,
+	NFT_LOGLEVEL_CRIT,
+	NFT_LOGLEVEL_ERR,
+	NFT_LOGLEVEL_WARNING,
+	NFT_LOGLEVEL_NOTICE,
+	NFT_LOGLEVEL_INFO,
+	NFT_LOGLEVEL_DEBUG,
+	NFT_LOGLEVEL_AUDIT,
+	__NFT_LOGLEVEL_MAX
+};
+#define NFT_LOGLEVEL_MAX	(__NFT_LOGLEVEL_MAX - 1)
 
 /**
  * enum nft_queue_attributes - nf_tables queue expression netlink attributes
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 7eef1cffbf1b..655187bed5d8 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -111,7 +111,7 @@ static void nft_log_eval(const struct nft_expr *expr,
 	const struct nft_log *priv = nft_expr_priv(expr);
 
 	if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
-	    priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+	    priv->loginfo.u.log.level == NFT_LOGLEVEL_AUDIT) {
 		nft_log_eval_audit(pkt);
 		return;
 	}
@@ -166,9 +166,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
 			li->u.log.level =
 				ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL]));
 		} else {
-			li->u.log.level = LOGLEVEL_WARNING;
+			li->u.log.level = NFT_LOGLEVEL_WARNING;
 		}
-		if (li->u.log.level > LOGLEVEL_AUDIT) {
+		if (li->u.log.level > NFT_LOGLEVEL_AUDIT) {
 			err = -EINVAL;
 			goto err1;
 		}
@@ -196,7 +196,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		break;
 	}
 
-	if (li->u.log.level == LOGLEVEL_AUDIT)
+	if (li->u.log.level == NFT_LOGLEVEL_AUDIT)
 		return 0;
 
 	err = nf_logger_find_get(ctx->family, li->type);
@@ -220,7 +220,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
 	if (priv->prefix != nft_log_null_prefix)
 		kfree(priv->prefix);
 
-	if (li->u.log.level == LOGLEVEL_AUDIT)
+	if (li->u.log.level == NFT_LOGLEVEL_AUDIT)
 		return;
 
 	nf_logger_put(ctx->family, li->type);

From fd3a88625844907151737fc3b4201676effa6d27 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 6 Jun 2018 11:23:01 -0400
Subject: [PATCH 03/23] net: in virtio_net_hdr only add VLAN_HLEN to csum_start
 if payload holds vlan

Tun, tap, virtio, packet and uml vector all use struct virtio_net_hdr
to communicate packet metadata to userspace.

For skbuffs with vlan, the first two return the packet as it may have
existed on the wire, inserting the VLAN tag in the user buffer.  Then
virtio_net_hdr.csum_start needs to be adjusted by VLAN_HLEN bytes.

Commit f09e2249c4f5 ("macvtap: restore vlan header on user read")
added this feature to macvtap. Commit 3ce9b20f1971 ("macvtap: Fix
csum_start when VLAN tags are present") then fixed up csum_start.

Virtio, packet and uml do not insert the vlan header in the user
buffer.

When introducing virtio_net_hdr_from_skb to deduplicate filling in
the virtio_net_hdr, the variant from macvtap which adds VLAN_HLEN was
applied uniformly, breaking csum offset for packets with vlan on
virtio and packet.

Make insertion of VLAN_HLEN optional. Convert the callers to pass it
when needed.

Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion")
Fixes: 1276f24eeef2 ("packet: use common code for virtio_net_hdr and skb GSO conversion")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/um/drivers/vector_transports.c |  3 ++-
 drivers/net/tap.c                   |  5 ++++-
 drivers/net/tun.c                   |  3 ++-
 drivers/net/virtio_net.c            |  3 ++-
 include/linux/virtio_net.h          | 11 ++++-------
 net/packet/af_packet.c              |  4 ++--
 6 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/um/drivers/vector_transports.c b/arch/um/drivers/vector_transports.c
index 9065047f844b..77e4ebc206ae 100644
--- a/arch/um/drivers/vector_transports.c
+++ b/arch/um/drivers/vector_transports.c
@@ -120,7 +120,8 @@ static int raw_form_header(uint8_t *header,
 		skb,
 		vheader,
 		virtio_legacy_is_little_endian(),
-		false
+		false,
+		0
 	);
 
 	return 0;
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 9b6cb780affe..f0f7cd977667 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -774,13 +774,16 @@ static ssize_t tap_put_user(struct tap_queue *q,
 	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
+		int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
 		struct virtio_net_hdr vnet_hdr;
+
 		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
 		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					    tap_is_little_endian(q), true))
+					    tap_is_little_endian(q), true,
+					    vlan_hlen))
 			BUG();
 
 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 85e14adf5207..a192a017cc68 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2089,7 +2089,8 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &gso,
-					    tun_is_little_endian(tun), true)) {
+					    tun_is_little_endian(tun), true,
+					    vlan_hlen)) {
 			struct skb_shared_info *sinfo = skb_shinfo(skb);
 			pr_err("unexpected GSO type: "
 			       "0x%x, gso_size %d, hdr_len %d\n",
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 2aaa18ec7d46..1619ee3070b6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1411,7 +1411,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		hdr = skb_vnet_hdr(skb);
 
 	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
-				    virtio_is_little_endian(vi->vdev), false))
+				    virtio_is_little_endian(vi->vdev), false,
+				    0))
 		BUG();
 
 	if (vi->mergeable_rx_bufs)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index f144216febc6..9397628a1967 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -58,7 +58,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
 					  bool little_endian,
-					  bool has_data_valid)
+					  bool has_data_valid,
+					  int vlan_hlen)
 {
 	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
@@ -83,12 +84,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
-		if (skb_vlan_tag_present(skb))
-			hdr->csum_start = __cpu_to_virtio16(little_endian,
-				skb_checksum_start_offset(skb) + VLAN_HLEN);
-		else
-			hdr->csum_start = __cpu_to_virtio16(little_endian,
-				skb_checksum_start_offset(skb));
+		hdr->csum_start = __cpu_to_virtio16(little_endian,
+			skb_checksum_start_offset(skb) + vlan_hlen);
 		hdr->csum_offset = __cpu_to_virtio16(little_endian,
 				skb->csum_offset);
 	} else if (has_data_valid &&
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 54ce66f68482..ee018564b2b4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2005,7 +2005,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
 		return -EINVAL;
 	*len -= sizeof(vnet_hdr);
 
-	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
+	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
 		return -EINVAL;
 
 	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2272,7 +2272,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (do_vnet) {
 		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
 					    sizeof(struct virtio_net_hdr),
-					    vio_le(), true)) {
+					    vio_le(), true, 0)) {
 			spin_lock(&sk->sk_receive_queue.lock);
 			goto drop_n_account;
 		}

From 52acf73b6e9a6962045feb2ba5a8921da2201915 Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Wed, 6 Jun 2018 21:32:51 +0000
Subject: [PATCH 04/23] hv_netvsc: Fix a network regression after ifdown/ifup

Recently people reported the NIC stops working after
"ifdown eth0; ifup eth0". It turns out in this case the TX queues are not
enabled, after the refactoring of the common detach logic: when the NIC
has sub-channels, usually we enable all the TX queues after all
sub-channels are set up: see rndis_set_subchannel() ->
netif_device_attach(), but in the case of "ifdown eth0; ifup eth0" where
the number of channels doesn't change, we also must make sure the TX queues
are enabled. The patch fixes the regression.

Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index bef4d55a108c..8eec156418ea 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -127,8 +127,10 @@ static int netvsc_open(struct net_device *net)
 	}
 
 	rdev = nvdev->extension;
-	if (!rdev->link_state)
+	if (!rdev->link_state) {
 		netif_carrier_on(net);
+		netif_tx_wake_all_queues(net);
+	}
 
 	if (vf_netdev) {
 		/* Setting synthetic device up transparently sets

From 000ade8016400d93b4d7c89970d96b8c14773d45 Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultanxda@gmail.com>
Date: Wed, 6 Jun 2018 15:56:54 -0700
Subject: [PATCH 05/23] ip_tunnel: Fix name string concatenate in
 __ip_tunnel_create()

By passing a limit of 2 bytes to strncat, strncat is limited to writing
fewer bytes than what it's supposed to append to the name here.

Since the bounds are checked on the line above this, just remove the string
bounds checks entirely since they're unneeded.

Signed-off-by: Sultan Alsawaf <sultanxda@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 38d906baf1df..c4f5602308ed 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -261,8 +261,8 @@ static struct net_device *__ip_tunnel_create(struct net *net,
 	} else {
 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
 			goto failed;
-		strlcpy(name, ops->kind, IFNAMSIZ);
-		strncat(name, "%d", 2);
+		strcpy(name, ops->kind);
+		strcat(name, "%d");
 	}
 
 	ASSERT_RTNL();

From eb55bbf865d9979098c6a7a17cbdb41237ece951 Mon Sep 17 00:00:00 2001
From: Xiangning Yu <yuxiangning@gmail.com>
Date: Thu, 7 Jun 2018 13:39:59 +0800
Subject: [PATCH 06/23] bonding: re-evaluate force_primary when the primary
 slave name changes

There is a timing issue under active-standby mode: when bond_enslave() is
called, bond->params.primary might not be initialized yet.

Any time the primary slave string changes, bond->force_primary should be
set to true to make sure the primary becomes the active slave.

Signed-off-by: Xiangning Yu <yuxiangning@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_options.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 8a945c9341d6..98663c50ded0 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1142,6 +1142,7 @@ static int bond_option_primary_set(struct bonding *bond,
 				   slave->dev->name);
 			rcu_assign_pointer(bond->primary_slave, slave);
 			strcpy(bond->params.primary, slave->dev->name);
+			bond->force_primary = true;
 			bond_select_active_slave(bond);
 			goto out;
 		}

From 2ac0e1524d9585f5261cebecca262da6cedc1d85 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 7 Jun 2018 15:10:30 +0200
Subject: [PATCH 07/23] net: mscc: ocelot: Fix uninitialized error in
 ocelot_netdevice_event()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2:

    drivers/net/ethernet/mscc/ocelot.c: In function ‘ocelot_netdevice_event’:
    drivers/net/ethernet/mscc/ocelot.c:1129: warning: ‘ret’ may be used uninitialized in this function

If the list iterated over by netdev_for_each_lower_dev() is empty, ret
is never initialized, and converted into a notifier return value.

Fix this by preinitializing ret to zero.

Fixes: a556c76adc052c97 ("net: mscc: Add initial Ocelot switch support")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index c8c74aa548d9..fb2c8f8071e6 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1126,7 +1126,7 @@ static int ocelot_netdevice_event(struct notifier_block *unused,
 {
 	struct netdev_notifier_changeupper_info *info = ptr;
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	int ret;
+	int ret = 0;
 
 	if (netif_is_lag_master(dev)) {
 		struct net_device *slave;

From bf956be520fb534510e31564231163aa05f7f091 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Jun 2018 10:23:10 -0700
Subject: [PATCH 08/23] umh: fix race condition

kasan reported use-after-free:
BUG: KASAN: use-after-free in call_usermodehelper_exec_work+0x2d3/0x310 kernel/umh.c:195
Write of size 4 at addr ffff8801d9202370 by task kworker/u4:2/50
Workqueue: events_unbound call_usermodehelper_exec_work
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_store4_noabort+0x17/0x20 mm/kasan/report.c:437
 call_usermodehelper_exec_work+0x2d3/0x310 kernel/umh.c:195
 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
 kthread+0x345/0x410 kernel/kthread.c:240
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412

The reason is that 'sub_info' cannot be accessed out of parent task
context, since it will be freed by the child.
Instead remember the pid in the child task.

Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper")
Reported-by: syzbot+2c73319c406f1987d156@syzkaller.appspotmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/umh.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/umh.c b/kernel/umh.c
index 30db93fd7e39..c449858946af 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -99,6 +99,7 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
+	sub_info->pid = task_pid_nr(current);
 	if (sub_info->file)
 		retval = do_execve_file(sub_info->file,
 					sub_info->argv, sub_info->envp);
@@ -191,8 +192,6 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
 		if (pid < 0) {
 			sub_info->retval = pid;
 			umh_complete(sub_info);
-		} else {
-			sub_info->pid = pid;
 		}
 	}
 }

From 8d97ca6b6755bf7ef57d323642ca9ee80d689782 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 7 Jun 2018 10:29:29 -0700
Subject: [PATCH 09/23] bpfilter: fix OUTPUT_FORMAT

CONFIG_OUTPUT_FORMAT is an x86-only macro.
Use objdump to extract the ELF file format instead.

Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
Reported-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index aafa72001fcd..e0bbe7583e58 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -21,7 +21,7 @@ endif
 # which bpfilter_kern.c passes further into umh blob loader at run-time
 quiet_cmd_copy_umh = GEN $@
       cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
-      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+      $(OBJCOPY) -I binary -O `$(OBJDUMP) -f $<|grep format|cut -d' ' -f8` \
       -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
       --rename-section .data=.init.rodata $< $@
 

From 23316a366e1654e4ad05817c6075bc1019efb30a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 6 Jun 2018 09:12:44 -0700
Subject: [PATCH 10/23] tools/bpf: fix selftest get_cgroup_id_user

Commit f269099a7e7a ("tools/bpf: add a selftest for
bpf_get_current_cgroup_id() helper") added a test
for bpf_get_current_cgroup_id() helper. The bpf program
is attached to tracepoint syscalls/sys_enter_nanosleep
and will record the cgroup id if the tracepoint is hit.
The test program creates a cgroup and attaches itself to
this cgroup and expects that the test program process
cgroup id is the same as the cgroup_id retrieved
by the bpf program.

In a lightly loaded system where no other process calls the
nanosleep syscall, the test case can pass.
In a busy system where many different processes can hit
syscalls/sys_enter_nanosleep tracepoint, the cgroup id
recorded by bpf program may not match the test program
process cgroup_id.

This patch fixes the issue by communicating the test program
pid to bpf program. The bpf program only records
cgroup id if the current task pid is the same as
passed-in pid. This ensures that the recorded cgroup_id
is for the cgroup within which the test program resides.

Fixes: f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/get_cgroup_id_kern.c | 14 +++++++++++++-
 tools/testing/selftests/bpf/get_cgroup_id_user.c | 12 ++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
index 2cf8cb23f209..014dba10b8a5 100644
--- a/tools/testing/selftests/bpf/get_cgroup_id_kern.c
+++ b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
@@ -11,12 +11,24 @@ struct bpf_map_def SEC("maps") cg_ids = {
 	.max_entries = 1,
 };
 
+struct bpf_map_def SEC("maps") pidmap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
 SEC("tracepoint/syscalls/sys_enter_nanosleep")
 int trace(void *ctx)
 {
-	__u32 key = 0;
+	__u32 pid = bpf_get_current_pid_tgid();
+	__u32 key = 0, *expected_pid;
 	__u64 *val;
 
+	expected_pid = bpf_map_lookup_elem(&pidmap, &key);
+	if (!expected_pid || *expected_pid != pid)
+		return 0;
+
 	val = bpf_map_lookup_elem(&cg_ids, &key);
 	if (val)
 		*val = bpf_get_current_cgroup_id();
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
index ea19a42e5894..e8da7b39158d 100644
--- a/tools/testing/selftests/bpf/get_cgroup_id_user.c
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -50,13 +50,13 @@ int main(int argc, char **argv)
 	const char *probe_name = "syscalls/sys_enter_nanosleep";
 	const char *file = "get_cgroup_id_kern.o";
 	int err, bytes, efd, prog_fd, pmu_fd;
+	int cgroup_fd, cgidmap_fd, pidmap_fd;
 	struct perf_event_attr attr = {};
-	int cgroup_fd, cgidmap_fd;
 	struct bpf_object *obj;
 	__u64 kcgid = 0, ucgid;
+	__u32 key = 0, pid;
 	int exit_code = 1;
 	char buf[256];
-	__u32 key = 0;
 
 	err = setup_cgroup_environment();
 	if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err,
@@ -81,6 +81,14 @@ int main(int argc, char **argv)
 		  cgidmap_fd, errno))
 		goto close_prog;
 
+	pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
+	if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  pidmap_fd, errno))
+		goto close_prog;
+
+	pid = getpid();
+	bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
+
 	snprintf(buf, sizeof(buf),
 		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
 	efd = open(buf, O_RDONLY, 0);

From a5a16e43529b5040760ebf9bd9b056dd34861f93 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 7 Jun 2018 15:37:34 +0200
Subject: [PATCH 11/23] xsk: Fix umem fill/completion queue mmap on 32-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2 on 32-bit:

    net/xdp/xsk.c:663: warning: integer constant is too large for ‘long’ type
    net/xdp/xsk.c:665: warning: integer constant is too large for ‘long’ type

Add the missing "ULL" suffixes to the large XDP_UMEM_PGOFF_*_RING values
to fix this.

    net/xdp/xsk.c:663: warning: comparison is always false due to limited range of data type
    net/xdp/xsk.c:665: warning: comparison is always false due to limited range of data type

"unsigned long" is 32-bit on 32-bit systems, hence the offset is
truncated, and can never be equal to any of the XDP_UMEM_PGOFF_*_RING
values.  Use loff_t (and the required cast) to fix this.

Fixes: 423f38329d267969 ("xsk: add umem fill queue support and mmap")
Fixes: fe2308328cd2f26e ("xsk: add umem completion queue support and mmap")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_xdp.h | 4 ++--
 net/xdp/xsk.c               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 1fa0e977ea8d..caed8b1614ff 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -63,8 +63,8 @@ struct xdp_statistics {
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_PGOFF_TX_RING		 0x80000000
-#define XDP_UMEM_PGOFF_FILL_RING	0x100000000
-#define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
+#define XDP_UMEM_PGOFF_FILL_RING	0x100000000ULL
+#define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000ULL
 
 /* Rx/Tx descriptor */
 struct xdp_desc {
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c6ed2454f7ce..36919a254ba3 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -643,7 +643,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 static int xsk_mmap(struct file *file, struct socket *sock,
 		    struct vm_area_struct *vma)
 {
-	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	unsigned long size = vma->vm_end - vma->vm_start;
 	struct xdp_sock *xs = xdp_sk(sock->sk);
 	struct xsk_queue *q = NULL;

From c09290c5637692a9bfe7740e4c5e693efff12810 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 8 Jun 2018 00:06:01 +0200
Subject: [PATCH 12/23] bpf, xdp: fix crash in xdp_umem_unaccount_pages

syzkaller was able to trigger the following panic for AF_XDP:

  BUG: KASAN: null-ptr-deref in atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline]
  BUG: KASAN: null-ptr-deref in atomic_long_sub include/asm-generic/atomic-long.h:199 [inline]
  BUG: KASAN: null-ptr-deref in xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135
  Write of size 8 at addr 0000000000000060 by task syz-executor246/4527

  CPU: 1 PID: 4527 Comm: syz-executor246 Not tainted 4.17.0+ #89
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x1b9/0x294 lib/dump_stack.c:113
   kasan_report_error mm/kasan/report.c:352 [inline]
   kasan_report.cold.7+0x6d/0x2fe mm/kasan/report.c:412
   check_memory_region_inline mm/kasan/kasan.c:260 [inline]
   check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
   kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278
   atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline]
   atomic_long_sub include/asm-generic/atomic-long.h:199 [inline]
   xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135
   xdp_umem_reg net/xdp/xdp_umem.c:334 [inline]
   xdp_umem_create+0xd6c/0x10f0 net/xdp/xdp_umem.c:349
   xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531
   __sys_setsockopt+0x1bd/0x390 net/socket.c:1935
   __do_sys_setsockopt net/socket.c:1946 [inline]
   __se_sys_setsockopt net/socket.c:1943 [inline]
   __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943
   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

In xdp_umem_reg() the call to xdp_umem_account_pages() passed
with CAP_IPC_LOCK where we didn't need to end up charging rlimit
on memlock for the current user and therefore umem->user continues
to be NULL. Later on through fault injection syzkaller triggered
a failure in either umem->pgs or umem->pages allocation such that
we bail out and undo accounting in xdp_umem_unaccount_pages()
where we eventually hit the panic since it tries to deref the
umem->user.

The code is pretty close to mm_account_pinned_pages() and
mm_unaccount_pinned_pages() pair and potentially could reuse
it even in a later cleanup, and it appears that the initial
commit c0c77d8fb787 ("xsk: add user memory registration support
sockopt") got this right while later follow-up introduced the
bug via a49049ea2576 ("xsk: simplified umem setup").

Fixes: a49049ea2576 ("xsk: simplified umem setup")
Reported-by: syzbot+979217770b09ebf5c407@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/xdp/xdp_umem.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 7eb4948a38d2..b9ef487c4618 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -132,8 +132,10 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 
 static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
 {
-	atomic_long_sub(umem->npgs, &umem->user->locked_vm);
-	free_uid(umem->user);
+	if (umem->user) {
+		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+		free_uid(umem->user);
+	}
 }
 
 static void xdp_umem_release(struct xdp_umem *umem)

From 66e58e0ef80a56a1d7857b6ce121141563cdd93e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 7 Jun 2018 15:31:14 -0700
Subject: [PATCH 13/23] bpfilter: fix race in pipe access

syzbot reported the following crash
[  338.293946] bpfilter: read fail -512
[  338.304515] kasan: GPF could be caused by NULL-ptr deref or user memory access
[  338.311863] general protection fault: 0000 [#1] SMP KASAN
[  338.344360] RIP: 0010:__vfs_write+0x4a6/0x960
[  338.426363] Call Trace:
[  338.456967]  __kernel_write+0x10c/0x380
[  338.460928]  __bpfilter_process_sockopt+0x1d8/0x35b
[  338.487103]  bpfilter_mbox_request+0x4d/0xb0
[  338.491492]  bpfilter_ip_get_sockopt+0x6b/0x90

This can happen when multiple cpus are trying to talk to the user mode
process via bpfilter_mbox_request(). One cpu grabs the mutex while another
goes to sleep on the same mutex. Then the former cpu sees that the umh pipe
is down and shuts down the pipes. The latter cpu finally acquires the mutex
and crashes on the freed pipe.
Fix the race by using info.pid as an indicator that umh and pipes are healthy
and check it after acquiring the mutex.

Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
Reported-by: syzbot+7ade6c94abb2774c0fee@syzkaller.appspotmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/bpfilter_kern.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index b13d058f8c34..09522573f611 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -24,17 +24,19 @@ static void shutdown_umh(struct umh_info *info)
 {
 	struct task_struct *tsk;
 
+	if (!info->pid)
+		return;
 	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
 	if (tsk)
 		force_sig(SIGKILL, tsk);
 	fput(info->pipe_to_umh);
 	fput(info->pipe_from_umh);
+	info->pid = 0;
 }
 
 static void __stop_umh(void)
 {
-	if (IS_ENABLED(CONFIG_INET) &&
-	    bpfilter_process_sockopt) {
+	if (IS_ENABLED(CONFIG_INET)) {
 		bpfilter_process_sockopt = NULL;
 		shutdown_umh(&info);
 	}
@@ -55,7 +57,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
 	struct mbox_reply reply;
 	loff_t pos;
 	ssize_t n;
-	int ret;
+	int ret = -EFAULT;
 
 	req.is_set = is_set;
 	req.pid = current->pid;
@@ -63,6 +65,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
 	req.addr = (long)optval;
 	req.len = optlen;
 	mutex_lock(&bpfilter_lock);
+	if (!info.pid)
+		goto out;
 	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
 	if (n != sizeof(req)) {
 		pr_err("write fail %zd\n", n);

From bde4975310eb1982bd0bbff673989052d92fd481 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Wed, 6 Jun 2018 18:45:22 +0000
Subject: [PATCH 14/23] net: stmmac: fix build failure due to missing
 COMMON_CLK dependency

This patch fixes the build failure on m68k:
drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.o: In function `ipq806x_gmac_probe':
dwmac-ipq806x.c:(.text+0xda): undefined reference to `clk_set_rate'
drivers/net/ethernet/stmicro/stmmac/dwmac-rk.o: In function `rk_gmac_probe':
dwmac-rk.c:(.text+0x1e58): undefined reference to `clk_set_rate'
drivers/net/ethernet/stmicro/stmmac/dwmac-sti.o: In function `stid127_fix_retime_src':
dwmac-sti.c:(.text+0xd8): undefined reference to `clk_set_rate'
dwmac-sti.c:(.text+0x114): undefined reference to `clk_set_rate'
drivers/net/ethernet/stmicro/stmmac/dwmac-sti.o:dwmac-sti.c:(.text+0x12c): more undefined references to `clk_set_rate' follow
Lots of stmmac platform drivers need COMMON_CLK in their Kconfig depends.

Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/Kconfig | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig
index e28c0d2c58e9..cb5b0f58c395 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Kconfig
+++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig
@@ -33,7 +33,7 @@ config DWMAC_DWC_QOS_ETH
 	select PHYLIB
 	select CRC32
 	select MII
-	depends on OF && HAS_DMA
+	depends on OF && COMMON_CLK && HAS_DMA
 	help
 	  Support for chips using the snps,dwc-qos-ethernet.txt DT binding.
 
@@ -57,7 +57,7 @@ config DWMAC_ANARION
 config DWMAC_IPQ806X
 	tristate "QCA IPQ806x DWMAC support"
 	default ARCH_QCOM
-	depends on OF && (ARCH_QCOM || COMPILE_TEST)
+	depends on OF && COMMON_CLK && (ARCH_QCOM || COMPILE_TEST)
 	select MFD_SYSCON
 	help
 	  Support for QCA IPQ806X DWMAC Ethernet.
@@ -100,7 +100,7 @@ config DWMAC_OXNAS
 config DWMAC_ROCKCHIP
 	tristate "Rockchip dwmac support"
 	default ARCH_ROCKCHIP
-	depends on OF && (ARCH_ROCKCHIP || COMPILE_TEST)
+	depends on OF && COMMON_CLK && (ARCH_ROCKCHIP || COMPILE_TEST)
 	select MFD_SYSCON
 	help
 	  Support for Ethernet controller on Rockchip RK3288 SoC.
@@ -123,7 +123,7 @@ config DWMAC_SOCFPGA
 config DWMAC_STI
 	tristate "STi GMAC support"
 	default ARCH_STI
-	depends on OF && (ARCH_STI || COMPILE_TEST)
+	depends on OF && COMMON_CLK && (ARCH_STI || COMPILE_TEST)
 	select MFD_SYSCON
 	---help---
 	  Support for ethernet controller on STi SOCs.
@@ -147,7 +147,7 @@ config DWMAC_STM32
 config DWMAC_SUNXI
 	tristate "Allwinner GMAC support"
 	default ARCH_SUNXI
-	depends on OF && (ARCH_SUNXI || COMPILE_TEST)
+	depends on OF && COMMON_CLK && (ARCH_SUNXI || COMPILE_TEST)
 	---help---
 	  Support for Allwinner A20/A31 GMAC ethernet controllers.
 

From 58d813afbe89658a5972747460a5fe19dec4dbcb Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 7 Jun 2018 17:54:37 -0400
Subject: [PATCH 15/23] net: aquantia: fix unsigned numvecs comparison with
 less than zero

From: Colin Ian King <colin.king@canonical.com>

This was originally mistakenly submitted to net-next. Resubmitting to net.

The comparison of numvecs < 0 is always false because numvecs is a u32
and hence the error return from a failed call to pci_alloc_irq_vectors
is never detected.  Fix this by using the signed int err to handle the
error return and assigning err to numvecs on success.

Detected by CoverityScan, CID#1468650 ("Unsigned compared against 0")

Fixes: a09bd81b5413 ("net: aquantia: Limit number of vectors to actually allocated irqs")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
index a50e08bb4748..750007513f9d 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
@@ -267,14 +267,13 @@ static int aq_pci_probe(struct pci_dev *pdev,
 	numvecs = min(numvecs, num_online_cpus());
 	/*enable interrupts */
 #if !AQ_CFG_FORCE_LEGACY_INT
-	numvecs = pci_alloc_irq_vectors(self->pdev, 1, numvecs,
-					PCI_IRQ_MSIX | PCI_IRQ_MSI |
-					PCI_IRQ_LEGACY);
+	err = pci_alloc_irq_vectors(self->pdev, 1, numvecs,
+				    PCI_IRQ_MSIX | PCI_IRQ_MSI |
+				    PCI_IRQ_LEGACY);
 
-	if (numvecs < 0) {
-		err = numvecs;
+	if (err < 0)
 		goto err_hwinit;
-	}
+	numvecs = err;
 #endif
 	self->irqvecs = numvecs;
 

From 6310a882fbe0b87e0950222f2ac197ed92e11792 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 8 Jun 2018 10:58:25 +0800
Subject: [PATCH 16/23] net: fddi: fix a possible null-ptr-deref

bp->SharedMemAddr is set to NULL when bp->SharedMemSize is less than or
equal to 0, and the subsequent memset then triggers a null-ptr-deref.

Fix it by replacing pci_alloc_consistent with dma_zalloc_coherent.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/fddi/skfp/skfddi.c | 55 +++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/drivers/net/fddi/skfp/skfddi.c b/drivers/net/fddi/skfp/skfddi.c
index 2414f1dc8ddd..72433f3efc74 100644
--- a/drivers/net/fddi/skfp/skfddi.c
+++ b/drivers/net/fddi/skfp/skfddi.c
@@ -297,11 +297,11 @@ static int skfp_init_one(struct pci_dev *pdev,
 	return 0;
 err_out5:
 	if (smc->os.SharedMemAddr) 
-		pci_free_consistent(pdev, smc->os.SharedMemSize,
-				    smc->os.SharedMemAddr, 
-				    smc->os.SharedMemDMA);
-	pci_free_consistent(pdev, MAX_FRAME_SIZE,
-			    smc->os.LocalRxBuffer, smc->os.LocalRxBufferDMA);
+		dma_free_coherent(&pdev->dev, smc->os.SharedMemSize,
+				  smc->os.SharedMemAddr,
+				  smc->os.SharedMemDMA);
+	dma_free_coherent(&pdev->dev, MAX_FRAME_SIZE,
+			  smc->os.LocalRxBuffer, smc->os.LocalRxBufferDMA);
 err_out4:
 	free_netdev(dev);
 err_out3:
@@ -328,17 +328,17 @@ static void skfp_remove_one(struct pci_dev *pdev)
 	unregister_netdev(p);
 
 	if (lp->os.SharedMemAddr) {
-		pci_free_consistent(&lp->os.pdev,
-				    lp->os.SharedMemSize,
-				    lp->os.SharedMemAddr,
-				    lp->os.SharedMemDMA);
+		dma_free_coherent(&pdev->dev,
+				  lp->os.SharedMemSize,
+				  lp->os.SharedMemAddr,
+				  lp->os.SharedMemDMA);
 		lp->os.SharedMemAddr = NULL;
 	}
 	if (lp->os.LocalRxBuffer) {
-		pci_free_consistent(&lp->os.pdev,
-				    MAX_FRAME_SIZE,
-				    lp->os.LocalRxBuffer,
-				    lp->os.LocalRxBufferDMA);
+		dma_free_coherent(&pdev->dev,
+				  MAX_FRAME_SIZE,
+				  lp->os.LocalRxBuffer,
+				  lp->os.LocalRxBufferDMA);
 		lp->os.LocalRxBuffer = NULL;
 	}
 #ifdef MEM_MAPPED_IO
@@ -394,7 +394,9 @@ static  int skfp_driver_init(struct net_device *dev)
 	spin_lock_init(&bp->DriverLock);
 	
 	// Allocate invalid frame
-	bp->LocalRxBuffer = pci_alloc_consistent(&bp->pdev, MAX_FRAME_SIZE, &bp->LocalRxBufferDMA);
+	bp->LocalRxBuffer = dma_alloc_coherent(&bp->pdev.dev, MAX_FRAME_SIZE,
+					       &bp->LocalRxBufferDMA,
+					       GFP_ATOMIC);
 	if (!bp->LocalRxBuffer) {
 		printk("could not allocate mem for ");
 		printk("LocalRxBuffer: %d byte\n", MAX_FRAME_SIZE);
@@ -407,23 +409,22 @@ static  int skfp_driver_init(struct net_device *dev)
 	if (bp->SharedMemSize > 0) {
 		bp->SharedMemSize += 16;	// for descriptor alignment
 
-		bp->SharedMemAddr = pci_alloc_consistent(&bp->pdev,
-							 bp->SharedMemSize,
-							 &bp->SharedMemDMA);
+		bp->SharedMemAddr = dma_zalloc_coherent(&bp->pdev.dev,
+							bp->SharedMemSize,
+							&bp->SharedMemDMA,
+							GFP_ATOMIC);
 		if (!bp->SharedMemAddr) {
 			printk("could not allocate mem for ");
 			printk("hardware module: %ld byte\n",
 			       bp->SharedMemSize);
 			goto fail;
 		}
-		bp->SharedMemHeap = 0;	// Nothing used yet.
 
 	} else {
 		bp->SharedMemAddr = NULL;
-		bp->SharedMemHeap = 0;
-	}			// SharedMemSize > 0
+	}
 
-	memset(bp->SharedMemAddr, 0, bp->SharedMemSize);
+	bp->SharedMemHeap = 0;
 
 	card_stop(smc);		// Reset adapter.
 
@@ -442,15 +443,15 @@ static  int skfp_driver_init(struct net_device *dev)
 
 fail:
 	if (bp->SharedMemAddr) {
-		pci_free_consistent(&bp->pdev,
-				    bp->SharedMemSize,
-				    bp->SharedMemAddr,
-				    bp->SharedMemDMA);
+		dma_free_coherent(&bp->pdev.dev,
+				  bp->SharedMemSize,
+				  bp->SharedMemAddr,
+				  bp->SharedMemDMA);
 		bp->SharedMemAddr = NULL;
 	}
 	if (bp->LocalRxBuffer) {
-		pci_free_consistent(&bp->pdev, MAX_FRAME_SIZE,
-				    bp->LocalRxBuffer, bp->LocalRxBufferDMA);
+		dma_free_coherent(&bp->pdev.dev, MAX_FRAME_SIZE,
+				  bp->LocalRxBuffer, bp->LocalRxBufferDMA);
 		bp->LocalRxBuffer = NULL;
 	}
 	return err;

From 8d499533e0bc02d44283dbdab03142b599b8ba16 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 8 Jun 2018 05:02:31 +0200
Subject: [PATCH 17/23] net/sched: act_simple: fix parsing of TCA_DEF_DATA

Use nla_strlcpy() to avoid copying data beyond the length of the
TCA_DEF_DATA netlink attribute, in case the attribute is shorter than
SIMP_MAX_DATA and does not end with a '\0' character.

v2: fix errors in the commit message, thanks Hangbin Liu

Fixes: fa1b1cff3d06 ("net_cls_act: Make act_simple use of netlink policy.")
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_simple.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 9618b4a83cee..98c4afe7c15b 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -53,22 +53,22 @@ static void tcf_simp_release(struct tc_action *a)
 	kfree(d->tcfd_defdata);
 }
 
-static int alloc_defdata(struct tcf_defact *d, char *defdata)
+static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata)
 {
 	d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
 	if (unlikely(!d->tcfd_defdata))
 		return -ENOMEM;
-	strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+	nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
 	return 0;
 }
 
-static void reset_policy(struct tcf_defact *d, char *defdata,
+static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata,
 			 struct tc_defact *p)
 {
 	spin_lock_bh(&d->tcf_lock);
 	d->tcf_action = p->action;
 	memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
-	strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
+	nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
 	spin_unlock_bh(&d->tcf_lock);
 }
 
@@ -87,7 +87,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 	struct tcf_defact *d;
 	bool exists = false;
 	int ret = 0, err;
-	char *defdata;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -110,8 +109,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 		return -EINVAL;
 	}
 
-	defdata = nla_data(tb[TCA_DEF_DATA]);
-
 	if (!exists) {
 		ret = tcf_idr_create(tn, parm->index, est, a,
 				     &act_simp_ops, bind, false);
@@ -119,7 +116,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			return ret;
 
 		d = to_defact(*a);
-		ret = alloc_defdata(d, defdata);
+		ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
 		if (ret < 0) {
 			tcf_idr_release(*a, bind);
 			return ret;
@@ -133,7 +130,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 		if (!ovr)
 			return -EEXIST;
 
-		reset_policy(d, defdata, parm);
+		reset_policy(d, tb[TCA_DEF_DATA], parm);
 	}
 
 	if (ret == ACT_P_CREATED)

From 49c2c3f246e2fc3009039e31a826333dcd0283cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= <bjorn@mork.no>
Date: Fri, 8 Jun 2018 09:15:24 +0200
Subject: [PATCH 18/23] cdc_ncm: avoid padding beyond end of skb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 4a0e3e989d66 ("cdc_ncm: Add support for moving NDP to end
of NCM frame") added logic to reserve space for the NDP at the
end of the NTB/skb.  This reservation did not take the final
alignment of the NDP into account, causing us to reserve too
little space. Additionally the padding prior to NDP addition did
not ensure there was enough space for the NDP.

The NTB/skb with the NDP appended would then exceed the configured
max size. This caused the final padding of the NTB to use a
negative count, padding to almost INT_MAX, and resulting in:

[60103.825970] BUG: unable to handle kernel paging request at ffff9641f2004000
[60103.825998] IP: __memset+0x24/0x30
[60103.826001] PGD a6a06067 P4D a6a06067 PUD 4f65a063 PMD 72003063 PTE 0
[60103.826013] Oops: 0002 [#1] SMP NOPTI
[60103.826018] Modules linked in: (removed(
[60103.826158] CPU: 0 PID: 5990 Comm: Chrome_DevTools Tainted: G           O 4.14.0-3-amd64 #1 Debian 4.14.17-1
[60103.826162] Hardware name: LENOVO 20081 BIOS 41CN28WW(V2.04) 05/03/2012
[60103.826166] task: ffff964193484fc0 task.stack: ffffb2890137c000
[60103.826171] RIP: 0010:__memset+0x24/0x30
[60103.826174] RSP: 0000:ffff964316c03b68 EFLAGS: 00010216
[60103.826178] RAX: 0000000000000000 RBX: 00000000fffffffd RCX: 000000001ffa5000
[60103.826181] RDX: 0000000000000005 RSI: 0000000000000000 RDI: ffff9641f2003ffc
[60103.826184] RBP: ffff964192f6c800 R08: 00000000304d434e R09: ffff9641f1d2c004
[60103.826187] R10: 0000000000000002 R11: 00000000000005ae R12: ffff9642e6957a80
[60103.826190] R13: ffff964282ff2ee8 R14: 000000000000000d R15: ffff9642e4843900
[60103.826194] FS:  00007f395aaf6700(0000) GS:ffff964316c00000(0000) knlGS:0000000000000000
[60103.826197] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[60103.826200] CR2: ffff9641f2004000 CR3: 0000000013b0c000 CR4: 00000000000006f0
[60103.826204] Call Trace:
[60103.826212]  <IRQ>
[60103.826225]  cdc_ncm_fill_tx_frame+0x5e3/0x740 [cdc_ncm]
[60103.826236]  cdc_ncm_tx_fixup+0x57/0x70 [cdc_ncm]
[60103.826246]  usbnet_start_xmit+0x5d/0x710 [usbnet]
[60103.826254]  ? netif_skb_features+0x119/0x250
[60103.826259]  dev_hard_start_xmit+0xa1/0x200
[60103.826267]  sch_direct_xmit+0xf2/0x1b0
[60103.826273]  __dev_queue_xmit+0x5e3/0x7c0
[60103.826280]  ? ip_finish_output2+0x263/0x3c0
[60103.826284]  ip_finish_output2+0x263/0x3c0
[60103.826289]  ? ip_output+0x6c/0xe0
[60103.826293]  ip_output+0x6c/0xe0
[60103.826298]  ? ip_forward_options+0x1a0/0x1a0
[60103.826303]  tcp_transmit_skb+0x516/0x9b0
[60103.826309]  tcp_write_xmit+0x1aa/0xee0
[60103.826313]  ? sch_direct_xmit+0x71/0x1b0
[60103.826318]  tcp_tasklet_func+0x177/0x180
[60103.826325]  tasklet_action+0x5f/0x110
[60103.826332]  __do_softirq+0xde/0x2b3
[60103.826337]  irq_exit+0xae/0xb0
[60103.826342]  do_IRQ+0x81/0xd0
[60103.826347]  common_interrupt+0x98/0x98
[60103.826351]  </IRQ>
[60103.826355] RIP: 0033:0x7f397bdf2282
[60103.826358] RSP: 002b:00007f395aaf57d8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff6e
[60103.826362] RAX: 0000000000000000 RBX: 00002f07bc6d0900 RCX: 00007f39752d7fe7
[60103.826365] RDX: 0000000000000022 RSI: 0000000000000147 RDI: 00002f07baea02c0
[60103.826368] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000
[60103.826371] R10: 00000000ffffffff R11: 0000000000000000 R12: 00002f07baea02c0
[60103.826373] R13: 00002f07bba227a0 R14: 00002f07bc6d090c R15: 0000000000000000
[60103.826377] Code: 90 90 90 90 90 90 90 0f 1f 44 00 00 49 89 f9 48 89 d1 83
e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 <f3> 48
ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1
[60103.826442] RIP: __memset+0x24/0x30 RSP: ffff964316c03b68
[60103.826444] CR2: ffff9641f2004000

Commit e1069bbfcf3b ("net: cdc_ncm: Reduce memory use when kernel
memory low") made this bug much more likely to trigger by reducing
the NTB size under memory pressure.

Link: https://bugs.debian.org/893393
Reported-by: Горбешко Богдан <bodqhrohro@gmail.com>
Reported-and-tested-by: Dennis Wassenberg <dennis.wassenberg@secunet.com>
Cc: Enrico Mioso <mrkiko.rs@gmail.com>
Fixes: 4a0e3e989d66 ("cdc_ncm: Add support for moving NDP to end of NCM frame")
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ncm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 90d07ed224d5..b0e8b9613054 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -1124,7 +1124,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	 * accordingly. Otherwise, we should check here.
 	 */
 	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
-		delayed_ndp_size = ctx->max_ndp_size;
+		delayed_ndp_size = ALIGN(ctx->max_ndp_size, ctx->tx_ndp_modulus);
 	else
 		delayed_ndp_size = 0;
 
@@ -1285,7 +1285,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	/* If requested, put NDP at end of frame. */
 	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
 		nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data;
-		cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size);
+		cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size);
 		nth16->wNdpIndex = cpu_to_le16(skb_out->len);
 		skb_put_data(skb_out, ctx->delayed_ndp16, ctx->max_ndp_size);
 

From 6c206b20092a3623184cff9470dba75d21507874 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 8 Jun 2018 11:35:40 +0200
Subject: [PATCH 19/23] udp: fix rx queue len reported by diag and proc
 interface

After commit 6b229cf77d68 ("udp: add batching to udp_rmem_release()")
the sk_rmem_alloc field no longer measures the receive queue length
exactly, because we batch the rmem release. The issue is really
apparent only after commit 0d4a6608f68c ("udp: do rmem bulk free even
if the rx sk queue is empty"): user space can easily observe an empty
socket with a non-zero queue length reported by the 'ss' tool or the
procfs interface.

We need to use a custom UDP helper to report the correct queue length,
taking into account the forward allocation deficit.

Reported-by: trevor.francis@46labs.com
Fixes: 6b229cf77d68 ("UDP: add batching to udp_rmem_release()")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/transp_v6.h | 11 +++++++++--
 include/net/udp.h       |  5 +++++
 net/ipv4/udp.c          |  2 +-
 net/ipv4/udp_diag.c     |  2 +-
 net/ipv6/datagram.c     |  6 +++---
 net/ipv6/udp.c          |  3 ++-
 6 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index c4f5caaf3778..f6a3543e5247 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -45,8 +45,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg,
 			  struct flowi6 *fl6, struct ipcm6_cookie *ipc6,
 			  struct sockcm_cookie *sockc);
 
-void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
-			     __u16 srcp, __u16 destp, int bucket);
+void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
+			       __u16 srcp, __u16 destp, int rqueue, int bucket);
+static inline void
+ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp,
+			__u16 destp, int bucket)
+{
+	__ip6_dgram_sock_seq_show(seq, sp, srcp, destp, sk_rmem_alloc_get(sp),
+				  bucket);
+}
 
 #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006)
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 7ba0ed252c52..b1ea8b0f5e6a 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -247,6 +247,11 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 	return htons((((u64) hash * (max - min)) >> 32) + min);
 }
 
+static inline int udp_rqueue_get(struct sock *sk)
+{
+	return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
+}
+
 /* net/ipv4/udp.c */
 void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3365362cac88..9bb27df4dac5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2772,7 +2772,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
-		sk_rmem_alloc_get(sp),
+		udp_rqueue_get(sp),
 		0, 0L, 0,
 		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
 		0, sock_i_ino(sp),
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index d0390d844ac8..d9ad986c7b2c 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -163,7 +163,7 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
 static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 		void *info)
 {
-	r->idiag_rqueue = sk_rmem_alloc_get(sk);
+	r->idiag_rqueue = udp_rqueue_get(sk);
 	r->idiag_wqueue = sk_wmem_alloc_get(sk);
 }
 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index a02ad100f0d7..2ee08b6a86a4 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -1019,8 +1019,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl);
 
-void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
-			     __u16 srcp, __u16 destp, int bucket)
+void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
+			       __u16 srcp, __u16 destp, int rqueue, int bucket)
 {
 	const struct in6_addr *dest, *src;
 
@@ -1036,7 +1036,7 @@ void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
 		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
 		   sp->sk_state,
 		   sk_wmem_alloc_get(sp),
-		   sk_rmem_alloc_get(sp),
+		   rqueue,
 		   0, 0L, 0,
 		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
 		   0,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 164afd31aebf..e6645cae403e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1523,7 +1523,8 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 		struct inet_sock *inet = inet_sk(v);
 		__u16 srcp = ntohs(inet->inet_sport);
 		__u16 destp = ntohs(inet->inet_dport);
-		ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
+		__ip6_dgram_sock_seq_show(seq, v, srcp, destp,
+					  udp_rqueue_get(v), bucket);
 	}
 	return 0;
 }

From 873aca2ee86e533665a73292c4e308ded1e9bafe Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 8 Jun 2018 15:11:47 +0200
Subject: [PATCH 20/23] net: bridge: Fix locking in br_fdb_find_port()

Callers of br_fdb_find() need to hold the hash lock, which
br_fdb_find_port() doesn't do. However, since br_fdb_find_port() is not
doing any actual FDB manipulation, the hash lock is not really needed at
all. So convert to br_fdb_find_rcu(), surrounded by rcu_read_lock() /
_unlock() pair.

The device pointer copied from inside the FDB entry is then kept alive
by the RTNL lock, which br_fdb_find_port() asserts.

Fixes: 4d4fd36126d6 ("net: bridge: Publish bridge accessor functions")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_fdb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b19e3104afd6..502f66349530 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -135,9 +135,11 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev,
 		return NULL;
 
 	br = netdev_priv(br_dev);
-	f = br_fdb_find(br, addr, vid);
+	rcu_read_lock();
+	f = br_fdb_find_rcu(br, addr, vid);
 	if (f && f->dst)
 		dev = f->dst->dev;
+	rcu_read_unlock();
 
 	return dev;
 }

From 6d8c50dcb029872b298eea68cc6209c866fd3e14 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 7 Jun 2018 13:39:49 -0700
Subject: [PATCH 21/23] socket: close race condition between sock_close() and
 sockfs_setattr()

fchownat() doesn't even hold refcnt of fd until it figures out
fd is really needed (otherwise is ignored) and releases it after
it resolves the path. This means sock_close() could race with
sockfs_setattr(), which leads to a NULL pointer dereference
since typically we set sock->sk to NULL in ->release().

As pointed out by Al, this is unique to sockfs. So we can fix this
in socket layer by acquiring inode_lock in sock_close() and
checking against NULL in sockfs_setattr().

sock_release() is called in many places, only the sock_close()
path matters here. And fortunately, this should not affect normal
sock_close() as it is only called when the last fd refcnt is gone.
It only affects sock_close() with a parallel sockfs_setattr() in
progress, which is not common.

Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.")
Reported-by: shankarapailoor <shankarapailoor@gmail.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Lorenzo Colitti <lorenzo@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index af57d85bcb48..8a109012608a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -541,7 +541,10 @@ static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (!err && (iattr->ia_valid & ATTR_UID)) {
 		struct socket *sock = SOCKET_I(d_inode(dentry));
 
-		sock->sk->sk_uid = iattr->ia_uid;
+		if (sock->sk)
+			sock->sk->sk_uid = iattr->ia_uid;
+		else
+			err = -ENOENT;
 	}
 
 	return err;
@@ -590,12 +593,16 @@ EXPORT_SYMBOL(sock_alloc);
  *	an inode not a file.
  */
 
-void sock_release(struct socket *sock)
+static void __sock_release(struct socket *sock, struct inode *inode)
 {
 	if (sock->ops) {
 		struct module *owner = sock->ops->owner;
 
+		if (inode)
+			inode_lock(inode);
 		sock->ops->release(sock);
+		if (inode)
+			inode_unlock(inode);
 		sock->ops = NULL;
 		module_put(owner);
 	}
@@ -609,6 +616,11 @@ void sock_release(struct socket *sock)
 	}
 	sock->file = NULL;
 }
+
+void sock_release(struct socket *sock)
+{
+	__sock_release(sock, NULL);
+}
 EXPORT_SYMBOL(sock_release);
 
 void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
@@ -1171,7 +1183,7 @@ static int sock_mmap(struct file *file, struct vm_area_struct *vma)
 
 static int sock_close(struct inode *inode, struct file *filp)
 {
-	sock_release(SOCKET_I(inode));
+	__sock_release(SOCKET_I(inode), inode);
 	return 0;
 }
 

From b718e8c8f4f5920aaddc2e52d5e32f494c91129c Mon Sep 17 00:00:00 2001
From: Alvaro Gamez Machado <alvaro.gamez@hazent.com>
Date: Fri, 8 Jun 2018 12:23:39 +0200
Subject: [PATCH 22/23] net: phy: dp83848: use BMCR_ANENABLE instead of
 BMSR_ANEGCAPABLE for DP83620

The DP83620 register set is compatible with the DP83848, but it also
supports 100base-FX. When the hardware is configured such that fiber
mode is enabled, autonegotiation is not possible.

The chip, however, doesn't expose this information via BMSR_ANEGCAPABLE.
Instead, this bit is always set high, even if the particular hardware
configuration makes it so that auto negotiation is not possible [1]. Under
these circumstances, the phy subsystem keeps trying for autonegotiation to
happen, without success.

Therefore, we inspect the BMCR_ANENABLE bit after genphy_config_init; on
reset it is set to 0 when auto negotiation is disabled, so we use this
value instead of BMSR_ANEGCAPABLE.

[1] https://e2e.ti.com/support/interface/ethernet/f/903/p/697165/2571170

Signed-off-by: Alvaro Gamez Machado <alvaro.gamez@hazent.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/dp83848.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
index cd09c3af2117..6e8e42361fd5 100644
--- a/drivers/net/phy/dp83848.c
+++ b/drivers/net/phy/dp83848.c
@@ -74,6 +74,25 @@ static int dp83848_config_intr(struct phy_device *phydev)
 	return phy_write(phydev, DP83848_MICR, control);
 }
 
+static int dp83848_config_init(struct phy_device *phydev)
+{
+	int err;
+	int val;
+
+	err = genphy_config_init(phydev);
+	if (err < 0)
+		return err;
+
+	/* DP83620 always reports Auto Negotiation Ability on BMSR. Instead,
+	 * we check initial value of BMCR Auto negotiation enable bit
+	 */
+	val = phy_read(phydev, MII_BMCR);
+	if (!(val & BMCR_ANENABLE))
+		phydev->autoneg = AUTONEG_DISABLE;
+
+	return 0;
+}
+
 static struct mdio_device_id __maybe_unused dp83848_tbl[] = {
 	{ TI_DP83848C_PHY_ID, 0xfffffff0 },
 	{ NS_DP83848C_PHY_ID, 0xfffffff0 },
@@ -83,7 +102,7 @@ static struct mdio_device_id __maybe_unused dp83848_tbl[] = {
 };
 MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
 
-#define DP83848_PHY_DRIVER(_id, _name)				\
+#define DP83848_PHY_DRIVER(_id, _name, _config_init)		\
 	{							\
 		.phy_id		= _id,				\
 		.phy_id_mask	= 0xfffffff0,			\
@@ -92,7 +111,7 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
 		.flags		= PHY_HAS_INTERRUPT,		\
 								\
 		.soft_reset	= genphy_soft_reset,		\
-		.config_init	= genphy_config_init,		\
+		.config_init	= _config_init,			\
 		.suspend	= genphy_suspend,		\
 		.resume		= genphy_resume,		\
 								\
@@ -102,10 +121,14 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
 	}
 
 static struct phy_driver dp83848_driver[] = {
-	DP83848_PHY_DRIVER(TI_DP83848C_PHY_ID, "TI DP83848C 10/100 Mbps PHY"),
-	DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY"),
-	DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY"),
-	DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY"),
+	DP83848_PHY_DRIVER(TI_DP83848C_PHY_ID, "TI DP83848C 10/100 Mbps PHY",
+			   genphy_config_init),
+	DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY",
+			   genphy_config_init),
+	DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY",
+			   dp83848_config_init),
+	DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY",
+			   genphy_config_init),
 };
 module_phy_driver(dp83848_driver);
 

From 867f816badc01e6da655028810d468c9f935b37c Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Fri, 8 Jun 2018 22:47:10 -0400
Subject: [PATCH 23/23] tcp: limit sk_rcvlowat by the maximum receive buffer

The user-provided value to setsockopt(SO_RCVLOWAT) can be
larger than the maximum possible receive buffer. Such values
mute POLLIN signals on the socket, which can stall progress
on the socket.

Limit the user-provided value to half of the maximum receive
buffer, i.e., half of sk_rcvbuf when the receive buffer size
is set by the user, or otherwise half of sysctl_tcp_rmem[2].

Fixes: d1361840f8c5 ("tcp: fix SO_RCVLOWAT and RCVBUF autotuning")
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2741953adaba..141acd92e58a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1694,6 +1694,13 @@ EXPORT_SYMBOL(tcp_peek_len);
 /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
 int tcp_set_rcvlowat(struct sock *sk, int val)
 {
+	int cap;
+
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+		cap = sk->sk_rcvbuf >> 1;
+	else
+		cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+	val = min(val, cap);
 	sk->sk_rcvlowat = val ? : 1;
 
 	/* Check if we need to signal EPOLLIN right now */
@@ -1702,12 +1709,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		return 0;
 
-	/* val comes from user space and might be close to INT_MAX */
 	val <<= 1;
-	if (val < 0)
-		val = INT_MAX;
-
-	val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
 	if (val > sk->sk_rcvbuf) {
 		sk->sk_rcvbuf = val;
 		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);