From 58990d1ff3f7896ee341030e9a7c2e4002570683 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 7 Jun 2018 17:40:03 +0200 Subject: [PATCH 01/23] bpf: reject passing modified ctx to helper functions As commit 28e33f9d78ee ("bpf: disallow arithmetic operations on context pointer") already describes, f1174f77b50c ("bpf/verifier: rework value tracking") removed the specific white-listed cases we had previously where we would allow for pointer arithmetic in order to further generalize it, and allow e.g. context access via modified registers. While the dereferencing of modified context pointers had been forbidden through 28e33f9d78ee, syzkaller did recently manage to trigger several KASAN splats for slab out of bounds access and use after frees by simply passing a modified context pointer to a helper function which would then do the bad access since verifier allowed it in adjust_ptr_min_max_vals(). Rejecting arithmetic on ctx pointer in adjust_ptr_min_max_vals() generally could break existing programs as there's a valid use case in tracing in combination with passing the ctx to helpers as bpf_probe_read(), where the register then becomes unknown at verification time due to adding a non-constant offset to it. An access sequence may look like the following: offset = args->filename; /* field __data_loc filename */ bpf_probe_read(&dst, len, (char *)args + offset); // args is ctx There are two options: i) we could special case the ctx and as soon as we add a constant or bounded offset to it (hence ctx type wouldn't change) we could turn the ctx into an unknown scalar, or ii) we generalize the sanity test for ctx member access into a small helper and assert it on the ctx register that was passed as a function argument. Fwiw, latter is more obvious and less complex at the same time, and one case that may potentially be legitimate in future for ctx member access at least would be for ctx to carry a const offset. Therefore, fix follows approach from ii) and adds test cases to BPF kselftests. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Reported-by: syzbot+3d0b2441dbb71751615e@syzkaller.appspotmail.com Reported-by: syzbot+c8504affd4fdd0c1b626@syzkaller.appspotmail.com Reported-by: syzbot+e5190cb881d8660fb1a3@syzkaller.appspotmail.com Reported-by: syzbot+efae31b384d5badbd620@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 48 +++++++++++------ tools/testing/selftests/bpf/test_verifier.c | 58 ++++++++++++++++++++- 2 files changed, 88 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6403b5166f4..cced0c1e63e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1617,6 +1617,30 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif +static int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + /* Access to ctx or passing it to a helper is only allowed in + * its original, unmodified form. + */ + + if (reg->off) { + verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", + regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + return -EACCES; + } + + return 0; +} + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1686,24 +1710,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } - /* ctx accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - */ - if (reg->off) { - verbose(env, - "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", - regno, reg->off, off - reg->off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, - "variable ctx access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a @@ -1984,6 +1995,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 7cb1d74057ce..2ecd27b670d7 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8647,7 +8647,7 @@ static struct bpf_test tests[] = { offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr = "dereference of modified ctx ptr R1 off=68+8, ctx+const is allowed, ctx+const+const is not", + .errstr = "dereference of modified ctx ptr", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -12258,6 +12258,62 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 5, }, + { + "pass unmodified ctx pointer to helper", + .insns = { + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_update), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + }, + { + "pass modified ctx pointer to helper, 1", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_update), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "dereference of modified ctx ptr", + }, + { + "pass modified ctx pointer to helper, 2", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_get_socket_cookie), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result_unpriv = REJECT, + .result = REJECT, + .errstr_unpriv = "dereference of modified ctx ptr", + .errstr = "dereference of modified ctx ptr", + }, + { + "pass modified ctx pointer to helper, 3", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_csum_update), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "variable ctx access var_off=(0x0; 0x4)", + }, }; static int probe_filter_length(const struct bpf_insn *fp) From 7eced5ab5a7366ee7ca5360b3eca9d220c2b2887 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 3 Jun 2018 12:06:57 +0200 Subject: [PATCH 02/23] netfilter: nf_tables: add NFT_LOGLEVEL_* enumeration and use it This is internal, not exposed through uapi, and although it maps with userspace LOG_*, with the introduction of LOGLEVEL_AUDIT we are incurring in namespace pollution. This patch adds the NFT_LOGLEVEL_ enumeration and use it from nft_log. Fixes: 1a893b44de45 ("netfilter: nf_tables: Add audit support to log statement") Signed-off-by: Pablo Neira Ayuso Acked-by: Phil Sutter Signed-off-by: David S. Miller --- include/uapi/linux/netfilter/nf_tables.h | 26 ++++++++++++++++++++++-- net/netfilter/nft_log.c | 10 ++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ae00a3c49b8a..c9bf74b94f37 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1099,9 +1099,31 @@ enum nft_log_attributes { #define NFTA_LOG_MAX (__NFTA_LOG_MAX - 1) /** - * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging + * enum nft_log_level - nf_tables log levels + * + * @NFT_LOGLEVEL_EMERG: system is unusable + * @NFT_LOGLEVEL_ALERT: action must be taken immediately + * @NFT_LOGLEVEL_CRIT: critical conditions + * @NFT_LOGLEVEL_ERR: error conditions + * @NFT_LOGLEVEL_WARNING: warning conditions + * @NFT_LOGLEVEL_NOTICE: normal but significant condition + * @NFT_LOGLEVEL_INFO: informational + * @NFT_LOGLEVEL_DEBUG: debug-level messages + * @NFT_LOGLEVEL_AUDIT: enabling audit logging */ -#define LOGLEVEL_AUDIT 8 +enum nft_log_level { + NFT_LOGLEVEL_EMERG, + NFT_LOGLEVEL_ALERT, + NFT_LOGLEVEL_CRIT, + NFT_LOGLEVEL_ERR, + NFT_LOGLEVEL_WARNING, + NFT_LOGLEVEL_NOTICE, + NFT_LOGLEVEL_INFO, + NFT_LOGLEVEL_DEBUG, + NFT_LOGLEVEL_AUDIT, + __NFT_LOGLEVEL_MAX +}; +#define NFT_LOGLEVEL_MAX (__NFT_LOGLEVEL_MAX + 1) /** * enum nft_queue_attributes - nf_tables queue expression netlink attributes diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 7eef1cffbf1b..655187bed5d8 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -111,7 +111,7 @@ static void nft_log_eval(const struct nft_expr *expr, const struct nft_log *priv = nft_expr_priv(expr); if (priv->loginfo.type == NF_LOG_TYPE_LOG && - priv->loginfo.u.log.level == LOGLEVEL_AUDIT) { + priv->loginfo.u.log.level == NFT_LOGLEVEL_AUDIT) { nft_log_eval_audit(pkt); return; } @@ -166,9 +166,9 @@ static int nft_log_init(const struct nft_ctx *ctx, li->u.log.level = ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL])); } else { - li->u.log.level = LOGLEVEL_WARNING; + li->u.log.level = NFT_LOGLEVEL_WARNING; } - if (li->u.log.level > LOGLEVEL_AUDIT) { + if (li->u.log.level > NFT_LOGLEVEL_AUDIT) { err = -EINVAL; goto err1; } @@ -196,7 +196,7 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } - if (li->u.log.level == LOGLEVEL_AUDIT) + if (li->u.log.level == NFT_LOGLEVEL_AUDIT) return 0; err = nf_logger_find_get(ctx->family, li->type); @@ -220,7 +220,7 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); - if (li->u.log.level == LOGLEVEL_AUDIT) + if (li->u.log.level == NFT_LOGLEVEL_AUDIT) return; nf_logger_put(ctx->family, li->type); From fd3a88625844907151737fc3b4201676effa6d27 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 6 Jun 2018 11:23:01 -0400 Subject: [PATCH 03/23] net: in virtio_net_hdr only add VLAN_HLEN to csum_start if payload holds vlan Tun, tap, virtio, packet and uml vector all use struct virtio_net_hdr to communicate packet metadata to userspace. For skbuffs with vlan, the first two return the packet as it may have existed on the wire, inserting the VLAN tag in the user buffer. Then virtio_net_hdr.csum_start needs to be adjusted by VLAN_HLEN bytes. Commit f09e2249c4f5 ("macvtap: restore vlan header on user read") added this feature to macvtap. Commit 3ce9b20f1971 ("macvtap: Fix csum_start when VLAN tags are present") then fixed up csum_start. Virtio, packet and uml do not insert the vlan header in the user buffer. When introducing virtio_net_hdr_from_skb to deduplicate filling in the virtio_net_hdr, the variant from macvtap which adds VLAN_HLEN was applied uniformly, breaking csum offset for packets with vlan on virtio and packet. Make insertion of VLAN_HLEN optional. Convert the callers to pass it when needed. Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Fixes: 1276f24eeef2 ("packet: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- arch/um/drivers/vector_transports.c | 3 ++- drivers/net/tap.c | 5 ++++- drivers/net/tun.c | 3 ++- drivers/net/virtio_net.c | 3 ++- include/linux/virtio_net.h | 11 ++++------- net/packet/af_packet.c | 4 ++-- 6 files changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/um/drivers/vector_transports.c b/arch/um/drivers/vector_transports.c index 9065047f844b..77e4ebc206ae 100644 --- a/arch/um/drivers/vector_transports.c +++ b/arch/um/drivers/vector_transports.c @@ -120,7 +120,8 @@ static int raw_form_header(uint8_t *header, skb, vheader, virtio_legacy_is_little_endian(), - false + false, + 0 ); return 0; diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 9b6cb780affe..f0f7cd977667 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -774,13 +774,16 @@ static ssize_t tap_put_user(struct tap_queue *q, int total; if (q->flags & IFF_VNET_HDR) { + int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; struct virtio_net_hdr vnet_hdr; + vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &vnet_hdr, - tap_is_little_endian(q), true)) + tap_is_little_endian(q), true, + vlan_hlen)) BUG(); if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 85e14adf5207..a192a017cc68 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2089,7 +2089,8 @@ static ssize_t tun_put_user(struct tun_struct *tun, return -EINVAL; if (virtio_net_hdr_from_skb(skb, &gso, - tun_is_little_endian(tun), true)) { + tun_is_little_endian(tun), true, + vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); pr_err("unexpected GSO type: " "0x%x, gso_size %d, hdr_len %d\n", diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2aaa18ec7d46..1619ee3070b6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1411,7 +1411,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) hdr = skb_vnet_hdr(skb); if (virtio_net_hdr_from_skb(skb, &hdr->hdr, - virtio_is_little_endian(vi->vdev), false)) + virtio_is_little_endian(vi->vdev), false, + 0)) BUG(); if (vi->mergeable_rx_bufs) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index f144216febc6..9397628a1967 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -58,7 +58,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, - bool has_data_valid) + bool has_data_valid, + int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ @@ -83,12 +84,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - if (skb_vlan_tag_present(skb)) - hdr->csum_start = __cpu_to_virtio16(little_endian, - skb_checksum_start_offset(skb) + VLAN_HLEN); - else - hdr->csum_start = __cpu_to_virtio16(little_endian, - skb_checksum_start_offset(skb)); + hdr->csum_start = __cpu_to_virtio16(little_endian, + skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 54ce66f68482..ee018564b2b4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2005,7 +2005,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, return -EINVAL; *len -= sizeof(vnet_hdr); - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true)) + if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); @@ -2272,7 +2272,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (do_vnet) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), - vio_le(), true)) { + vio_le(), true, 0)) { spin_lock(&sk->sk_receive_queue.lock); goto drop_n_account; } From 52acf73b6e9a6962045feb2ba5a8921da2201915 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Wed, 6 Jun 2018 21:32:51 +0000 Subject: [PATCH 04/23] hv_netvsc: Fix a network regression after ifdown/ifup Recently people reported the NIC stops working after "ifdown eth0; ifup eth0". It turns out in this case the TX queues are not enabled, after the refactoring of the common detach logic: when the NIC has sub-channels, usually we enable all the TX queues after all sub-channels are set up: see rndis_set_subchannel() -> netif_device_attach(), but in the case of "ifdown eth0; ifup eth0" where the number of channels doesn't change, we also must make sure the TX queues are enabled. The patch fixes the regression. Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic") Signed-off-by: Dexuan Cui Cc: Stephen Hemminger Cc: K. Y. Srinivasan Cc: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index bef4d55a108c..8eec156418ea 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -127,8 +127,10 @@ static int netvsc_open(struct net_device *net) } rdev = nvdev->extension; - if (!rdev->link_state) + if (!rdev->link_state) { netif_carrier_on(net); + netif_tx_wake_all_queues(net); + } if (vf_netdev) { /* Setting synthetic device up transparently sets From 000ade8016400d93b4d7c89970d96b8c14773d45 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 6 Jun 2018 15:56:54 -0700 Subject: [PATCH 05/23] ip_tunnel: Fix name string concatenate in __ip_tunnel_create() By passing a limit of 2 bytes to strncat, strncat is limited to writing fewer bytes than what it's supposed to append to the name here. Since the bounds are checked on the line above this, just remove the string bounds checks entirely since they're unneeded. Signed-off-by: Sultan Alsawaf Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 38d906baf1df..c4f5602308ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -261,8 +261,8 @@ static struct net_device *__ip_tunnel_create(struct net *net, } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; - strlcpy(name, ops->kind, IFNAMSIZ); - strncat(name, "%d", 2); + strcpy(name, ops->kind); + strcat(name, "%d"); } ASSERT_RTNL(); From eb55bbf865d9979098c6a7a17cbdb41237ece951 Mon Sep 17 00:00:00 2001 From: Xiangning Yu Date: Thu, 7 Jun 2018 13:39:59 +0800 Subject: [PATCH 06/23] bonding: re-evaluate force_primary when the primary slave name changes There is a timing issue under active-standy mode, when bond_enslave() is called, bond->params.primary might not be initialized yet. Any time the primary slave string changes, bond->force_primary should be set to true to make sure the primary becomes the active slave. Signed-off-by: Xiangning Yu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_options.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 8a945c9341d6..98663c50ded0 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1142,6 +1142,7 @@ static int bond_option_primary_set(struct bonding *bond, slave->dev->name); rcu_assign_pointer(bond->primary_slave, slave); strcpy(bond->params.primary, slave->dev->name); + bond->force_primary = true; bond_select_active_slave(bond); goto out; } From 2ac0e1524d9585f5261cebecca262da6cedc1d85 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 Jun 2018 15:10:30 +0200 Subject: [PATCH 07/23] net: mscc: ocelot: Fix uninitialized error in ocelot_netdevice_event() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2: drivers/net/ethernet/mscc/ocelot.c: In function ‘ocelot_netdevice_event’: drivers/net/ethernet/mscc/ocelot.c:1129: warning: ‘ret’ may be used uninitialized in this function If the list iterated over by netdev_for_each_lower_dev() is empty, ret is never initialized, and converted into a notifier return value. Fix this by preinitializing ret to zero. Fixes: a556c76adc052c97 ("net: mscc: Add initial Ocelot switch support") Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index c8c74aa548d9..fb2c8f8071e6 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1126,7 +1126,7 @@ static int ocelot_netdevice_event(struct notifier_block *unused, { struct netdev_notifier_changeupper_info *info = ptr; struct net_device *dev = netdev_notifier_info_to_dev(ptr); - int ret; + int ret = 0; if (netif_is_lag_master(dev)) { struct net_device *slave; From bf956be520fb534510e31564231163aa05f7f091 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 10:23:10 -0700 Subject: [PATCH 08/23] umh: fix race condition kasan reported use-after-free: BUG: KASAN: use-after-free in call_usermodehelper_exec_work+0x2d3/0x310 kernel/umh.c:195 Write of size 4 at addr ffff8801d9202370 by task kworker/u4:2/50 Workqueue: events_unbound call_usermodehelper_exec_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_store4_noabort+0x17/0x20 mm/kasan/report.c:437 call_usermodehelper_exec_work+0x2d3/0x310 kernel/umh.c:195 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 The reason is that 'sub_info' cannot be accessed out of parent task context, since it will be freed by the child. Instead remember the pid in the child task. Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper") Reported-by: syzbot+2c73319c406f1987d156@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/umh.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/umh.c b/kernel/umh.c index 30db93fd7e39..c449858946af 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -99,6 +99,7 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); + sub_info->pid = task_pid_nr(current); if (sub_info->file) retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); @@ -191,8 +192,6 @@ static void call_usermodehelper_exec_work(struct work_struct *work) if (pid < 0) { sub_info->retval = pid; umh_complete(sub_info); - } else { - sub_info->pid = pid; } } } From 8d97ca6b6755bf7ef57d323642ca9ee80d689782 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 10:29:29 -0700 Subject: [PATCH 09/23] bpfilter: fix OUTPUT_FORMAT CONFIG_OUTPUT_FORMAT is x86 only macro. Used objdump to extract elf file format. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Reported-by: David S. Miller Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index aafa72001fcd..e0bbe7583e58 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -21,7 +21,7 @@ endif # which bpfilter_kern.c passes further into umh blob loader at run-time quiet_cmd_copy_umh = GEN $@ cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ - $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \ + $(OBJCOPY) -I binary -O `$(OBJDUMP) -f $<|grep format|cut -d' ' -f8` \ -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ --rename-section .data=.init.rodata $< $@ From 23316a366e1654e4ad05817c6075bc1019efb30a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 6 Jun 2018 09:12:44 -0700 Subject: [PATCH 10/23] tools/bpf: fix selftest get_cgroup_id_user Commit f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper") added a test for bpf_get_current_cgroup_id() helper. The bpf program is attached to tracepoint syscalls/sys_enter_nanosleep and will record the cgroup id if the tracepoint is hit. The test program creates a cgroup and attachs itself to this cgroup and expects that the test program process cgroup id is the same as the cgroup_id retrieved by the bpf program. In a light system where no other processes called nanosleep syscall, the test case can pass. In a busy system where many different processes can hit syscalls/sys_enter_nanosleep tracepoint, the cgroup id recorded by bpf program may not match the test program process cgroup_id. This patch fixed an issue by communicating the test program pid to bpf program. The bpf program only records cgroup id if the current task pid is the same as passed-in pid. This ensures that the recorded cgroup_id is for the cgroup within which the test program resides. Fixes: f269099a7e7a ("tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/get_cgroup_id_kern.c | 14 +++++++++++++- tools/testing/selftests/bpf/get_cgroup_id_user.c | 12 ++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/get_cgroup_id_kern.c index 2cf8cb23f209..014dba10b8a5 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_kern.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_kern.c @@ -11,12 +11,24 @@ struct bpf_map_def SEC("maps") cg_ids = { .max_entries = 1, }; +struct bpf_map_def SEC("maps") pidmap = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + SEC("tracepoint/syscalls/sys_enter_nanosleep") int trace(void *ctx) { - __u32 key = 0; + __u32 pid = bpf_get_current_pid_tgid(); + __u32 key = 0, *expected_pid; __u64 *val; + expected_pid = bpf_map_lookup_elem(&pidmap, &key); + if (!expected_pid || *expected_pid != pid) + return 0; + val = bpf_map_lookup_elem(&cg_ids, &key); if (val) *val = bpf_get_current_cgroup_id(); diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index ea19a42e5894..e8da7b39158d 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -50,13 +50,13 @@ int main(int argc, char **argv) const char *probe_name = "syscalls/sys_enter_nanosleep"; const char *file = "get_cgroup_id_kern.o"; int err, bytes, efd, prog_fd, pmu_fd; + int cgroup_fd, cgidmap_fd, pidmap_fd; struct perf_event_attr attr = {}; - int cgroup_fd, cgidmap_fd; struct bpf_object *obj; __u64 kcgid = 0, ucgid; + __u32 key = 0, pid; int exit_code = 1; char buf[256]; - __u32 key = 0; err = setup_cgroup_environment(); if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err, @@ -81,6 +81,14 @@ int main(int argc, char **argv) cgidmap_fd, errno)) goto close_prog; + pidmap_fd = bpf_find_map(__func__, obj, "pidmap"); + if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n", + pidmap_fd, errno)) + goto close_prog; + + pid = getpid(); + bpf_map_update_elem(pidmap_fd, &key, &pid, 0); + snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/id", probe_name); efd = open(buf, O_RDONLY, 0); From a5a16e43529b5040760ebf9bd9b056dd34861f93 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 Jun 2018 15:37:34 +0200 Subject: [PATCH 11/23] xsk: Fix umem fill/completion queue mmap on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2 on 32-bit: net/xdp/xsk.c:663: warning: integer constant is too large for ‘long’ type net/xdp/xsk.c:665: warning: integer constant is too large for ‘long’ type Add the missing "ULL" suffixes to the large XDP_UMEM_PGOFF_*_RING values to fix this. net/xdp/xsk.c:663: warning: comparison is always false due to limited range of data type net/xdp/xsk.c:665: warning: comparison is always false due to limited range of data type "unsigned long" is 32-bit on 32-bit systems, hence the offset is truncated, and can never be equal to any of the XDP_UMEM_PGOFF_*_RING values. Use loff_t (and the required cast) to fix this. Fixes: 423f38329d267969 ("xsk: add umem fill queue support and mmap") Fixes: fe2308328cd2f26e ("xsk: add umem completion queue support and mmap") Signed-off-by: Geert Uytterhoeven Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 4 ++-- net/xdp/xsk.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 1fa0e977ea8d..caed8b1614ff 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -63,8 +63,8 @@ struct xdp_statistics { /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -#define XDP_UMEM_PGOFF_FILL_RING 0x100000000 -#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL /* Rx/Tx descriptor */ struct xdp_desc { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c6ed2454f7ce..36919a254ba3 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -643,7 +643,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); struct xsk_queue *q = NULL; From c09290c5637692a9bfe7740e4c5e693efff12810 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Jun 2018 00:06:01 +0200 Subject: [PATCH 12/23] bpf, xdp: fix crash in xdp_umem_unaccount_pages syzkaller was able to trigger the following panic for AF_XDP: BUG: KASAN: null-ptr-deref in atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] BUG: KASAN: null-ptr-deref in atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] BUG: KASAN: null-ptr-deref in xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 Write of size 8 at addr 0000000000000060 by task syz-executor246/4527 CPU: 1 PID: 4527 Comm: syz-executor246 Not tainted 4.17.0+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 kasan_report_error mm/kasan/report.c:352 [inline] kasan_report.cold.7+0x6d/0x2fe mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278 atomic64_sub include/asm-generic/atomic-instrumented.h:144 [inline] atomic_long_sub include/asm-generic/atomic-long.h:199 [inline] xdp_umem_unaccount_pages.isra.4+0x3d/0x80 net/xdp/xdp_umem.c:135 xdp_umem_reg net/xdp/xdp_umem.c:334 [inline] xdp_umem_create+0xd6c/0x10f0 net/xdp/xdp_umem.c:349 xsk_setsockopt+0x443/0x550 net/xdp/xsk.c:531 __sys_setsockopt+0x1bd/0x390 net/socket.c:1935 __do_sys_setsockopt net/socket.c:1946 [inline] __se_sys_setsockopt net/socket.c:1943 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1943 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe In xdp_umem_reg() the call to xdp_umem_account_pages() passed with CAP_IPC_LOCK where we didn't need to end up charging rlimit on memlock for the current user and therefore umem->user continues to be NULL. Later on through fault injection syzkaller triggered a failure in either umem->pgs or umem->pages allocation such that we bail out and undo accounting in xdp_umem_unaccount_pages() where we eventually hit the panic since it tries to deref the umem->user. The code is pretty close to mm_account_pinned_pages() and mm_unaccount_pinned_pages() pair and potentially could reuse it even in a later cleanup, and it appears that the initial commit c0c77d8fb787 ("xsk: add user memory registration support sockopt") got this right while later follow-up introduced the bug via a49049ea2576 ("xsk: simplified umem setup"). Fixes: a49049ea2576 ("xsk: simplified umem setup") Reported-by: syzbot+979217770b09ebf5c407@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/xdp/xdp_umem.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7eb4948a38d2..b9ef487c4618 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -132,8 +132,10 @@ static void xdp_umem_unpin_pages(struct xdp_umem *umem) static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { - atomic_long_sub(umem->npgs, &umem->user->locked_vm); - free_uid(umem->user); + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } } static void xdp_umem_release(struct xdp_umem *umem) From 66e58e0ef80a56a1d7857b6ce121141563cdd93e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 Jun 2018 15:31:14 -0700 Subject: [PATCH 13/23] bpfilter: fix race in pipe access syzbot reported the following crash [ 338.293946] bpfilter: read fail -512 [ 338.304515] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 338.311863] general protection fault: 0000 [#1] SMP KASAN [ 338.344360] RIP: 0010:__vfs_write+0x4a6/0x960 [ 338.426363] Call Trace: [ 338.456967] __kernel_write+0x10c/0x380 [ 338.460928] __bpfilter_process_sockopt+0x1d8/0x35b [ 338.487103] bpfilter_mbox_request+0x4d/0xb0 [ 338.491492] bpfilter_ip_get_sockopt+0x6b/0x90 This can happen when multiple cpus trying to talk to user mode process via bpfilter_mbox_request(). One cpu grabs the mutex while another goes to sleep on the same mutex. Then former cpu sees that umh pipe is down and shuts down the pipes. Later cpu finally acquires the mutex and crashes on freed pipe. Fix the race by using info.pid as an indicator that umh and pipes are healthy and check it after acquiring the mutex. Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Reported-by: syzbot+7ade6c94abb2774c0fee@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index b13d058f8c34..09522573f611 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -24,17 +24,19 @@ static void shutdown_umh(struct umh_info *info) { struct task_struct *tsk; + if (!info->pid) + return; tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); if (tsk) force_sig(SIGKILL, tsk); fput(info->pipe_to_umh); fput(info->pipe_from_umh); + info->pid = 0; } static void __stop_umh(void) { - if (IS_ENABLED(CONFIG_INET) && - bpfilter_process_sockopt) { + if (IS_ENABLED(CONFIG_INET)) { bpfilter_process_sockopt = NULL; shutdown_umh(&info); } @@ -55,7 +57,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, struct mbox_reply reply; loff_t pos; ssize_t n; - int ret; + int ret = -EFAULT; req.is_set = is_set; req.pid = current->pid; @@ -63,6 +65,8 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.addr = (long)optval; req.len = optlen; mutex_lock(&bpfilter_lock); + if (!info.pid) + goto out; n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos); if (n != sizeof(req)) { pr_err("write fail %zd\n", n); From bde4975310eb1982bd0bbff673989052d92fd481 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 6 Jun 2018 18:45:22 +0000 Subject: [PATCH 14/23] net: stmmac: fix build failure due to missing COMMON_CLK dependency This patch fix the build failure on m68k; drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.o: In function `ipq806x_gmac_probe': dwmac-ipq806x.c:(.text+0xda): undefined reference to `clk_set_rate' drivers/net/ethernet/stmicro/stmmac/dwmac-rk.o: In function `rk_gmac_probe': dwmac-rk.c:(.text+0x1e58): undefined reference to `clk_set_rate' drivers/net/ethernet/stmicro/stmmac/dwmac-sti.o: In function `stid127_fix_retime_src': dwmac-sti.c:(.text+0xd8): undefined reference to `clk_set_rate' dwmac-sti.c:(.text+0x114): undefined reference to `clk_set_rate' drivers/net/ethernet/stmicro/stmmac/dwmac-sti.o:dwmac-sti.c:(.text+0x12c): more undefined references to `clk_set_rate' follow Lots of stmmac platform drivers need COMMON_CLK in their Kconfig depends. Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index e28c0d2c58e9..cb5b0f58c395 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -33,7 +33,7 @@ config DWMAC_DWC_QOS_ETH select PHYLIB select CRC32 select MII - depends on OF && HAS_DMA + depends on OF && COMMON_CLK && HAS_DMA help Support for chips using the snps,dwc-qos-ethernet.txt DT binding. @@ -57,7 +57,7 @@ config DWMAC_ANARION config DWMAC_IPQ806X tristate "QCA IPQ806x DWMAC support" default ARCH_QCOM - depends on OF && (ARCH_QCOM || COMPILE_TEST) + depends on OF && COMMON_CLK && (ARCH_QCOM || COMPILE_TEST) select MFD_SYSCON help Support for QCA IPQ806X DWMAC Ethernet. @@ -100,7 +100,7 @@ config DWMAC_OXNAS config DWMAC_ROCKCHIP tristate "Rockchip dwmac support" default ARCH_ROCKCHIP - depends on OF && (ARCH_ROCKCHIP || COMPILE_TEST) + depends on OF && COMMON_CLK && (ARCH_ROCKCHIP || COMPILE_TEST) select MFD_SYSCON help Support for Ethernet controller on Rockchip RK3288 SoC. @@ -123,7 +123,7 @@ config DWMAC_SOCFPGA config DWMAC_STI tristate "STi GMAC support" default ARCH_STI - depends on OF && (ARCH_STI || COMPILE_TEST) + depends on OF && COMMON_CLK && (ARCH_STI || COMPILE_TEST) select MFD_SYSCON ---help--- Support for ethernet controller on STi SOCs. @@ -147,7 +147,7 @@ config DWMAC_STM32 config DWMAC_SUNXI tristate "Allwinner GMAC support" default ARCH_SUNXI - depends on OF && (ARCH_SUNXI || COMPILE_TEST) + depends on OF && COMMON_CLK && (ARCH_SUNXI || COMPILE_TEST) ---help--- Support for Allwinner A20/A31 GMAC ethernet controllers. From 58d813afbe89658a5972747460a5fe19dec4dbcb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Jun 2018 17:54:37 -0400 Subject: [PATCH 15/23] net: aquantia: fix unsigned numvecs comparison with less than zero From: Colin Ian King This was originally mistakenly submitted to net-next. Resubmitting to net. The comparison of numvecs < 0 is always false because numvecs is a u32 and hence the error return from a failed call to pci_alloc_irq_vectores is never detected. Fix this by using the signed int ret to handle the error return and assign numvecs to err. Detected by CoverityScan, CID#1468650 ("Unsigned compared against 0") Fixes: a09bd81b5413 ("net: aquantia: Limit number of vectors to actually allocated irqs") Signed-off-by: Colin Ian King Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index a50e08bb4748..750007513f9d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -267,14 +267,13 @@ static int aq_pci_probe(struct pci_dev *pdev, numvecs = min(numvecs, num_online_cpus()); /*enable interrupts */ #if !AQ_CFG_FORCE_LEGACY_INT - numvecs = pci_alloc_irq_vectors(self->pdev, 1, numvecs, - PCI_IRQ_MSIX | PCI_IRQ_MSI | - PCI_IRQ_LEGACY); + err = pci_alloc_irq_vectors(self->pdev, 1, numvecs, + PCI_IRQ_MSIX | PCI_IRQ_MSI | + PCI_IRQ_LEGACY); - if (numvecs < 0) { - err = numvecs; + if (err < 0) goto err_hwinit; - } + numvecs = err; #endif self->irqvecs = numvecs; From 6310a882fbe0b87e0950222f2ac197ed92e11792 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 Jun 2018 10:58:25 +0800 Subject: [PATCH 16/23] net: fddi: fix a possible null-ptr-deref bp->SharedMemAddr is set to NULL while bp->SharedMemSize lesser-or-equal 0, then memset will trigger null-ptr-deref. fix it by replacing pci_alloc_consistent with dma_zalloc_coherent. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/fddi/skfp/skfddi.c | 55 +++++++++++++++++----------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/drivers/net/fddi/skfp/skfddi.c b/drivers/net/fddi/skfp/skfddi.c index 2414f1dc8ddd..72433f3efc74 100644 --- a/drivers/net/fddi/skfp/skfddi.c +++ b/drivers/net/fddi/skfp/skfddi.c @@ -297,11 +297,11 @@ static int skfp_init_one(struct pci_dev *pdev, return 0; err_out5: if (smc->os.SharedMemAddr) - pci_free_consistent(pdev, smc->os.SharedMemSize, - smc->os.SharedMemAddr, - smc->os.SharedMemDMA); - pci_free_consistent(pdev, MAX_FRAME_SIZE, - smc->os.LocalRxBuffer, smc->os.LocalRxBufferDMA); + dma_free_coherent(&pdev->dev, smc->os.SharedMemSize, + smc->os.SharedMemAddr, + smc->os.SharedMemDMA); + dma_free_coherent(&pdev->dev, MAX_FRAME_SIZE, + smc->os.LocalRxBuffer, smc->os.LocalRxBufferDMA); err_out4: free_netdev(dev); err_out3: @@ -328,17 +328,17 @@ static void skfp_remove_one(struct pci_dev *pdev) unregister_netdev(p); if (lp->os.SharedMemAddr) { - pci_free_consistent(&lp->os.pdev, - lp->os.SharedMemSize, - lp->os.SharedMemAddr, - lp->os.SharedMemDMA); + dma_free_coherent(&pdev->dev, + lp->os.SharedMemSize, + lp->os.SharedMemAddr, + lp->os.SharedMemDMA); lp->os.SharedMemAddr = NULL; } if (lp->os.LocalRxBuffer) { - pci_free_consistent(&lp->os.pdev, - MAX_FRAME_SIZE, - lp->os.LocalRxBuffer, - lp->os.LocalRxBufferDMA); + dma_free_coherent(&pdev->dev, + MAX_FRAME_SIZE, + lp->os.LocalRxBuffer, + lp->os.LocalRxBufferDMA); lp->os.LocalRxBuffer = NULL; } #ifdef MEM_MAPPED_IO @@ -394,7 +394,9 @@ static int skfp_driver_init(struct net_device *dev) spin_lock_init(&bp->DriverLock); // Allocate invalid frame - bp->LocalRxBuffer = pci_alloc_consistent(&bp->pdev, MAX_FRAME_SIZE, &bp->LocalRxBufferDMA); + bp->LocalRxBuffer = dma_alloc_coherent(&bp->pdev.dev, MAX_FRAME_SIZE, + &bp->LocalRxBufferDMA, + GFP_ATOMIC); if (!bp->LocalRxBuffer) { printk("could not allocate mem for "); printk("LocalRxBuffer: %d byte\n", MAX_FRAME_SIZE); @@ -407,23 +409,22 @@ static int skfp_driver_init(struct net_device *dev) if (bp->SharedMemSize > 0) { bp->SharedMemSize += 16; // for descriptor alignment - bp->SharedMemAddr = pci_alloc_consistent(&bp->pdev, - bp->SharedMemSize, - &bp->SharedMemDMA); + bp->SharedMemAddr = dma_zalloc_coherent(&bp->pdev.dev, + bp->SharedMemSize, + &bp->SharedMemDMA, + GFP_ATOMIC); if (!bp->SharedMemAddr) { printk("could not allocate mem for "); printk("hardware module: %ld byte\n", bp->SharedMemSize); goto fail; } - bp->SharedMemHeap = 0; // Nothing used yet. } else { bp->SharedMemAddr = NULL; - bp->SharedMemHeap = 0; - } // SharedMemSize > 0 + } - memset(bp->SharedMemAddr, 0, bp->SharedMemSize); + bp->SharedMemHeap = 0; card_stop(smc); // Reset adapter. @@ -442,15 +443,15 @@ static int skfp_driver_init(struct net_device *dev) fail: if (bp->SharedMemAddr) { - pci_free_consistent(&bp->pdev, - bp->SharedMemSize, - bp->SharedMemAddr, - bp->SharedMemDMA); + dma_free_coherent(&bp->pdev.dev, + bp->SharedMemSize, + bp->SharedMemAddr, + bp->SharedMemDMA); bp->SharedMemAddr = NULL; } if (bp->LocalRxBuffer) { - pci_free_consistent(&bp->pdev, MAX_FRAME_SIZE, - bp->LocalRxBuffer, bp->LocalRxBufferDMA); + dma_free_coherent(&bp->pdev.dev, MAX_FRAME_SIZE, + bp->LocalRxBuffer, bp->LocalRxBufferDMA); bp->LocalRxBuffer = NULL; } return err; From 8d499533e0bc02d44283dbdab03142b599b8ba16 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 8 Jun 2018 05:02:31 +0200 Subject: [PATCH 17/23] net/sched: act_simple: fix parsing of TCA_DEF_DATA use nla_strlcpy() to avoid copying data beyond the length of TCA_DEF_DATA netlink attribute, in case it is less than SIMP_MAX_DATA and it does not end with '\0' character. v2: fix errors in the commit message, thanks Hangbin Liu Fixes: fa1b1cff3d06 ("net_cls_act: Make act_simple use of netlink policy.") Signed-off-by: Davide Caratti Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/act_simple.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 9618b4a83cee..98c4afe7c15b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -53,22 +53,22 @@ static void tcf_simp_release(struct tc_action *a) kfree(d->tcfd_defdata); } -static int alloc_defdata(struct tcf_defact *d, char *defdata) +static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) { d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } -static void reset_policy(struct tcf_defact *d, char *defdata, +static void reset_policy(struct tcf_defact *d, const struct nlattr *defdata, struct tc_defact *p) { spin_lock_bh(&d->tcf_lock); d->tcf_action = p->action; memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); - strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); } @@ -87,7 +87,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct tcf_defact *d; bool exists = false; int ret = 0, err; - char *defdata; if (nla == NULL) return -EINVAL; @@ -110,8 +109,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; } - defdata = nla_data(tb[TCA_DEF_DATA]); - if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); @@ -119,7 +116,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return ret; d = to_defact(*a); - ret = alloc_defdata(d, defdata); + ret = alloc_defdata(d, tb[TCA_DEF_DATA]); if (ret < 0) { tcf_idr_release(*a, bind); return ret; @@ -133,7 +130,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (!ovr) return -EEXIST; - reset_policy(d, defdata, parm); + reset_policy(d, tb[TCA_DEF_DATA], parm); } if (ret == ACT_P_CREATED) From 49c2c3f246e2fc3009039e31a826333dcd0283cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 8 Jun 2018 09:15:24 +0200 Subject: [PATCH 18/23] cdc_ncm: avoid padding beyond end of skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4a0e3e989d66 ("cdc_ncm: Add support for moving NDP to end of NCM frame") added logic to reserve space for the NDP at the end of the NTB/skb. This reservation did not take the final alignment of the NDP into account, causing us to reserve too little space. Additionally the padding prior to NDP addition did not ensure there was enough space for the NDP. The NTB/skb with the NDP appended would then exceed the configured max size. This caused the final padding of the NTB to use a negative count, padding to almost INT_MAX, and resulting in: [60103.825970] BUG: unable to handle kernel paging request at ffff9641f2004000 [60103.825998] IP: __memset+0x24/0x30 [60103.826001] PGD a6a06067 P4D a6a06067 PUD 4f65a063 PMD 72003063 PTE 0 [60103.826013] Oops: 0002 [#1] SMP NOPTI [60103.826018] Modules linked in: (removed( [60103.826158] CPU: 0 PID: 5990 Comm: Chrome_DevTools Tainted: G O 4.14.0-3-amd64 #1 Debian 4.14.17-1 [60103.826162] Hardware name: LENOVO 20081 BIOS 41CN28WW(V2.04) 05/03/2012 [60103.826166] task: ffff964193484fc0 task.stack: ffffb2890137c000 [60103.826171] RIP: 0010:__memset+0x24/0x30 [60103.826174] RSP: 0000:ffff964316c03b68 EFLAGS: 00010216 [60103.826178] RAX: 0000000000000000 RBX: 00000000fffffffd RCX: 000000001ffa5000 [60103.826181] RDX: 0000000000000005 RSI: 0000000000000000 RDI: ffff9641f2003ffc [60103.826184] RBP: ffff964192f6c800 R08: 00000000304d434e R09: ffff9641f1d2c004 [60103.826187] R10: 0000000000000002 R11: 00000000000005ae R12: ffff9642e6957a80 [60103.826190] R13: ffff964282ff2ee8 R14: 000000000000000d R15: ffff9642e4843900 [60103.826194] FS: 00007f395aaf6700(0000) GS:ffff964316c00000(0000) knlGS:0000000000000000 [60103.826197] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [60103.826200] CR2: ffff9641f2004000 CR3: 0000000013b0c000 CR4: 00000000000006f0 [60103.826204] Call Trace: [60103.826212] [60103.826225] cdc_ncm_fill_tx_frame+0x5e3/0x740 [cdc_ncm] [60103.826236] cdc_ncm_tx_fixup+0x57/0x70 [cdc_ncm] [60103.826246] usbnet_start_xmit+0x5d/0x710 [usbnet] [60103.826254] ? netif_skb_features+0x119/0x250 [60103.826259] dev_hard_start_xmit+0xa1/0x200 [60103.826267] sch_direct_xmit+0xf2/0x1b0 [60103.826273] __dev_queue_xmit+0x5e3/0x7c0 [60103.826280] ? ip_finish_output2+0x263/0x3c0 [60103.826284] ip_finish_output2+0x263/0x3c0 [60103.826289] ? ip_output+0x6c/0xe0 [60103.826293] ip_output+0x6c/0xe0 [60103.826298] ? ip_forward_options+0x1a0/0x1a0 [60103.826303] tcp_transmit_skb+0x516/0x9b0 [60103.826309] tcp_write_xmit+0x1aa/0xee0 [60103.826313] ? sch_direct_xmit+0x71/0x1b0 [60103.826318] tcp_tasklet_func+0x177/0x180 [60103.826325] tasklet_action+0x5f/0x110 [60103.826332] __do_softirq+0xde/0x2b3 [60103.826337] irq_exit+0xae/0xb0 [60103.826342] do_IRQ+0x81/0xd0 [60103.826347] common_interrupt+0x98/0x98 [60103.826351] [60103.826355] RIP: 0033:0x7f397bdf2282 [60103.826358] RSP: 002b:00007f395aaf57d8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff6e [60103.826362] RAX: 0000000000000000 RBX: 00002f07bc6d0900 RCX: 00007f39752d7fe7 [60103.826365] RDX: 0000000000000022 RSI: 0000000000000147 RDI: 00002f07baea02c0 [60103.826368] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 [60103.826371] R10: 00000000ffffffff R11: 0000000000000000 R12: 00002f07baea02c0 [60103.826373] R13: 00002f07bba227a0 R14: 00002f07bc6d090c R15: 0000000000000000 [60103.826377] Code: 90 90 90 90 90 90 90 0f 1f 44 00 00 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 [60103.826442] RIP: __memset+0x24/0x30 RSP: ffff964316c03b68 [60103.826444] CR2: ffff9641f2004000 Commit e1069bbfcf3b ("net: cdc_ncm: Reduce memory use when kernel memory low") made this bug much more likely to trigger by reducing the NTB size under memory pressure. Link: https://bugs.debian.org/893393 Reported-by: Горбешко Богдан Reported-and-tested-by: Dennis Wassenberg Cc: Enrico Mioso Fixes: 4a0e3e989d66 ("cdc_ncm: Add support for moving NDP to end of NCM frame") Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 90d07ed224d5..b0e8b9613054 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1124,7 +1124,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) * accordingly. Otherwise, we should check here. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) - delayed_ndp_size = ctx->max_ndp_size; + delayed_ndp_size = ALIGN(ctx->max_ndp_size, ctx->tx_ndp_modulus); else delayed_ndp_size = 0; @@ -1285,7 +1285,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) /* If requested, put NDP at end of frame. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; - cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size); + cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size); nth16->wNdpIndex = cpu_to_le16(skb_out->len); skb_put_data(skb_out, ctx->delayed_ndp16, ctx->max_ndp_size); From 6c206b20092a3623184cff9470dba75d21507874 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Jun 2018 11:35:40 +0200 Subject: [PATCH 19/23] udp: fix rx queue len reported by diag and proc interface After commit 6b229cf77d68 ("udp: add batching to udp_rmem_release()") the sk_rmem_alloc field does not measure exactly anymore the receive queue length, because we batch the rmem release. The issue is really apparent only after commit 0d4a6608f68c ("udp: do rmem bulk free even if the rx sk queue is empty"): the user space can easily check for an empty socket with not-0 queue length reported by the 'ss' tool or the procfs interface. We need to use a custom UDP helper to report the correct queue length, taking into account the forward allocation deficit. Reported-by: trevor.francis@46labs.com Fixes: 6b229cf77d68 ("UDP: add batching to udp_rmem_release()") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/transp_v6.h | 11 +++++++++-- include/net/udp.h | 5 +++++ net/ipv4/udp.c | 2 +- net/ipv4/udp_diag.c | 2 +- net/ipv6/datagram.c | 6 +++--- net/ipv6/udp.c | 3 ++- 6 files changed, 21 insertions(+), 8 deletions(-) diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index c4f5caaf3778..f6a3543e5247 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -45,8 +45,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc); -void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, - __u16 srcp, __u16 destp, int bucket); +void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int rqueue, int bucket); +static inline void +ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, + __u16 destp, int bucket) +{ + __ip6_dgram_sock_seq_show(seq, sp, srcp, destp, sk_rmem_alloc_get(sp), + bucket); +} #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) diff --git a/include/net/udp.h b/include/net/udp.h index 7ba0ed252c52..b1ea8b0f5e6a 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -247,6 +247,11 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, return htons((((u64) hash * (max - min)) >> 32) + min); } +static inline int udp_rqueue_get(struct sock *sk) +{ + return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3365362cac88..9bb27df4dac5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2772,7 +2772,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), + udp_rqueue_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index d0390d844ac8..d9ad986c7b2c 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -163,7 +163,7 @@ static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { - r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index a02ad100f0d7..2ee08b6a86a4 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1019,8 +1019,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); -void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, - __u16 srcp, __u16 destp, int bucket) +void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, + __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; @@ -1036,7 +1036,7 @@ void ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), - sk_rmem_alloc_get(sp), + rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 164afd31aebf..e6645cae403e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1523,7 +1523,8 @@ int udp6_seq_show(struct seq_file *seq, void *v) struct inet_sock *inet = inet_sk(v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); - ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); + __ip6_dgram_sock_seq_show(seq, v, srcp, destp, + udp_rqueue_get(v), bucket); } return 0; } From 873aca2ee86e533665a73292c4e308ded1e9bafe Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 8 Jun 2018 15:11:47 +0200 Subject: [PATCH 20/23] net: bridge: Fix locking in br_fdb_find_port() Callers of br_fdb_find() need to hold the hash lock, which br_fdb_find_port() doesn't do. However, since br_fdb_find_port() is not doing any actual FDB manipulation, the hash lock is not really needed at all. So convert to br_fdb_find_rcu(), surrounded by rcu_read_lock() / _unlock() pair. The device pointer copied from inside the FDB entry is then kept alive by the RTNL lock, which br_fdb_find_port() asserts. Fixes: 4d4fd36126d6 ("net: bridge: Publish bridge accessor functions") Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_fdb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b19e3104afd6..502f66349530 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -135,9 +135,11 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, return NULL; br = netdev_priv(br_dev); - f = br_fdb_find(br, addr, vid); + rcu_read_lock(); + f = br_fdb_find_rcu(br, addr, vid); if (f && f->dst) dev = f->dst->dev; + rcu_read_unlock(); return dev; } From 6d8c50dcb029872b298eea68cc6209c866fd3e14 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 7 Jun 2018 13:39:49 -0700 Subject: [PATCH 21/23] socket: close race condition between sock_close() and sockfs_setattr() fchownat() doesn't even hold refcnt of fd until it figures out fd is really needed (otherwise is ignored) and releases it after it resolves the path. This means sock_close() could race with sockfs_setattr(), which leads to a NULL pointer dereference since typically we set sock->sk to NULL in ->release(). As pointed out by Al, this is unique to sockfs. So we can fix this in socket layer by acquiring inode_lock in sock_close() and checking against NULL in sockfs_setattr(). sock_release() is called in many places, only the sock_close() path matters here. And fortunately, this should not affect normal sock_close() as it is only called when the last fd refcnt is gone. It only affects sock_close() with a parallel sockfs_setattr() in progress, which is not common. Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Reported-by: shankarapailoor Cc: Tetsuo Handa Cc: Lorenzo Colitti Cc: Al Viro Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/socket.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/net/socket.c b/net/socket.c index af57d85bcb48..8a109012608a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -541,7 +541,10 @@ static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr) if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); - sock->sk->sk_uid = iattr->ia_uid; + if (sock->sk) + sock->sk->sk_uid = iattr->ia_uid; + else + err = -ENOENT; } return err; @@ -590,12 +593,16 @@ EXPORT_SYMBOL(sock_alloc); * an inode not a file. */ -void sock_release(struct socket *sock) +static void __sock_release(struct socket *sock, struct inode *inode) { if (sock->ops) { struct module *owner = sock->ops->owner; + if (inode) + inode_lock(inode); sock->ops->release(sock); + if (inode) + inode_unlock(inode); sock->ops = NULL; module_put(owner); } @@ -609,6 +616,11 @@ void sock_release(struct socket *sock) } sock->file = NULL; } + +void sock_release(struct socket *sock) +{ + __sock_release(sock, NULL); +} EXPORT_SYMBOL(sock_release); void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) @@ -1171,7 +1183,7 @@ static int sock_mmap(struct file *file, struct vm_area_struct *vma) static int sock_close(struct inode *inode, struct file *filp) { - sock_release(SOCKET_I(inode)); + __sock_release(SOCKET_I(inode), inode); return 0; } From b718e8c8f4f5920aaddc2e52d5e32f494c91129c Mon Sep 17 00:00:00 2001 From: Alvaro Gamez Machado Date: Fri, 8 Jun 2018 12:23:39 +0200 Subject: [PATCH 22/23] net: phy: dp83822: use BMCR_ANENABLE instead of BMSR_ANEGCAPABLE for DP83620 DP83620 register set is compatible with the DP83848, but it also supports 100base-FX. When the hardware is configured such as that fiber mode is enabled, autonegotiation is not possible. The chip, however, doesn't expose this information via BMSR_ANEGCAPABLE. Instead, this bit is always set high, even if the particular hardware configuration makes it so that auto negotiation is not possible [1]. Under these circumstances, the phy subsystem keeps trying for autonegotiation to happen, without success. Hereby, we inspect BMCR_ANENABLE bit after genphy_config_init, which on reset is set to 0 when auto negotiation is disabled, and so we use this value instead of BMSR_ANEGCAPABLE. [1] https://e2e.ti.com/support/interface/ethernet/f/903/p/697165/2571170 Signed-off-by: Alvaro Gamez Machado Signed-off-by: David S. Miller --- drivers/net/phy/dp83848.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c index cd09c3af2117..6e8e42361fd5 100644 --- a/drivers/net/phy/dp83848.c +++ b/drivers/net/phy/dp83848.c @@ -74,6 +74,25 @@ static int dp83848_config_intr(struct phy_device *phydev) return phy_write(phydev, DP83848_MICR, control); } +static int dp83848_config_init(struct phy_device *phydev) +{ + int err; + int val; + + err = genphy_config_init(phydev); + if (err < 0) + return err; + + /* DP83620 always reports Auto Negotiation Ability on BMSR. Instead, + * we check initial value of BMCR Auto negotiation enable bit + */ + val = phy_read(phydev, MII_BMCR); + if (!(val & BMCR_ANENABLE)) + phydev->autoneg = AUTONEG_DISABLE; + + return 0; +} + static struct mdio_device_id __maybe_unused dp83848_tbl[] = { { TI_DP83848C_PHY_ID, 0xfffffff0 }, { NS_DP83848C_PHY_ID, 0xfffffff0 }, @@ -83,7 +102,7 @@ static struct mdio_device_id __maybe_unused dp83848_tbl[] = { }; MODULE_DEVICE_TABLE(mdio, dp83848_tbl); -#define DP83848_PHY_DRIVER(_id, _name) \ +#define DP83848_PHY_DRIVER(_id, _name, _config_init) \ { \ .phy_id = _id, \ .phy_id_mask = 0xfffffff0, \ @@ -92,7 +111,7 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl); .flags = PHY_HAS_INTERRUPT, \ \ .soft_reset = genphy_soft_reset, \ - .config_init = genphy_config_init, \ + .config_init = _config_init, \ .suspend = genphy_suspend, \ .resume = genphy_resume, \ \ @@ -102,10 +121,14 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl); } static struct phy_driver dp83848_driver[] = { - DP83848_PHY_DRIVER(TI_DP83848C_PHY_ID, "TI DP83848C 10/100 Mbps PHY"), - DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY"), - DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY"), - DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY"), + DP83848_PHY_DRIVER(TI_DP83848C_PHY_ID, "TI DP83848C 10/100 Mbps PHY", + genphy_config_init), + DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY", + genphy_config_init), + DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY", + dp83848_config_init), + DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY", + genphy_config_init), }; module_phy_driver(dp83848_driver); From 867f816badc01e6da655028810d468c9f935b37c Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 8 Jun 2018 22:47:10 -0400 Subject: [PATCH 23/23] tcp: limit sk_rcvlowat by the maximum receive buffer The user-provided value to setsockopt(SO_RCVLOWAT) can be larger than the maximum possible receive buffer. Such values mute POLLIN signals on the socket which can stall progress on the socket. Limit the user-provided value to half of the maximum receive buffer, i.e., half of sk_rcvbuf when the receive buffer size is set by the user, or otherwise half of sysctl_tcp_rmem[2]. Fixes: d1361840f8c5 ("tcp: fix SO_RCVLOWAT and RCVBUF autotuning") Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2741953adaba..141acd92e58a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1694,6 +1694,13 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { + int cap; + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + cap = sk->sk_rcvbuf >> 1; + else + cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; + val = min(val, cap); sk->sk_rcvlowat = val ? : 1; /* Check if we need to signal EPOLLIN right now */ @@ -1702,12 +1709,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - /* val comes from user space and might be close to INT_MAX */ val <<= 1; - if (val < 0) - val = INT_MAX; - - val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); if (val > sk->sk_rcvbuf) { sk->sk_rcvbuf = val; tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);