From dc4501ff287547dea7ca10f1c580c741291a8760 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 23 Dec 2018 21:45:56 -0800 Subject: [PATCH 01/57] tipc: fix a double free in tipc_enable_bearer() bearer_disable() already calls kfree_rcu() to free struct tipc_bearer, we don't need to call kfree() again. Fixes: cb30a63384bc ("tipc: refactor function tipc_enable_bearer()") Reported-by: syzbot+b981acf1fb240c0c128b@syzkaller.appspotmail.com Cc: Ying Xue Cc: Jon Maloy Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/bearer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index fb2c0d8f359f..d27f30a9a01d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -319,7 +319,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); - kfree(b); errstr = "failed to create discoverer"; goto rejected; } From f0fb9b288d0a7e9cc324ae362e2dfd2cc2217ded Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 24 Dec 2018 10:30:17 -0600 Subject: [PATCH 02/57] ipv6/route: Add a missing check on proc_dointvec While flushing the cache via ipv6_sysctl_rtcache_flush(), the call to proc_dointvec() may fail. The fix adds a check that returns the error, on failure. Signed-off-by: Aditya Pakki Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 194bc162866d..a94e0b02a8ac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5054,12 +5054,16 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, { struct net *net; int delay; + int ret; if (!write) return -EINVAL; net = (struct net *)ctl->extra1; delay = net->ipv6.sysctl.flush_delay; - proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (ret) + return ret; + fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } From ca19fcb6285bfce1601c073bf4b9d2942e2df8d9 Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 24 Dec 2018 15:21:21 -0600 Subject: [PATCH 03/57] net: chelsio: Add a missing check on cudg_get_buffer cudbg_collect_hw_sched() could fail when the function cudg_get_buffer() returns an error. The fix adds a check to the latter function returning error on failure Signed-off-by: Aditya Pakki Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 7c49681407ad..127b1f624413 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -1229,6 +1229,10 @@ int cudbg_collect_hw_sched(struct cudbg_init *pdbg_init, rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_hw_sched), &temp_buff); + + if (rc) + return rc; + hw_sched_buff = (struct cudbg_hw_sched *)temp_buff.data; hw_sched_buff->map = t4_read_reg(padap, TP_TX_MOD_QUEUE_REQ_MAP_A); hw_sched_buff->mode = TIMERMODE_G(t4_read_reg(padap, TP_MOD_CONFIG_A)); From 26fd962bde0b15e54234fe762d86bc0349df1de4 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 01:56:14 -0600 Subject: [PATCH 04/57] niu: fix missing checks of niu_pci_eeprom_read niu_pci_eeprom_read() may fail, so we should check its return value before using the read data. Signed-off-by: Kangjie Lu Acked-by: Shannon Nelson Signed-off-by: David S. 
Miller --- drivers/net/ethernet/sun/niu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 9319d84bf49f..d84501441edd 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -8100,6 +8100,8 @@ static int niu_pci_vpd_scan_props(struct niu *np, u32 start, u32 end) start += 3; prop_len = niu_pci_eeprom_read(np, start + 4); + if (prop_len < 0) + return prop_len; err = niu_pci_vpd_get_propname(np, start + 5, namebuf, 64); if (err < 0) return err; @@ -8144,8 +8146,12 @@ static int niu_pci_vpd_scan_props(struct niu *np, u32 start, u32 end) netif_printk(np, probe, KERN_DEBUG, np->dev, "VPD_SCAN: Reading in property [%s] len[%d]\n", namebuf, prop_len); - for (i = 0; i < prop_len; i++) - *prop_buf++ = niu_pci_eeprom_read(np, off + i); + for (i = 0; i < prop_len; i++) { + err = niu_pci_eeprom_read(np, off + i); + if (err >= 0) + *prop_buf = err; + ++prop_buf; + } } start += len; From 2d822f2dbab7f4c820f72eb8570aacf3f35855bd Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 20:55:37 -0600 Subject: [PATCH 05/57] net: (cpts) fix a missing check of clk_prepare clk_prepare() could fail, so let's check its status, and if it fails, return its error code upstream. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 054f78295d1d..2a9ba4acd7fa 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -590,7 +590,9 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, return ERR_CAST(cpts->refclk); } - clk_prepare(cpts->refclk); + ret = clk_prepare(cpts->refclk); + if (ret) + return ERR_PTR(ret); cpts->cc.read = cpts_systim_read; cpts->cc.mask = CLOCKSOURCE_MASK(32); From f86a3b83833e7cfe558ca4d70b64ebc48903efec Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 20:57:14 -0600 Subject: [PATCH 06/57] net: stmicro: fix a missing check of clk_prepare clk_prepare() could fail, so let's check its status, and if it fails, return its error code upstream. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index d07520fb969e..62ccbd47c1db 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -59,7 +59,9 @@ static int sun7i_gmac_init(struct platform_device *pdev, void *priv) gmac->clk_enabled = 1; } else { clk_set_rate(gmac->tx_clk, SUN7I_GMAC_MII_RATE); - clk_prepare(gmac->tx_clk); + ret = clk_prepare(gmac->tx_clk); + if (ret) + return ret; } return 0; From e49505f7255be8ced695919c08a29bf2c3d79616 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 22:08:18 -0600 Subject: [PATCH 07/57] net: dsa: bcm_sf2: Propagate error value from mdio_write Both bcm_sf2_sw_indir_rw and mdiobus_write_nested could fail, so let's return their error codes upstream. Signed-off-by: Kangjie Lu Signed-off-by: David S. 
Miller --- drivers/net/dsa/bcm_sf2.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index aa4a1f5206f1..361fbde76654 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -303,11 +303,10 @@ static int bcm_sf2_sw_mdio_write(struct mii_bus *bus, int addr, int regnum, * send them to our master MDIO bus controller */ if (addr == BRCM_PSEUDO_PHY_ADDR && priv->indir_phy_mask & BIT(addr)) - bcm_sf2_sw_indir_rw(priv, 0, addr, regnum, val); + return bcm_sf2_sw_indir_rw(priv, 0, addr, regnum, val); else - mdiobus_write_nested(priv->master_mii_bus, addr, regnum, val); - - return 0; + return mdiobus_write_nested(priv->master_mii_bus, addr, + regnum, val); } static irqreturn_t bcm_sf2_switch_0_isr(int irq, void *dev_id) From ff07d48d7bc0974d4f96a85a4df14564fb09f1ef Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 22:23:19 -0600 Subject: [PATCH 08/57] atl1e: checking the status of atl1e_write_phy_reg atl1e_write_phy_reg() could fail. The fix issues an error message when it fails. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 9dc6da039a6d..3164aad29bcf 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -473,7 +473,9 @@ static void atl1e_mdio_write(struct net_device *netdev, int phy_id, { struct atl1e_adapter *adapter = netdev_priv(netdev); - atl1e_write_phy_reg(&adapter->hw, reg_num & MDIO_REG_ADDR_MASK, val); + if (atl1e_write_phy_reg(&adapter->hw, + reg_num & MDIO_REG_ADDR_MASK, val)) + netdev_err(netdev, "write phy register failed\n"); } static int atl1e_mii_ioctl(struct net_device *netdev, From 46273cf7e009231d2b6bc10a926e82b8928a9fb2 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Wed, 26 Dec 2018 00:09:04 -0600 Subject: [PATCH 09/57] tipc: fix a missing check of genlmsg_put genlmsg_put could fail. The fix inserts a check of its return value, and if it fails, returns -EMSGSIZE. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 21f6ccc89401..40f5cae623a7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -904,6 +904,8 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); + if (!hdr) + return -EMSGSIZE; nest = nla_nest_start(args, TIPC_NLA_SOCK); if (!nest) { From 92ee77d148bf06d8c52664be4d1b862583fd5c0e Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Wed, 26 Dec 2018 00:31:08 -0600 Subject: [PATCH 10/57] net: marvell: fix a missing check of acpi_match_device When acpi_match_device fails, its return value is NULL. Directly using the return value without a check may result in a NULL-pointer dereference. The fix checks if acpi_match_device fails, and if so, returns -EINVAL. Signed-off-by: Kangjie Lu Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index f1dab0b55769..c9444346475b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -5255,6 +5255,8 @@ static int mvpp2_probe(struct platform_device *pdev) if (has_acpi_companion(&pdev->dev)) { acpi_id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev); + if (!acpi_id) + return -EINVAL; priv->hw_version = (unsigned long)acpi_id->driver_data; } else { priv->hw_version = From 40752b3eae29f8ca2378e978a02bd6dbeeb06d16 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 26 Dec 2018 16:28:30 +0800 Subject: [PATCH 11/57] net/wan/fsl_ucc_hdlc: Avoid double free in ucc_hdlc_probe() This patch fixes potential double frees if register_hdlc_device() fails. Signed-off-by: Wen Yang Reviewed-by: Peng Hao CC: Zhao Qiang CC: "David S. Miller" CC: netdev@vger.kernel.org CC: linuxppc-dev@lists.ozlabs.org CC: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/wan/fsl_ucc_hdlc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 7a42336c8af8..839fa7715709 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1180,7 +1180,6 @@ static int ucc_hdlc_probe(struct platform_device *pdev) if (register_hdlc_device(dev)) { ret = -ENOBUFS; pr_err("ucc_hdlc: unable to register hdlc device\n"); - free_netdev(dev); goto free_dev; } From d8de01b763e0d8b3b418d3606d26f203983b6637 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 26 Dec 2018 06:35:23 -0600 Subject: [PATCH 12/57] phy.h: fix obvious errors in doc and kerneldoc content 1) note that gianfar_phy.c was removed years ago 2) fix obvious copy and paste error in regular doc 3) change regular doc into kerneldoc for phy_modes() Signed-off-by: Robert P. J. Day Signed-off-by: David S. Miller --- include/linux/phy.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/phy.h b/include/linux/phy.h index da039f211c22..3b051f761450 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1,6 +1,6 @@ /* * Framework and drivers for configuring and reading different PHYs - * Based on code in sungem_phy.c and gianfar_phy.c + * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * @@ -110,9 +110,9 @@ typedef enum { * @speeds: buffer to store supported speeds in. * @size: size of speeds buffer. * - * Description: Returns the number of supported speeds, and - * fills the speeds * buffer with the supported speeds. If speeds buffer is - * too small to contain * all currently supported speeds, will return as + * Description: Returns the number of supported speeds, and fills + * the speeds buffer with the supported speeds. If speeds buffer is + * too small to contain all currently supported speeds, will return as * many speeds as can fit. 
*/ unsigned int phy_supported_speeds(struct phy_device *phy, @@ -120,7 +120,10 @@ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int size); /** - * It maps 'enum phy_interface_t' found in include/linux/phy.h + * phy_modes - map phy_interface_t enum to device tree binding of phy-mode + * @interface: enum phy_interface_t value + * + * Description: maps 'enum phy_interface_t' defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get phy interface from device tree. */ From a3c9311f62b4943228ae90f769775dd3bcbfa7c0 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 27 Dec 2018 16:10:59 -0500 Subject: [PATCH 13/57] include/linux/phy/phy.h: fix minor kerneldoc errors Correct two minor kerneldoc errors: 1) missing reference to @mode in struct phy_ops 2) obsolete reference to @init_data in struct_phy_attrs, removed in dbc98635e0d42f0e62ea92813df1e0e4c90f8375 Signed-off-by: Robert P. J. Day Signed-off-by: David S. Miller --- include/linux/phy/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 03b319f89a34..66d1560f1a26 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -69,6 +69,7 @@ struct phy_ops { /** * struct phy_attrs - represents phy attributes * @bus_width: Data path width implemented by PHY + * @mode: PHY mode */ struct phy_attrs { u32 bus_width; @@ -80,7 +81,6 @@ struct phy_attrs { * @dev: phy device * @id: id of the phy device * @ops: function pointers for performing phy operations - * @init_data: list of PHY consumers (non-dt only) * @mutex: mutex to protect phy_ops * @init_count: used to protect when the PHY is used by multiple consumers * @power_count: used to protect when the PHY is used by multiple consumers From eb8950861c1bfd3eecc8f6faad213e3bca0dc395 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Fri, 21 Dec 2018 00:46:23 -0600 Subject: [PATCH 14/57] netfilter: nf_tables: fix a missing check of nla_put_failure If nla_nest_start() may fail. The fix checks its return value and goes to nla_put_failure if it fails. Signed-off-by: Kangjie Lu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fec814dace5a..2b0a93300dd7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5727,6 +5727,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); + if (!nest) + goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) goto nla_put_failure; From c78e7818f16f687389174c4569243abbec8dc68f Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Fri, 28 Dec 2018 01:24:42 +0100 Subject: [PATCH 15/57] netfilter: nf_conncount: replace CONNCOUNT_LOCK_SLOTS with CONNCOUNT_SLOTS Most of the time these were the same value anyway, but when CONFIG_LOCKDEP was enabled we would use a smaller number of locks to reduce overhead. Unfortunately having two values is confusing and not worth the complexity. This fixes a bug where tree_gc_worker() would only GC up to CONNCOUNT_LOCK_SLOTS trees which meant when CONFIG_LOCKDEP was enabled not all trees would be GCed by tree_gc_worker(). 
Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Signed-off-by: Florian Westphal Signed-off-by: Shawn Bohrer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 9cd180bda092..3271a4e00500 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -33,12 +33,6 @@ #define CONNCOUNT_SLOTS 256U -#ifdef CONFIG_LOCKDEP -#define CONNCOUNT_LOCK_SLOTS 8U -#else -#define CONNCOUNT_LOCK_SLOTS 256U -#endif - #define CONNCOUNT_GC_MAX_NODES 8 #define MAX_KEYLEN 5 @@ -60,7 +54,7 @@ struct nf_conncount_rb { struct rcu_head rcu_head; }; -static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; +static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; struct nf_conncount_data { unsigned int keylen; @@ -353,7 +347,7 @@ insert_tree(struct net *net, unsigned int count = 0, gc_count = 0; bool node_found = false; - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + spin_lock_bh(&nf_conncount_locks[hash]); parent = NULL; rbnode = &(root->rb_node); @@ -430,7 +424,7 @@ insert_tree(struct net *net, rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + spin_unlock_bh(&nf_conncount_locks[hash]); return count; } @@ -499,7 +493,7 @@ static void tree_gc_worker(struct work_struct *work) struct rb_node *node; unsigned int tree, next_tree, gc_count = 0; - tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; + tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; rcu_read_lock(); @@ -621,10 +615,7 @@ static int __init nf_conncount_modinit(void) { int i; - BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS); - BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0); - - for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i) + for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", From 4cd273bb91b3001f623f516ec726c49754571b1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:43 +0100 Subject: [PATCH 16/57] netfilter: nf_conncount: don't skip eviction when age is negative age is signed integer, so result can be negative when the timestamps have a large delta. In this case we want to discard the entry. Instead of using age >= 2 || age < 0, just make it unsigned. 
Fixes: b36e4523d4d56 ("netfilter: nf_conncount: fix garbage collection confirm race") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 3271a4e00500..8bb4ed85c262 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -155,7 +155,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); - __s32 age; + u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) From f7fcc98dfc2d136722007fec0debbed761679b94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:44 +0100 Subject: [PATCH 17/57] netfilter: nf_conncount: split gc in two phases The lockless workqueue garbage collector can race with packet path garbage collector to delete list nodes, as it calls tree_nodes_free() with the addresses of nodes that might have been free'd already from another cpu. To fix this, split gc into two phases. One phase to perform gc on the connections: From a locking perspective, this is the same as count_tree(): we hold rcu lock, but we do not change the tree, we only change the nodes' contents. The second phase acquires the tree lock and reaps empty nodes. This avoids a race condition of the garbage collection vs. packet path: If a node has been free'd already, the second phase won't find it anymore. This second phase is, from locking perspective, same as insert_tree(). The former only modifies nodes (list content, count), latter modifies the tree itself (rb_erase or rb_insert). Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 8bb4ed85c262..753132e4afa8 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -500,16 +500,32 @@ static void tree_gc_worker(struct work_struct *work) for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); if (nf_conncount_gc_list(data->net, &rbconn->list)) - gc_nodes[gc_count++] = rbconn; + gc_count++; } rcu_read_unlock(); spin_lock_bh(&nf_conncount_locks[tree]); + if (gc_count < ARRAY_SIZE(gc_nodes)) + goto next; /* do not bother */ - if (gc_count) { - tree_nodes_free(root, gc_nodes, gc_count); + gc_count = 0; + node = rb_first(root); + while (node != NULL) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + node = rb_next(node); + + if (rbconn->list.count > 0) + continue; + + gc_nodes[gc_count++] = rbconn; + if (gc_count >= ARRAY_SIZE(gc_nodes)) { + tree_nodes_free(root, gc_nodes, gc_count); + gc_count = 0; + } } + tree_nodes_free(root, gc_nodes, gc_count); +next: clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; From e8cfb372b38a1b8979aa7f7631fb5e7b11c3793c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:45 +0100 Subject: [PATCH 18/57] netfilter: nf_conncount: restart search when nodes have been erased Shawn Bohrer reported a following crash: |RIP: 0010:rb_erase+0xae/0x360 [..] 
Call Trace: nf_conncount_destroy+0x59/0xc0 [nf_conncount] cleanup_match+0x45/0x70 [ip_tables] ... Shawn tracked this down to bogus 'parent' pointer: Problem is that when we insert a new node, then there is a chance that the 'parent' that we found was also passed to tree_nodes_free() (because that node was empty) for erase+free. Instead of trying to be clever and detect when this happens, restart the search if we have evicted one or more nodes. To prevent frequent restarts, do not perform gc on the second round. Also, unconditionally schedule the gc worker. The condition gc_count > ARRAY_SIZE(gc_nodes)) cannot be true unless tree grows very large, as the height of the tree will be low even with hundreds of nodes present. Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reported-by: Shawn Bohrer Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 753132e4afa8..0a83c694a8f1 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -346,9 +346,10 @@ insert_tree(struct net *net, struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; bool node_found = false; + bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); - +restart: parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { @@ -381,21 +382,16 @@ insert_tree(struct net *net, if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; - if (nf_conncount_gc_list(net, &rbconn->list)) + if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). - */ - - if (gc_count >= ARRAY_SIZE(gc_nodes)) - schedule_gc_worker(data, hash); + schedule_gc_worker(data, hash); + gc_count = 0; + do_gc = false; + goto restart; } if (node_found) From df4a902509766897f7371fdfa4c3bf8bc321b55d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:46 +0100 Subject: [PATCH 19/57] netfilter: nf_conncount: merge lookup and add functions 'lookup' is always followed by 'add'. Merge both and make the list-walk part of nf_conncount_add(). This also avoids one unneeded unlock/re-lock pair. Extra care needs to be taken in count_tree, as we only hold rcu read lock, i.e. we can only insert to an existing tree node after acquiring its lock and making sure it has a nonzero count. As a zero count should be rare, just fall back to insert_tree() (which acquires tree lock). This issue and its solution were pointed out by Shawn Bohrer during patch review. 
Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 18 +-- net/netfilter/nf_conncount.c | 146 ++++++++++----------- net/netfilter/nft_connlimit.c | 14 +- 3 files changed, 72 insertions(+), 106 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 4b2b2baf8ab4..aa66775c15f4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -5,12 +5,6 @@ struct nf_conncount_data; -enum nf_conncount_list_add { - NF_CONNCOUNT_ADDED, /* list add was ok */ - NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ - NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ -}; - struct nf_conncount_list { spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ @@ -29,18 +23,12 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit); +int nf_conncount_add(struct net *net, struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); void nf_conncount_list_init(struct nf_conncount_list *list); -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 0a83c694a8f1..ce7f7d1212a6 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -83,38 +83,6 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - struct nf_conncount_tuple *conn; - - if (WARN_ON_ONCE(list->count > INT_MAX)) - return NF_CONNCOUNT_ERR; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return NF_CONNCOUNT_ERR; - - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - conn->dead = false; - spin_lock_bh(&list->list_lock); - if (list->dead == true) { - kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_SKIP; - } - list_add_tail(&conn->node, &list->head); - list->count++; - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_ADDED; -} -EXPORT_SYMBOL_GPL(nf_conncount_add); - static void __conn_free(struct rcu_head *h) { struct nf_conncount_tuple *conn; @@ -177,11 +145,10 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } -void nf_conncount_lookup(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +static int __nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -189,9 +156,6 @@ void nf_conncount_lookup(struct net *net, unsigned int collect = 0; bool 
free_entry = false; - /* best effort only */ - *addit = tuple ? true : false; - /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) @@ -201,21 +165,19 @@ void nf_conncount_lookup(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (!tuple) - continue; - if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - *addit = false; - } else if (PTR_ERR(found) == -ENOENT) + return 0; /* already exists */ + } else { collect++; + } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); - if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -223,7 +185,8 @@ void nf_conncount_lookup(struct net *net, * * Attempt to avoid a re-add in this case. */ - *addit = false; + nf_ct_put(found_ct); + return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -237,8 +200,38 @@ void nf_conncount_lookup(struct net *net, nf_ct_put(found_ct); } + + if (WARN_ON_ONCE(list->count > INT_MAX)) + return -EOVERFLOW; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return -ENOMEM; + + conn->tuple = *tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + list_add_tail(&conn->node, &list->head); + list->count++; + return 0; } -EXPORT_SYMBOL_GPL(nf_conncount_lookup); + +int nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + int ret; + + /* check the saved connections */ + spin_lock_bh(&list->list_lock); + ret = __nf_conncount_add(net, list, tuple, zone); + spin_unlock_bh(&list->list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -339,13 +332,11 @@ insert_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; - bool node_found = false; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -363,20 +354,15 @@ insert_tree(struct net *net, } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { - /* unlikely: other cpu added node already */ - node_found = true; - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { + int ret; + + ret = nf_conncount_add(net, &rbconn->list, tuple, zone); + if (ret) count = 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { + else count = rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ - node_found = false; - } - break; + tree_nodes_free(root, gc_nodes, gc_count); + goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) @@ -394,9 +380,6 @@ insert_tree(struct net *net, goto restart; } - if (node_found) - goto out_unlock; - /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -431,7 +414,6 @@ count_tree(struct net *net, const struct 
nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; @@ -444,7 +426,6 @@ count_tree(struct net *net, parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; - bool addit; rbconn = rb_entry(parent, struct nf_conncount_rb, node); @@ -454,24 +435,29 @@ count_tree(struct net *net, } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { - /* same source network -> be counted! */ - nf_conncount_lookup(net, &rbconn->list, tuple, zone, - &addit); + int ret; - if (!addit) + if (!tuple) { + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { - return 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { - return rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ + spin_lock_bh(&rbconn->list.list_lock); + /* Node might be about to be free'd. + * We need to defer to insert_tree() in this case. + */ + if (rbconn->list.count == 0) { + spin_unlock_bh(&rbconn->list.list_lock); break; } + + /* same source network -> be counted! */ + ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + spin_unlock_bh(&rbconn->list.list_lock); + if (ret) + return 0; /* hotdrop */ + else + return rbconn->list.count; } } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index b90d96ba4a12..af1497ab9464 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -30,7 +30,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int count; - bool addit; tuple_ptr = &tuple; @@ -44,19 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, - &addit); - count = priv->list.count; - - if (!addit) - goto out; - - if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { + if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; return; } - count++; -out: + + count = priv->list.count; if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; From 2f971a8f425545da52ca0e6bee81f5b1ea0ccc5f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Dec 2018 01:24:47 +0100 Subject: [PATCH 20/57] netfilter: nf_conncount: move all list iterations under spinlock Two CPUs may race to remove a connection from the list, the existing conn->dead will result in a use-after-free. Use the per-list spinlock to protect list iterations. As all accesses to the list now happen while holding the per-list lock, we no longer need to delay free operations with rcu. Joint work with Florian. 
Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 46 ++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index ce7f7d1212a6..d0fd195b19a8 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -43,8 +43,6 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; - bool dead; - struct rcu_head rcu_head; }; struct nf_conncount_rb { @@ -83,36 +81,21 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -static void __conn_free(struct rcu_head *h) -{ - struct nf_conncount_tuple *conn; - - conn = container_of(h, struct nf_conncount_tuple, rcu_head); - kmem_cache_free(conncount_conn_cachep, conn); -} - static bool conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { bool free_entry = false; - spin_lock_bh(&list->list_lock); - - if (conn->dead) { - spin_unlock_bh(&list->list_lock); - return free_entry; - } + lockdep_assert_held(&list->list_lock); list->count--; - conn->dead = true; - list_del_rcu(&conn->node); + list_del(&conn->node); if (list->count == 0) { list->dead = true; free_entry = true; } - spin_unlock_bh(&list->list_lock); - call_rcu(&conn->rcu_head, __conn_free); + kmem_cache_free(conncount_conn_cachep, conn); return free_entry; } @@ -242,7 +225,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list) } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -/* Return true if the list is empty */ +/* Return true if the list is empty. Must be called with BH disabled. 
*/ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { @@ -253,12 +236,18 @@ bool nf_conncount_gc_list(struct net *net, bool free_entry = false; bool ret = false; + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock(&list->list_lock)) + return false; + list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn, &free_entry); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) { - if (free_entry) + if (free_entry) { + spin_unlock(&list->list_lock); return true; + } collected++; } continue; @@ -271,23 +260,24 @@ bool nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - if (conn_free(list, conn)) + if (conn_free(list, conn)) { + spin_unlock(&list->list_lock); return true; + } collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) - return false; + break; } - spin_lock_bh(&list->list_lock); if (!list->count) { list->dead = true; ret = true; } - spin_unlock_bh(&list->list_lock); + spin_unlock(&list->list_lock); return ret; } @@ -478,6 +468,7 @@ static void tree_gc_worker(struct work_struct *work) tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; + local_bh_disable(); rcu_read_lock(); for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); @@ -485,6 +476,9 @@ static void tree_gc_worker(struct work_struct *work) gc_count++; } rcu_read_unlock(); + local_bh_enable(); + + cond_resched(); spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count < ARRAY_SIZE(gc_nodes)) From c80f10bc973af2ace6b1414724eeff61eaa71837 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Dec 2018 01:24:48 +0100 Subject: [PATCH 21/57] netfilter: nf_conncount: speculative garbage collection on empty lists Instead of removing a empty list node that might be reintroduced soon thereafter, tentatively place the empty list node on the list passed to tree_nodes_free(), then re-check if the list is empty again before erasing it from the tree. 
[ Florian: rebase on top of pending nf_conncount fixes ] Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 1 - net/netfilter/nf_conncount.c | 47 +++++++--------------- 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index aa66775c15f4..f32fc8289473 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -9,7 +9,6 @@ struct nf_conncount_list { spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ - bool dead; }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index d0fd195b19a8..f0b05dfebc6e 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -81,27 +81,20 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -static bool conn_free(struct nf_conncount_list *list, +static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - bool free_entry = false; - lockdep_assert_held(&list->list_lock); list->count--; list_del(&conn->node); - if (list->count == 0) { - list->dead = true; - free_entry = true; - } kmem_cache_free(conncount_conn_cachep, conn); - return free_entry; } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn, bool *free_entry) + struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -121,7 +114,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, */ age = a - b; if (conn->cpu == cpu || age >= 2) { - *free_entry = conn_free(list, conn); + conn_free(list, conn); return ERR_PTR(-ENOENT); } @@ -137,14 +130,13 @@ static int __nf_conncount_add(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; - bool free_entry = false; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -221,7 +213,6 @@ void nf_conncount_list_init(struct nf_conncount_list *list) spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; - list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -233,7 +224,6 @@ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; - bool free_entry = false; bool ret = false; /* don't bother if other cpu is already doing GC */ @@ -241,15 +231,10 @@ bool nf_conncount_gc_list(struct net *net, return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) { - if (free_entry) { - spin_unlock(&list->list_lock); - return true; - } + if (PTR_ERR(found) == -ENOENT) collected++; - } 
continue; } @@ -260,10 +245,7 @@ bool nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - if (conn_free(list, conn)) { - spin_unlock(&list->list_lock); - return true; - } + conn_free(list, conn); collected++; continue; } @@ -273,10 +255,8 @@ bool nf_conncount_gc_list(struct net *net, break; } - if (!list->count) { - list->dead = true; + if (!list->count) ret = true; - } spin_unlock(&list->list_lock); return ret; @@ -291,6 +271,7 @@ static void __tree_nodes_free(struct rcu_head *h) kmem_cache_free(conncount_rb_cachep, rbconn); } +/* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -300,8 +281,10 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); + if (!rbconn->list.count) { + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } spin_unlock(&rbconn->list.list_lock); } } @@ -318,7 +301,6 @@ insert_tree(struct net *net, struct rb_root *root, unsigned int hash, const u32 *key, - u8 keylen, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -327,6 +309,7 @@ insert_tree(struct net *net, struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; + u8 keylen = data->keylen; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -454,7 +437,7 @@ count_tree(struct net *net, if (!tuple) return 0; - return insert_tree(net, data, root, hash, key, keylen, tuple, zone); + return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) From a007232066f6839d6f256bab21e825d968f1a163 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:49 +0100 Subject: [PATCH 22/57] netfilter: nf_conncount: fix argument order to find_next_bit Size and 'next bit' were swapped, this bug could cause worker to reschedule itself even if system was idle. Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index f0b05dfebc6e..7554c56b2e63 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -488,7 +488,7 @@ static void tree_gc_worker(struct work_struct *work) clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; - next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); + next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); if (next_tree < CONNCOUNT_SLOTS) { data->gc_tree = next_tree; From f9fc54d313fab2834f44f516459cdc8ac91d797f Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 26 Dec 2018 19:51:46 +0800 Subject: [PATCH 23/57] ethtool: check the return value of get_regs_len The return type for get_regs_len in struct ethtool_ops is int, the hns3 driver may return error when failing to get the regs len by sending cmd to firmware. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- net/core/ethtool.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d05402868575..158264f7cfaf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -793,8 +793,13 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops->get_regs_len) - info.regdump_len = ops->get_regs_len(dev); + if (ops->get_regs_len) { + int ret = ops->get_regs_len(dev); + + if (ret > 0) + info.regdump_len = ret; + } + if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); @@ -1337,6 +1342,9 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) return -EFAULT; reglen = ops->get_regs_len(dev); + if (reglen <= 0) + return reglen; + if (regs.len > reglen) regs.len = reglen; From 7418e6520f22a2e35815122fa5a53d5bbfa2c10f Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 26 Dec 2018 22:09:34 +0800 Subject: [PATCH 24/57] isdn: hisax: hfc_pci: Fix a possible concurrency use-after-free bug in HFCPCI_l1hw() In drivers/isdn/hisax/hfc_pci.c, the functions hfcpci_interrupt() and HFCPCI_l1hw() may be concurrently executed. HFCPCI_l1hw() line 1173: if (!cs->tx_skb) hfcpci_interrupt() line 942: spin_lock_irqsave(); line 1066: dev_kfree_skb_irq(cs->tx_skb); Thus, a possible concurrency use-after-free bug may occur in HFCPCI_l1hw(). To fix these bugs, the calls to spin_lock_irqsave() and spin_unlock_irqrestore() are added in HFCPCI_l1hw(), to protect the access to cs->tx_skb. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/isdn/hisax/hfc_pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c index 5b719b561860..81dd465afcf4 100644 --- a/drivers/isdn/hisax/hfc_pci.c +++ b/drivers/isdn/hisax/hfc_pci.c @@ -1169,11 +1169,13 @@ HFCPCI_l1hw(struct PStack *st, int pr, void *arg) if (cs->debug & L1_DEB_LAPD) debugl1(cs, "-> PH_REQUEST_PULL"); #endif + spin_lock_irqsave(&cs->lock, flags); if (!cs->tx_skb) { test_and_clear_bit(FLG_L1_PULL_REQ, &st->l1.Flags); st->l1.l1l2(st, PH_PULL | CONFIRM, NULL); } else test_and_set_bit(FLG_L1_PULL_REQ, &st->l1.Flags); + spin_unlock_irqrestore(&cs->lock, flags); break; case (HW_RESET | REQUEST): spin_lock_irqsave(&cs->lock, flags); From 0d9c9a238faf925823bde866182c663b6d734f2e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 27 Dec 2018 18:29:09 -0600 Subject: [PATCH 25/57] fsl/fman: Use GFP_ATOMIC in {memac,tgec}_add_hash_mac_address() These functions are called from atomic context: [ 9.150239] BUG: sleeping function called from invalid context at /home/scott/git/linux/mm/slab.h:421 [ 9.158159] in_atomic(): 1, irqs_disabled(): 0, pid: 4432, name: ip [ 9.163128] CPU: 8 PID: 4432 Comm: ip Not tainted 4.20.0-rc2-00169-g63d86876f324 #29 [ 9.163130] Call Trace: [ 9.170701] [c0000002e899a980] [c0000000009c1068] .dump_stack+0xa8/0xec (unreliable) [ 9.177140] [c0000002e899aa10] [c00000000007a7b4] .___might_sleep+0x138/0x164 [ 9.184440] [c0000002e899aa80] [c0000000001d5bac] .kmem_cache_alloc_trace+0x238/0x30c [ 9.191216] [c0000002e899ab40] [c00000000065ea1c] .memac_add_hash_mac_address+0x104/0x198 [ 9.199464] [c0000002e899abd0] [c00000000065a788] .set_multi+0x1c8/0x218 [ 9.206242] [c0000002e899ac80] [c0000000006615ec] .dpaa_set_rx_mode+0xdc/0x17c [ 9.213544] [c0000002e899ad00] [c00000000083d2b0] .__dev_set_rx_mode+0x80/0xd4 [ 9.219535] [c0000002e899ad90] [c00000000083d334] .dev_set_rx_mode+0x30/0x54 [ 9.225271] [c0000002e899ae10] 
[c00000000083d4a0] .__dev_open+0x148/0x1c8 [ 9.230751] [c0000002e899aeb0] [c00000000083d934] .__dev_change_flags+0x19c/0x1e0 [ 9.230755] [c0000002e899af60] [c00000000083d9a4] .dev_change_flags+0x2c/0x80 [ 9.242752] [c0000002e899aff0] [c0000000008554ec] .do_setlink+0x350/0xf08 [ 9.248228] [c0000002e899b170] [c000000000857ad0] .rtnl_newlink+0x588/0x7e0 [ 9.253965] [c0000002e899b740] [c000000000852424] .rtnetlink_rcv_msg+0x3e0/0x498 [ 9.261440] [c0000002e899b820] [c000000000884790] .netlink_rcv_skb+0x134/0x14c [ 9.267607] [c0000002e899b8e0] [c000000000851840] .rtnetlink_rcv+0x18/0x2c [ 9.274558] [c0000002e899b950] [c000000000883c8c] .netlink_unicast+0x214/0x318 [ 9.281163] [c0000002e899ba00] [c000000000884220] .netlink_sendmsg+0x348/0x444 [ 9.287076] [c0000002e899bae0] [c00000000080d13c] .sock_sendmsg+0x2c/0x54 [ 9.287080] [c0000002e899bb50] [c0000000008106c0] .___sys_sendmsg+0x2d0/0x2d8 [ 9.298375] [c0000002e899bd30] [c000000000811a80] .__sys_sendmsg+0x5c/0xb0 [ 9.303939] [c0000002e899be20] [c0000000000006b0] system_call+0x60/0x6c Signed-off-by: Scott Wood Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fman/fman_memac.c | 2 +- drivers/net/ethernet/freescale/fman/fman_tgec.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c b/drivers/net/ethernet/freescale/fman/fman_memac.c index bc6eb30aa20f..41c6fa200e74 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.c +++ b/drivers/net/ethernet/freescale/fman/fman_memac.c @@ -928,7 +928,7 @@ int memac_add_hash_mac_address(struct fman_mac *memac, enet_addr_t *eth_addr) hash = get_mac_addr_hash_code(addr) & HASH_CTRL_ADDR_MASK; /* Create element to be added to the driver hash table */ - hash_entry = kmalloc(sizeof(*hash_entry), GFP_KERNEL); + hash_entry = kmalloc(sizeof(*hash_entry), GFP_ATOMIC); if (!hash_entry) return -ENOMEM; hash_entry->addr = addr; diff --git a/drivers/net/ethernet/freescale/fman/fman_tgec.c b/drivers/net/ethernet/freescale/fman/fman_tgec.c index 40705938eecc..f75b9c11b2d2 100644 --- a/drivers/net/ethernet/freescale/fman/fman_tgec.c +++ b/drivers/net/ethernet/freescale/fman/fman_tgec.c @@ -553,7 +553,7 @@ int tgec_add_hash_mac_address(struct fman_mac *tgec, enet_addr_t *eth_addr) hash = (crc >> TGEC_HASH_MCAST_SHIFT) & TGEC_HASH_ADR_MSK; /* Create element to be added to the driver hash table */ - hash_entry = kmalloc(sizeof(*hash_entry), GFP_KERNEL); + hash_entry = kmalloc(sizeof(*hash_entry), GFP_ATOMIC); if (!hash_entry) return -ENOMEM; hash_entry->addr = addr; From f989d03ef25df3fc26d3ea0fe7c19c9830577166 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 30 Dec 2018 14:33:20 +0200 Subject: [PATCH 26/57] net: rtnetlink: address is mandatory for rtnl_fdb_get We must have an address to lookup otherwise we'll derefence a null pointer in the ndo_fdb_get callbacks. CC: Roopa Prabhu CC: David Ahern Reported-by: syzbot+017b1f61c82a1c3e7efd@syzkaller.appspotmail.com Fixes: 5b2f94b27622 ("net: rtnetlink: support for fdb get") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 48f61885fd6f..5ea1bed08ede 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4104,6 +4104,11 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (!addr) { + NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); + return -EINVAL; + } + if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { From 178fe94405bffbd1acd83b6ff3b40211185ae9c9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 28 Dec 2018 23:28:21 +0100 Subject: [PATCH 27/57] net/ipv6: Fix a test against 'ipv6_find_idev()' return value 'ipv6_find_idev()' returns NULL on error, not an error pointer. Update the test accordingly and return -ENOBUFS, as already done in 'addrconf_add_dev()', if NULL is returned. Fixes: ("ipv6: allow userspace to add IFA_F_OPTIMISTIC addresses") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e471f1cf9..8eeec6eb2bd3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4736,8 +4736,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (!idev) + return -ENOBUFS; if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; From 58075ff523af85002bfeace07304d57c59251605 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 29 Dec 2018 14:45:23 +0800 Subject: [PATCH 28/57] ipv4: fib_rules: Fix possible infinite loop in fib_empty_table gcc warn this: net/ipv4/fib_rules.c:203 fib_empty_table() warn: always true condition '(id <= 4294967295) => (0-u32max <= u32max)' 'id' is u32, which always not greater than RT_TABLE_MAX (0xFFFFFFFF), So add a check to break while wrap around. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f8eb78d042a4..cfec3af54c8d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -198,11 +198,15 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) static struct fib_table *fib_empty_table(struct net *net) { - u32 id; + u32 id = 1; - for (id = 1; id <= RT_TABLE_MAX; id++) + while (1) { if (!fib_get_table(net, id)) return fib_new_table(net, id); + + if (id++ == RT_TABLE_MAX) + break; + } return NULL; } From 7f334a7e1ae113212e39aafba51352ea9ab8c9f9 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Sat, 29 Dec 2018 14:07:55 -0500 Subject: [PATCH 29/57] ipv6: fix typo in net/ipv6/reassembly.c Signed-off-by: Su Yanjun Signed-off-by: David S. 
Miller --- net/ipv6/reassembly.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a5bb59ee50ac..36a3d8dc61f5 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -210,7 +210,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (next && next->ip_defrag_offset < end) goto discard_fq; - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + /* Note : skb->ip_defrag_offset and skb->sk share the same location */ dev = skb->dev; if (dev) fq->iif = dev->ifindex; From c433570458e49bccea5c551df628d058b3526289 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Dec 2018 13:56:36 -0800 Subject: [PATCH 30/57] ax25: fix a use-after-free in ax25_fillin_cb() There are multiple issues here: 1. After freeing dev->ax25_ptr, we need to set it to NULL otherwise we may use a dangling pointer. 2. There is a race between ax25_setsockopt() and device notifier as reported by syzbot. Close it by holding RTNL lock. 3. We need to test if dev->ax25_ptr is NULL before using it. Reported-and-tested-by: syzbot+ae6bb869cbed29b29040@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 11 +++++++++-- net/ax25/ax25_dev.c | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c603d33d5410..5d01edf8d819 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -653,15 +653,22 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } - dev = dev_get_by_name(&init_net, devname); + rtnl_lock(); + dev = __dev_get_by_name(&init_net, devname); if (!dev) { + rtnl_unlock(); res = -ENODEV; break; } ax25->ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25->ax25_dev) { + rtnl_unlock(); + res = -ENODEV; + break; + } ax25_fillin_cb(ax25, ax25->ax25_dev); - dev_put(dev); + rtnl_unlock(); break; default: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 9a3a301e1e2f..d92195cd7834 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -116,6 +116,7 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; dev_put(dev); kfree(ax25_dev); return; @@ -125,6 +126,7 @@ void ax25_dev_device_down(struct net_device *dev) if (s->next == ax25_dev) { s->next = ax25_dev->next; spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; dev_put(dev); kfree(ax25_dev); return; From d5c7c745f254c6cb98b3b3f15fe789b8bd770c72 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Dec 2018 13:56:37 -0800 Subject: [PATCH 31/57] net/wan: fix a double free in x25_asy_open_tty() When x25_asy_open() fails, it already cleans up by itself, so its caller doesn't need to free the memory again. It seems we still have to call x25_asy_free() to clear the SLF_INUSE bit, so just set these pointers to NULL after kfree(). Reported-and-tested-by: syzbot+5e5e969e525129229052@syzkaller.appspotmail.com Fixes: 3b780bed3138 ("x25_asy: Free x25_asy on x25_asy_open() failure.") Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- drivers/net/wan/x25_asy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index 1098263ab862..46c3d983b7b7 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -485,8 +485,10 @@ static int x25_asy_open(struct net_device *dev) /* Cleanup */ kfree(sl->xbuff); + sl->xbuff = NULL; noxbuff: kfree(sl->rbuff); + sl->rbuff = NULL; norbuff: return -ENOMEM; } From 7314f5480f3e37e570104dc5e0f28823ef849e72 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Dec 2018 13:56:38 -0800 Subject: [PATCH 32/57] netrom: fix locking in nr_find_socket() nr_find_socket(), nr_find_peer() and nr_find_listener() lock the sock after finding it in the global list. However, the call path requires BH disabled for the sock lock consistently. Actually the locking is unnecessary at this point, we can just hold the sock refcnt to make sure it is not gone after we unlock the global list, and lock it later only when needed. Reported-and-tested-by: syzbot+f621cda8b7e598908efa@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 03f37c4e64fe..1d3144d19903 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -153,7 +153,7 @@ static struct sock *nr_find_listener(ax25_address *addr) sk_for_each(s, &nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { - bh_lock_sock(s); + sock_hold(s); goto found; } s = NULL; @@ -174,7 +174,7 @@ static struct sock *nr_find_socket(unsigned char index, unsigned char id) struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { - bh_lock_sock(s); + sock_hold(s); goto found; } } @@ -198,7 +198,7 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id, if (nr->your_index == index && nr->your_id == id && !ax25cmp(&nr->dest_addr, dest)) { - bh_lock_sock(s); + sock_hold(s); goto found; } } @@ -224,7 +224,7 @@ static unsigned short nr_find_next_circuit(void) if (i != 0 && j != 0) { if ((sk=nr_find_socket(i, j)) == NULL) break; - bh_unlock_sock(sk); + sock_put(sk); } id++; @@ -920,6 +920,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) } if (sk != NULL) { + bh_lock_sock(sk); skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) @@ -929,6 +930,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) ret = nr_process_rx_frame(sk, skb); bh_unlock_sock(sk); + sock_put(sk); return ret; } @@ -960,10 +962,12 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) (make = nr_make_new(sk)) == NULL) { nr_transmit_refusal(skb, 0); if (sk) - bh_unlock_sock(sk); + sock_put(sk); return 0; } + bh_lock_sock(sk); + window = skb->data[20]; skb->sk = make; @@ -1016,6 +1020,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) sk->sk_data_ready(sk); bh_unlock_sock(sk); + sock_put(sk); nr_insert_socket(make); From 3bd8264511035dc97c902f03fa9f1d07f95f8f62 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 30 Dec 2018 13:16:12 +0100 Subject: [PATCH 33/57] r8169: fix WoL device wakeup enable In rtl8169_runtime_resume() we configure WoL but don't set the device to wakeup-enabled. This prevents PME generation once the cable is re-plugged. Fix this by moving the call to device_set_wakeup_enable() to __rtl8169_set_wol(). 
Fixes: 433f9d0ddcc6 ("r8169: improve saved_wolopts handling") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 99bc3de906e2..298930d39b79 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -1477,6 +1477,8 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) } RTL_W8(tp, Cfg9346, Cfg9346_Lock); + + device_set_wakeup_enable(tp_to_dev(tp), wolopts); } static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) @@ -1498,8 +1500,6 @@ static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) rtl_unlock_work(tp); - device_set_wakeup_enable(d, tp->saved_wolopts); - pm_runtime_put_noidle(d); return 0; From 756af9c642329d54f048bac2a62f829b391f6944 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Mon, 31 Dec 2018 15:43:01 -0600 Subject: [PATCH 34/57] ibmveth: fix DMA unmap error in ibmveth_xmit_start error path Commit 33a48ab105a7 ("ibmveth: Fix DMA unmap error") fixed an issue in the normal code path of ibmveth_xmit_start() that was originally introduced by Commit 6e8ab30ec677 ("ibmveth: Add scatter-gather support"). This original fix missed the error path where dma_unmap_page is wrongly called on the header portion in descs[0] which was mapped with dma_map_single. As a result a failure to DMA map any of the frags results in a dmesg warning when CONFIG_DMA_API_DEBUG is enabled. ------------[ cut here ]------------ DMA-API: ibmveth 30000002: device driver frees DMA memory with wrong function [device address=0x000000000a430000] [size=172 bytes] [mapped as page] [unmapped as single] WARNING: CPU: 1 PID: 8426 at kernel/dma/debug.c:1085 check_unmap+0x4fc/0xe10 ... ... DMA-API: Mapped at: ibmveth_start_xmit+0x30c/0xb60 dev_hard_start_xmit+0x100/0x450 sch_direct_xmit+0x224/0x490 __qdisc_run+0x20c/0x980 __dev_queue_xmit+0x1bc/0xf20 This fixes the API misuse by unmapping descs[0] with dma_unmap_single. Fixes: 6e8ab30ec677 ("ibmveth: Add scatter-gather support") Signed-off-by: Tyrel Datwyler Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmveth.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index a4681780a55d..098d8764c0ea 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1171,11 +1171,15 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb, map_failed_frags: last = i+1; - for (i = 0; i < last; i++) + for (i = 1; i < last; i++) dma_unmap_page(&adapter->vdev->dev, descs[i].fields.address, descs[i].fields.flags_len & IBMVETH_BUF_LEN_MASK, DMA_TO_DEVICE); + dma_unmap_single(&adapter->vdev->dev, + descs[0].fields.address, + descs[0].fields.flags_len & IBMVETH_BUF_LEN_MASK, + DMA_TO_DEVICE); map_failed: if (!firmware_has_feature(FW_FEATURE_CMO)) netdev_err(netdev, "tx: unable to map xmit buffer\n"); From 8b6b25cf93b73e8e399adc55b0ffb9db32b881e0 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Fri, 21 Dec 2018 17:35:11 +0800 Subject: [PATCH 35/57] selftests/bpf: fix error printing in test_devmap() As a simple fix, just print the correct map type. 
Signed-off-by: Xiaozhou Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 9c79ee017df3..e2b9eee37187 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -510,7 +510,7 @@ static void test_devmap(int task, void *data) fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), 2, 0); if (fd < 0) { - printf("Failed to create arraymap '%s'!\n", strerror(errno)); + printf("Failed to create devmap '%s'!\n", strerror(errno)); exit(1); } From 3a0ed3e9619738067214871e9cb826fa23b2ddb9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 27 Dec 2018 18:55:09 -0800 Subject: [PATCH 36/57] sock: Make sock->sk_stamp thread-safe Al Viro mentioned (Message-ID <20170626041334.GZ10672@ZenIV.linux.org.uk>) that there is probably a race condition lurking in accesses of sk_stamp on 32-bit machines. sock->sk_stamp is of type ktime_t which is always an s64. On a 32 bit architecture, we might run into situations of unsafe access as the access to the field becomes non atomic. Use seqlocks for synchronization. This allows us to avoid using spinlocks for readers as readers do not need mutual exclusion. Another approach to solve this is to require sk_lock for all modifications of the timestamps. The current approach allows for timestamps to have their own lock: sk_stamp_lock. This allows for the patch to not compete with already existing critical sections, and side effects are limited to the paths in the patch. The addition of the new field maintains the data locality optimizations from commit 9115e8cd2a0c ("net: reorganize struct sock for better data locality") Note that all the instances of the sk_stamp accesses are either through the ioctl or the syscall recvmsg. Signed-off-by: Deepa Dinamani Signed-off-by: David S. 
Miller --- include/net/sock.h | 38 +++++++++++++++++++++++++++++++++++--- net/compat.c | 15 +++++++++------ net/core/sock.c | 15 ++++++++++----- net/sunrpc/svcsock.c | 2 +- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index a6235c286ef9..2b229f7be8eb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -298,6 +298,7 @@ struct sock_common { * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received + * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications @@ -474,6 +475,9 @@ struct sock { const struct cred *sk_peer_cred; long sk_rcvtimeo; ktime_t sk_stamp; +#if BITS_PER_LONG==32 + seqlock_t sk_stamp_seq; +#endif u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; @@ -2297,6 +2301,34 @@ static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) atomic_add(segs, &sk->sk_drops); } +static inline ktime_t sock_read_timestamp(struct sock *sk) +{ +#if BITS_PER_LONG==32 + unsigned int seq; + ktime_t kt; + + do { + seq = read_seqbegin(&sk->sk_stamp_seq); + kt = sk->sk_stamp; + } while (read_seqretry(&sk->sk_stamp_seq, seq)); + + return kt; +#else + return sk->sk_stamp; +#endif +} + +static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) +{ +#if BITS_PER_LONG==32 + write_seqlock(&sk->sk_stamp_seq); + sk->sk_stamp = kt; + write_sequnlock(&sk->sk_stamp_seq); +#else + sk->sk_stamp = kt; +#endif +} + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, @@ -2321,7 +2353,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else - sk->sk_stamp = kt; + sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) __sock_recv_wifi_status(msg, sk, skb); @@ -2342,9 +2374,9 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) - sk->sk_stamp = skb->tstamp; + sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) - sk->sk_stamp = 0; + sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); diff --git a/net/compat.c b/net/compat.c index 47a614b370cd..d1f3a8a0b3ef 100644 --- a/net/compat.c +++ b/net/compat.c @@ -467,12 +467,14 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) ctv = (struct compat_timeval __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); + if (tv.tv_sec == -1) return err; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } err = 0; if (put_user(tv.tv_sec, &ctv->tv_sec) || @@ -494,12 +496,13 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta ctv = (struct compat_timespec __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts 
= ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return err; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - ts = ktime_to_timespec(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec(kt); } err = 0; if (put_user(ts.tv_sec, &ctv->tv_sec) || diff --git a/net/core/sock.c b/net/core/sock.c index f00902c532cc..6aa2e7e0b4fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2751,6 +2751,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL @@ -2850,12 +2853,13 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); if (tv.tv_sec == -1) return -ENOENT; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; } @@ -2866,11 +2870,12 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) struct timespec ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); ts = ktime_to_timespec(sk->sk_stamp); } return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 986f3ed7d1a2..b7e67310ec37 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -549,7 +549,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - svsk->sk_sk->sk_stamp = skb->tstamp; + sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ len = skb->len; From 2b96547223e3a036efb51733dbf37a1e4be92068 Mon Sep 17 00:00:00 2001 From: yupeng Date: Sat, 29 Dec 2018 21:46:38 -0800 Subject: [PATCH 37/57] add document for TCP OFO, PAWS and skip ACK counters add document and examples for below counters: TcpExtTCPOFOQueue TcpExtTCPOFODrop TcpExtTCPOFOMerge TcpExtPAWSActive TcpExtPAWSEstab TcpExtTCPACKSkippedSynRecv TcpExtTCPACKSkippedPAWS TcpExtTCPACKSkippedSeq TcpExtTCPACKSkippedFinWait2 TcpExtTCPACKSkippedTimeWait TcpExtTCPACKSkippedChallenge Signed-off-by: yupeng Signed-off-by: David S. Miller --- Documentation/networking/snmp_counter.rst | 240 +++++++++++++++++++++- 1 file changed, 239 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index f8eb77ddbd44..b0dfdaaca512 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -571,7 +571,97 @@ duplicate packet is received. * TcpExtTCPDSACKOfoRecv The TCP stack receives a DSACK, which indicate an out of order -duplciate packet is received. +duplicate packet is received. + +TCP out of order +=============== +* TcpExtTCPOFOQueue +The TCP layer receives an out of order packet and has enough memory +to queue it. 
+ +* TcpExtTCPOFODrop +The TCP layer receives an out of order packet but doesn't have enough +memory, so drops it. Such packets won't be counted into +TcpExtTCPOFOQueue. + +* TcpExtTCPOFOMerge +The received out of order packet has an overlap with the previous +packet. The overlapping part will be dropped. All of TcpExtTCPOFOMerge +packets will also be counted into TcpExtTCPOFOQueue. + +TCP PAWS +======= +PAWS (Protection Against Wrapped Sequence numbers) is an algorithm +which is used to drop old packets. It depends on the TCP +timestamps. For detailed information, please refer to the `timestamp wiki`_ +and the `RFC of PAWS`_. + +.. _RFC of PAWS: https://tools.ietf.org/html/rfc1323#page-17 +.. _timestamp wiki: https://en.wikipedia.org/wiki/Transmission_Control_Protocol#TCP_timestamps + +* TcpExtPAWSActive +Packets are dropped by PAWS in Syn-Sent status. + +* TcpExtPAWSEstab +Packets are dropped by PAWS in any status other than Syn-Sent. + +TCP ACK skip +=========== +In some scenarios, the kernel would avoid sending duplicate ACKs too +frequently. Please find more details in the tcp_invalid_ratelimit +section of the `sysctl document`_. When the kernel decides to skip an ACK +due to tcp_invalid_ratelimit, the kernel would update one of the below +counters to indicate in which scenario the ACK was skipped. The ACK +would only be skipped if the received packet is either a SYN packet or +it has no data. + +.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt + +* TcpExtTCPACKSkippedSynRecv +The ACK is skipped in Syn-Recv status. The Syn-Recv status means the +TCP stack receives a SYN and replies SYN+ACK. Now the TCP stack is +waiting for an ACK. Generally, the TCP stack doesn't need to send an ACK +in the Syn-Recv status. But in several scenarios, the TCP stack needs +to send an ACK. E.g., the TCP stack receives the same SYN packet +repeatedly, the received packet does not pass the PAWS check, or the +received packet sequence number is out of window. In these scenarios, +the TCP stack needs to send an ACK. If the ACK sending frequency is higher than +tcp_invalid_ratelimit allows, the TCP stack will skip sending ACK and +increase TcpExtTCPACKSkippedSynRecv. + + +* TcpExtTCPACKSkippedPAWS +The ACK is skipped because the PAWS (Protect Against Wrapped Sequence +numbers) check fails. If the PAWS check fails in Syn-Recv, Fin-Wait-2 +or Time-Wait statuses, the skipped ACK would be counted to +TcpExtTCPACKSkippedSynRecv, TcpExtTCPACKSkippedFinWait2 or +TcpExtTCPACKSkippedTimeWait. In all other statuses, the skipped ACK +would be counted to TcpExtTCPACKSkippedPAWS. + +* TcpExtTCPACKSkippedSeq +The sequence number is out of window and the timestamp passes the PAWS +check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait. + +* TcpExtTCPACKSkippedFinWait2 +The ACK is skipped in Fin-Wait-2 status, the reason would be either +PAWS check fails or the received sequence number is out of window. + +* TcpExtTCPACKSkippedTimeWait +The ACK is skipped in Time-Wait status, the reason would be either +PAWS check failed or the received sequence number is out of window. + +* TcpExtTCPACKSkippedChallenge +The ACK is skipped if the ACK is a challenge ACK. RFC 5961 defines +3 kinds of challenge ACKs, please refer to `RFC 5961 section 3.2`_, +`RFC 5961 section 4.2`_ and `RFC 5961 section 5.2`_. Besides these +three scenarios, in some TCP statuses, the Linux TCP stack would also +send challenge ACKs if the ACK number is before the first +unacknowledged number (more strict than `RFC 5961 section 5.2`_). + +.. 
_RFC 5961 section 3.2: https://tools.ietf.org/html/rfc5961#page-7 +.. _RFC 5961 section 4.2: https://tools.ietf.org/html/rfc5961#page-9 +.. _RFC 5961 section 5.2: https://tools.ietf.org/html/rfc5961#page-11 + examples ======= @@ -1188,3 +1278,151 @@ Run nstat on server B:: We have deleted the default route on server B. Server B couldn't find a route for the 8.8.8.8 IP address, so server B increased IpOutNoRoutes. + +TcpExtTCPACKSkippedSynRecv +------------------------ +In this test, we send 3 identical SYN packets from client to server. The +first SYN will let the server create a socket, set it to Syn-Recv status, +and reply a SYN/ACK. The second SYN will let the server reply the SYN/ACK +again, and record the reply time (the duplicate ACK reply time). The +third SYN will let the server check the previous duplicate ACK reply time, +and decide to skip the duplicate ACK, then increase the +TcpExtTCPACKSkippedSynRecv counter. + +Run tcpdump to capture a SYN packet:: + + nstatuser@nstat-a:~$ sudo tcpdump -c 1 -w /tmp/syn.pcap port 9000 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +Open another terminal, run the nc command:: + + nstatuser@nstat-a:~$ nc nstat-b 9000 + +As nstat-b didn't listen on port 9000, it should reply an RST, and +the nc command exited immediately. It was enough for the tcpdump +command to capture a SYN packet. A Linux server might use hardware +offload for the TCP checksum, so the checksum in the /tmp/syn.pcap +might not be correct. We call tcprewrite to fix it:: + + nstatuser@nstat-a:~$ tcprewrite --infile=/tmp/syn.pcap --outfile=/tmp/syn_fixcsum.pcap --fixcsum + +On nstat-b, we run nc to listen on port 9000:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + +On nstat-a, we blocked packets from port 9000, otherwise nstat-a would send +RST to nstat-b:: + + nstatuser@nstat-a:~$ sudo iptables -A INPUT -p tcp --sport 9000 -j DROP + +Send 3 SYNs repeatedly to nstat-b:: + + nstatuser@nstat-a:~$ for i in {1..3}; do sudo tcpreplay -i ens3 /tmp/syn_fixcsum.pcap; done + +Check the snmp counter on nstat-b:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedSynRecv 1 0.0 + +As we expected, TcpExtTCPACKSkippedSynRecv is 1. + +TcpExtTCPACKSkippedPAWS +---------------------- +To trigger PAWS, we could send an old SYN. + +On nstat-b, let nc listen on port 9000:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + +On nstat-a, run tcpdump to capture a SYN:: + + nstatuser@nstat-a:~$ sudo tcpdump -w /tmp/paws_pre.pcap -c 1 port 9000 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +On nstat-a, run nc as a client to connect nstat-b:: + + nstatuser@nstat-a:~$ nc -v nstat-b 9000 + Connection to nstat-b 9000 port [tcp/*] succeeded! + +Now the tcpdump has captured the SYN and exited. We should fix the +checksum:: + + nstatuser@nstat-a:~$ tcprewrite --infile /tmp/paws_pre.pcap --outfile /tmp/paws.pcap --fixcsum + +Send the SYN packet twice:: + + nstatuser@nstat-a:~$ for i in {1..2}; do sudo tcpreplay -i ens3 /tmp/paws.pcap; done + +On nstat-b, check the snmp counter:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedPAWS 1 0.0 + +We sent two SYNs via tcpreplay, both of them would make the PAWS check +fail; nstat-b replied an ACK for the first SYN, skipped the ACK +for the second SYN, and updated TcpExtTCPACKSkippedPAWS. 
+ +TcpExtTCPACKSkippedSeq +-------------------- +To trigger TcpExtTCPACKSkippedSeq, we send packets which have a valid +timestamp (to pass the PAWS check) but a sequence number that is out of +window. The Linux TCP stack would avoid skipping if the packet has +data, so we need a pure ACK packet. To generate such a packet, we +could create two sockets: one on port 9000, another on port 9001. Then +we capture an ACK on port 9001, change the source/destination port +numbers to match the port 9000 socket. Then we could trigger +TcpExtTCPACKSkippedSeq via this packet. + +On nstat-b, open two terminals, run two nc commands to listen on both +port 9000 and port 9001:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + + nstatuser@nstat-b:~$ nc -lkv 9001 + Listening on [0.0.0.0] (family 0, port 9001) + +On nstat-a, run two nc clients:: + + nstatuser@nstat-a:~$ nc -v nstat-b 9000 + Connection to nstat-b 9000 port [tcp/*] succeeded! + + nstatuser@nstat-a:~$ nc -v nstat-b 9001 + Connection to nstat-b 9001 port [tcp/*] succeeded! + +On nstat-a, run tcpdump to capture an ACK:: + + nstatuser@nstat-a:~$ sudo tcpdump -w /tmp/seq_pre.pcap -c 1 dst port 9001 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +On nstat-b, send a packet via the port 9001 socket. E.g. we sent a +string 'foo' in our example:: + + nstatuser@nstat-b:~$ nc -lkv 9001 + Listening on [0.0.0.0] (family 0, port 9001) + Connection from nstat-a 42132 received! + foo + +On nstat-a, the tcpdump should have captured the ACK. We should check +the source port numbers of the two nc clients:: + + nstatuser@nstat-a:~$ ss -ta '( dport = :9000 || dport = :9001 )' | tee + State Recv-Q Send-Q Local Address:Port Peer Address:Port + ESTAB 0 0 192.168.122.250:50208 192.168.122.251:9000 + ESTAB 0 0 192.168.122.250:42132 192.168.122.251:9001 + +Run tcprewrite, change port 9001 to port 9000, change port 42132 to +port 50208:: + + nstatuser@nstat-a:~$ tcprewrite --infile /tmp/seq_pre.pcap --outfile /tmp/seq.pcap -r 9001:9000 -r 42132:50208 --fixcsum + +Now the /tmp/seq.pcap is the packet we need. Send it to nstat-b:: + + nstatuser@nstat-a:~$ for i in {1..2}; do sudo tcpreplay -i ens3 /tmp/seq.pcap; done + +Check TcpExtTCPACKSkippedSeq on nstat-b:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedSeq 1 0.0 From 4087d2bc0d9469835f8d19d63a4a56739e5b8c5b Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 30 Dec 2018 23:24:11 +0800 Subject: [PATCH 38/57] net: rds: remove unnecessary NULL check The NULL check is already done in kfree(). Signed-off-by: Zhu Yanjun Signed-off-by: David S. Miller --- net/rds/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b9bbcf3d6c63..c16f0a362c32 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -623,7 +623,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net) if (rtn->rds_tcp_sysctl) unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - if (net != &init_net && rtn->ctl_table) + if (net != &init_net) kfree(rtn->ctl_table); } From aff6db454599d62191aabc208930e891748e4322 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 30 Dec 2018 12:43:42 -0800 Subject: [PATCH 39/57] ptr_ring: wrap back ->producer in __ptr_ring_swap_queue() __ptr_ring_swap_queue() tries to move pointers from the old ring to the new one, but it forgets to check if ->producer is beyond the new size at the end of the operation. This leads to an out-of-bounds access in __ptr_ring_produce() as reported by syzbot. 
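As a rough sketch (simplified structure and function names invented for illustration, not the actual ptr_ring code), the problem and the fix look like this:

    /* Toy ring: 'queue' has 'size' slots; produce writes at queue[producer]. */
    struct toy_ring {
            void **queue;
            int size;
            int producer;
            int consumer;
    };

    /* Drain the old array into a new, possibly smaller, one. */
    static void toy_swap_queue(struct toy_ring *r, void **queue, int size)
    {
            int producer = 0;

            while (r->consumer < r->size && r->queue[r->consumer]) {
                    void *ptr = r->queue[r->consumer++];

                    if (producer < size)
                            queue[producer++] = ptr;
                    /* else: the extra entry is dropped */
            }

            /*
             * If the old ring held at least 'size' entries, producer now
             * equals 'size', so the next produce would write queue[size],
             * one slot past the end of the new array.  Wrapping it back
             * to 0 first (the fix) keeps the index in range.
             */
            if (producer >= size)
                    producer = 0;

            r->queue = queue;
            r->size = size;
            r->producer = producer;
            r->consumer = 0;
    }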
Reported-by: syzbot+8993c0fa96d57c399735@syzkaller.appspotmail.com Fixes: 5d49de532002 ("ptr_ring: resize support") Cc: "Michael S. Tsirkin" Cc: John Fastabend Cc: Jason Wang Signed-off-by: Cong Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6894976b54e3..186cd8e970c7 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -573,6 +573,8 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); + if (producer >= size) + producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; From 8c76e77f9069f10505c08e02646c3ee11ad79038 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Dec 2018 17:21:05 -0500 Subject: [PATCH 40/57] tap: call skb_probe_transport_header after setting skb->dev The BPF flow dissector expects either skb->sk or skb->dev set on all skbs. Delay flow dissection until after skb->dev is set. This requires calling from within an rcu read-side critical section. That is fine, see also the call from tun_xdp_one. Fixes: d0e13a1488ad ("flow_dissector: lookup netns by skb->sk if skb->dev is NULL") Reported-by: Christian Borntraeger Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 443b2694130c..c0b52e48f0e6 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1177,8 +1177,6 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) goto err_kfree; } - skb_probe_transport_header(skb, ETH_HLEN); - /* Move network header to the right position for VLAN tagged packets */ if ((skb->protocol == htons(ETH_P_8021Q) || skb->protocol == htons(ETH_P_8021AD)) && @@ -1189,6 +1187,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; + skb_probe_transport_header(skb, ETH_HLEN); dev_queue_xmit(skb); } else { kfree_skb(skb); From cb9f1b783850b14cbd7f87d061d784a666dfba1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Dec 2018 17:24:36 -0500 Subject: [PATCH 41/57] ip: validate header length on virtual device xmit KMSAN detected read beyond end of buffer in vti and sit devices when passing truncated packets with PF_PACKET. The issue affects additional ip tunnel devices. Extend commit 76c0ddd8c3a6 ("ip6_tunnel: be careful when accessing the inner header") and commit ccfec9e5cb2d ("ip_tunnel: be careful when accessing the inner header"). Move the check to a separate helper and call at the start of each ndo_start_xmit function in net/ipv4 and net/ipv6. Minor changes: - convert dev_kfree_skb to kfree_skb on error path, as dev_kfree_skb calls consume_skb which is not for error paths. - use pskb_network_may_pull even though that is pedantic here, as the same as pskb_may_pull for devices without llheaders. - do not cache ipv6 hdrs if used only once (unsafe across pskb_may_pull, was more relevant to earlier patch) Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 20 ++++++++++++++++++++ net/ipv4/ip_gre.c | 9 +++++++++ net/ipv4/ip_tunnel.c | 9 --------- net/ipv4/ip_vti.c | 12 +++++++++--- net/ipv6/ip6_gre.c | 10 +++++++--- net/ipv6/ip6_tunnel.c | 10 +++------- net/ipv6/ip6_vti.c | 8 ++++---- net/ipv6/ip6mr.c | 17 +++++++++++------ net/ipv6/sit.c | 3 +++ 9 files changed, 66 insertions(+), 32 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cbcf35ce1b14..34f019650941 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -308,6 +308,26 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + int nhlen; + + switch (skb->protocol) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + default: + nhlen = 0; + } + + return pskb_network_may_pull(skb, nhlen); +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c7a7bd58a23c..d1d09f3e5f9e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -676,6 +676,9 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -719,6 +722,9 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { erspan_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -762,6 +768,9 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 284a22154b4e..c4f5602308ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,7 +627,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); - unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -637,14 +636,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; - /* ensure we can access the inner net header, for several users below */ - if (skb->protocol == htons(ETH_P_IP)) - inner_nhdr_len = sizeof(struct iphdr); - else if (skb->protocol == htons(ETH_P_IPV6)) - inner_nhdr_len = sizeof(struct ipv6hdr); - if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) - goto tx_error; - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b302d69c..d7b43e700023 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -241,6 +241,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { @@ -253,15 +256,18 @@ static netdev_tx_t 
vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); break; default: - dev->stats.tx_errors++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; + goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 229e55c99021..09d0826742f8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -881,6 +881,9 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -923,6 +926,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int nhoff; int thoff; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -995,8 +1001,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1004,7 +1008,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 99179b9c8384..0c6403cf8b52 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1243,10 +1243,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - /* ensure we can access the full inner ip header */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return -1; - iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1321,9 +1317,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - return -1; - ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || @@ -1405,6 +1398,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 706fe42e4928..8b6eefff2f7e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -522,18 +522,18 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h; struct flowi fl; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): - ipv6h = ipv6_hdr(skb); - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - vti6_addr_conflict(t, ipv6h)) + vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; xfrm_decode_session(skb, &fl, AF_INET6); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8276f1224f16..30337b38274b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -51,6 +51,7 @@ #include 
#include #include +#include #include @@ -599,13 +600,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; - int err; - err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (!pskb_inet_may_pull(skb)) + goto tx_err; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto tx_err; read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; @@ -614,6 +614,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 51c9f75f34b9..1e03305c0549 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1021,6 +1021,9 @@ static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); From e888402789b9db5de4fcda361331d66dbf0cd9fd Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 31 Dec 2018 10:58:29 +0800 Subject: [PATCH 42/57] net: hns3: call hns3_nic_net_open() while doing HNAE3_UP_CLIENT Since HNAE3_DOWN_CLIENT calls hns3_nic_net_stop(), HNAE3_UP_CLIENT should call hns3_nic_net_open(); if the number of queues or the TC mapping has changed before HNAE3_UP_CLIENT is called, it will cause problems. Also, the HNS3_NIC_STATE_RESETTING flag needs to be cleared before hns3_nic_net_open() is called, and set back if hns3_nic_net_open() fails. Fixes: bb6b94a896d4 ("net: hns3: Add reset interface implementation in client") Signed-off-by: Huazhong Tan Signed-off-by: Yunsheng Lin Signed-off-by: Peng Li Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index d3b9aaf96c1c..07cd58798083 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3995,17 +3995,18 @@ static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); int ret = 0; + clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); + if (netif_running(kinfo->netdev)) { - ret = hns3_nic_net_up(kinfo->netdev); + ret = hns3_nic_net_open(kinfo->netdev); if (ret) { + set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); netdev_err(kinfo->netdev, "hns net up fail, ret=%d!\n", ret); return ret; } } - clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); - return ret; } From 53fe3ed19df0bca6ce752fae8e483910b6f112f6 Mon Sep 17 00:00:00 2001 From: Xue Chaojing Date: Tue, 1 Jan 2019 19:39:33 +0000 Subject: [PATCH 43/57] net-next/hinic:add shutdown callback If there is no shutdown callback, our board will report pcie UNF errors after restarting. This patch adds a shutdown callback for hinic. Signed-off-by: Xue Chaojing Signed-off-by: David S. 
Miller --- drivers/net/ethernet/huawei/hinic/hinic_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 6d48dc62a44b..da323b9e1f62 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -1106,6 +1106,11 @@ static void hinic_remove(struct pci_dev *pdev) dev_info(&pdev->dev, "HiNIC driver - removed\n"); } +static void hinic_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + static const struct pci_device_id hinic_pci_table[] = { { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_QUAD_PORT_25GE), 0}, { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_25GE), 0}, @@ -1119,6 +1124,7 @@ static struct pci_driver hinic_driver = { .id_table = hinic_pci_table, .probe = hinic_probe, .remove = hinic_remove, + .shutdown = hinic_shutdown, }; module_pci_driver(hinic_driver); From 202700e30740c6568b5a6943662f3829566dd533 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jan 2019 04:24:20 -0800 Subject: [PATCH 44/57] net/hamradio/6pack: use mod_timer() to rearm timers Using del_timer() + add_timer() is generally unsafe on SMP, as noticed by syzbot. Use mod_timer() instead. kernel BUG at kernel/time/timer.c:1136! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 1026 Comm: kworker/u4:4 Not tainted 4.20.0+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events_unbound flush_to_ldisc RIP: 0010:add_timer kernel/time/timer.c:1136 [inline] RIP: 0010:add_timer+0xa81/0x1470 kernel/time/timer.c:1134 Code: 4d 89 7d 40 48 c7 85 70 fe ff ff 00 00 00 00 c7 85 7c fe ff ff ff ff ff ff 48 89 85 90 fe ff ff e9 e6 f7 ff ff e8 cf 42 12 00 <0f> 0b e8 c8 42 12 00 0f 0b e8 c1 42 12 00 4c 89 bd 60 fe ff ff e9 RSP: 0018:ffff8880a7fdf5a8 EFLAGS: 00010293 RAX: ffff8880a7846340 RBX: dffffc0000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff816f3ee1 RDI: ffff88808a514ff8 RBP: ffff8880a7fdf760 R08: 0000000000000007 R09: ffff8880a7846c58 R10: ffff8880a7846340 R11: 0000000000000000 R12: ffff88808a514ff8 R13: ffff88808a514ff8 R14: ffff88808a514dc0 R15: 0000000000000030 FS: 0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000061c500 CR3: 00000000994d9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: decode_prio_command drivers/net/hamradio/6pack.c:903 [inline] sixpack_decode drivers/net/hamradio/6pack.c:971 [inline] sixpack_receive_buf drivers/net/hamradio/6pack.c:457 [inline] sixpack_receive_buf+0xf9c/0x1470 drivers/net/hamradio/6pack.c:434 tty_ldisc_receive_buf+0x164/0x1c0 drivers/tty/tty_buffer.c:465 tty_port_default_receive_buf+0x114/0x190 drivers/tty/tty_port.c:38 receive_buf drivers/tty/tty_buffer.c:481 [inline] flush_to_ldisc+0x3b2/0x590 drivers/tty/tty_buffer.c:533 process_one_work+0xd0c/0x1ce0 kernel/workqueue.c:2153 worker_thread+0x143/0x14a0 kernel/workqueue.c:2296 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Andreas Koensgen Signed-off-by: David S. 
Miller --- drivers/net/hamradio/6pack.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 28c749980359..a19868cba48c 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -523,10 +523,7 @@ static void resync_tnc(struct timer_list *t) /* Start resync timer again -- the TNC might be still absent */ - - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); + mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); } static inline int tnc_init(struct sixpack *sp) @@ -537,9 +534,7 @@ static inline int tnc_init(struct sixpack *sp) sp->tty->ops->write(sp->tty, &inbyte, 1); - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); + mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); return 0; } @@ -897,11 +892,8 @@ static void decode_prio_command(struct sixpack *sp, unsigned char cmd) /* if the state byte has been received, the TNC is present, so the resync timer can be reset. */ - if (sp->tnc_state == TNC_IN_SYNC) { - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_INIT_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); - } + if (sp->tnc_state == TNC_IN_SYNC) + mod_timer(&sp->resync_t, jiffies + SIXP_INIT_RESYNC_TIMEOUT); sp->status1 = cmd & SIXP_PRIO_DATA_MASK; } From 7adf3246092f5e87ed0fa610e8088fae416c581f Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 2 Jan 2019 13:29:27 +0100 Subject: [PATCH 45/57] ipv6: route: Fix return value of ip6_neigh_lookup() on neigh_create() error In ip6_neigh_lookup(), we must not return errors coming from neigh_create(): if creation of a neighbour entry fails, the lookup should return NULL, in the same way as it's done in __neigh_lookup(). Otherwise, callers legitimately checking for a non-NULL return value of the lookup function might dereference an invalid pointer. For instance, on neighbour table overflow, ndisc_router_discovery() crashes ndisc_update() by passing ERR_PTR(-ENOBUFS) as 'neigh' argument. Reported-by: Jianlin Shi Fixes: f8a1b43b709d ("net/ipv6: Create a neigh_lookup for FIB entries") Signed-off-by: Stefano Brivio Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a94e0b02a8ac..40b225f87d5e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -210,7 +210,9 @@ struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dev); + + n = neigh_create(&nd_tbl, daddr, dev); + return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, From d63967e475ae10f286dbd35e189cb241e0b1f284 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jan 2019 09:20:27 -0800 Subject: [PATCH 46/57] isdn: fix kernel-infoleak in capi_unlocked_ioctl Since capi_ioctl() copies 64 bytes after calling capi20_get_manufacturer() we need to ensure to not leak information to user. 
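A minimal sketch of the leak (hypothetical helper, not the driver code): strlcpy() writes only the string and its NUL terminator and leaves the rest of the on-stack buffer uninitialized, while strncpy() zero-pads up to the requested length, so copying the full 64 bytes to userspace no longer exposes stale stack data:

    /* Hypothetical example for illustration only. */
    static int example_get_manufacturer(void __user *ubuf)
    {
            char buf[CAPI_MANUFACTURER_LEN];        /* 64 bytes, not zeroed */

            /*
             * With strlcpy() only strlen(...) + 1 bytes of buf would be
             * written; the copy_to_user() below would then push the
             * remaining uninitialized stack bytes to userspace.  strncpy()
             * pads the tail with zeroes, so the same copy is safe.
             */
            strncpy(buf, "example manufacturer", sizeof(buf));

            if (copy_to_user(ubuf, buf, sizeof(buf)))
                    return -EFAULT;
            return 0;
    }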
BUG: KMSAN: kernel-infoleak in _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 CPU: 0 PID: 11245 Comm: syz-executor633 Not tainted 4.20.0-rc7+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 kmsan_internal_check_memory+0x9d4/0xb00 mm/kmsan/kmsan.c:704 kmsan_copy_to_user+0xab/0xc0 mm/kmsan/kmsan_hooks.c:601 _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 capi_ioctl include/linux/uaccess.h:177 [inline] capi_unlocked_ioctl+0x1a0b/0x1bf0 drivers/isdn/capi/capi.c:939 do_vfs_ioctl+0xebd/0x2bf0 fs/ioctl.c:46 ksys_ioctl fs/ioctl.c:713 [inline] __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl+0x1da/0x270 fs/ioctl.c:718 __x64_sys_ioctl+0x4a/0x70 fs/ioctl.c:718 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x440019 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffdd4659fb8 EFLAGS: 00000213 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440019 RDX: 0000000020000080 RSI: 00000000c0044306 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000213 R12: 00000000004018a0 R13: 0000000000401930 R14: 0000000000000000 R15: 0000000000000000 Local variable description: ----data.i@capi_unlocked_ioctl Variable was created at: capi_ioctl drivers/isdn/capi/capi.c:747 [inline] capi_unlocked_ioctl+0x82/0x1bf0 drivers/isdn/capi/capi.c:939 do_vfs_ioctl+0xebd/0x2bf0 fs/ioctl.c:46 Bytes 12-63 of 64 are uninitialized Memory access of size 64 starts at ffff88807ac5fce8 Data copied to user address 0000000020000080 Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/capi/kcapi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 0ff517d3c98f..a4ceb61c5b60 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -852,7 +852,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) u16 ret; if (contr == 0) { - strlcpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); + strncpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); return CAPI_NOERROR; } @@ -860,7 +860,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { - strlcpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN); + strncpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; From c08435ec7f2bc8f4109401f696fd55159b4b40cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:27 +0100 Subject: [PATCH 47/57] bpf: move {prev_,}insn_idx into verifier env Move prev_insn_idx and insn_idx from the do_check() function into the verifier environment, so they can be read inside the various helper functions for handling the instructions. It's easier to put this into the environment rather than changing all call-sites only to pass it along. insn_idx is useful in particular since this later on allows to hold state in env->insn_aux_data[env->insn_idx]. 
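In outline (a simplified sketch with invented names, not the verifier code itself), the change follows the usual pattern of hoisting loop-local indices into the shared environment so helpers can read them without extra parameters:

    /* Toy version of the pattern; 'u32' as in kernel code. */
    struct toy_env {
            u32 insn_idx;           /* current instruction */
            u32 prev_insn_idx;      /* previous instruction */
            /* ... per-instruction aux state indexed by insn_idx ... */
    };

    static void toy_helper(struct toy_env *env)
    {
            /* Can use env->insn_idx directly, e.g. to address per-insn
             * auxiliary data, instead of taking the index as a parameter. */
    }

    static int toy_check(struct toy_env *env, u32 insn_cnt)
    {
            for (env->insn_idx = 0; env->insn_idx < insn_cnt; env->insn_idx++)
                    toy_helper(env);
            return 0;
    }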
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c | 76 ++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c233efc106c6..3f84f3e87704 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,8 @@ struct bpf_subprog_info { * one verifier_env per bpf_check() call */ struct bpf_verifier_env { + u32 insn_idx; + u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71d86e3024ae..afa8515bbb34 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,6 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len, i; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -5670,19 +5669,19 @@ static int do_check(struct bpf_verifier_env *env) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, 0 /* subprogno, zero == main subprog */); - insn_idx = 0; + for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -5692,7 +5691,7 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { @@ -5700,9 +5699,9 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { if (do_print_state) verbose(env, "\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); else - verbose(env, "%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -5715,10 +5714,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1 || (env->log.level && do_print_state)) { if (env->log.level > 1) - verbose(env, "%d:", insn_idx); + verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d:", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -5729,20 +5728,20 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; - verbose_linfo(env, insn_idx, "; "); - verbose(env, "%d: ", insn_idx); + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, insn_idx, - prev_insn_idx); + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); - env->insn_aux_data[insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -5768,13 +5767,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is 
readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -5799,10 +5798,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -5818,13 +5817,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -5852,9 +5851,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -5872,9 +5871,9 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &insn_idx); + err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, insn_idx); + err = check_helper_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -5887,7 +5886,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -5901,8 +5900,8 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - prev_insn_idx = insn_idx; - err = prepare_func_exit(env, &insn_idx); + env->prev_insn_idx = env->insn_idx; + err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; @@ -5932,7 +5931,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &prev_insn_idx, &insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -5942,7 +5942,7 @@ static int do_check(struct bpf_verifier_env *env) continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -5959,8 +5959,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -5970,7 +5970,7 @@ static int do_check(struct 
bpf_verifier_env *env) return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose(env, "processed %d insns (limit %d), stack depth ", From 144cd91c4c2bced6eb8a7e25e590f6618a11e854 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:28 +0100 Subject: [PATCH 48/57] bpf: move tmp variable into ax register in interpreter This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- kernel/bpf/core.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 8c8544b375eb..84a6a98f8328 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -60,7 +60,8 @@ struct sock_reuseport; * constants. See JIT pre-step in bpf_jit_blind_constants(). */ #define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 38de580abcc2..a34312a5eea2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -54,6 +54,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -1188,7 +1189,6 @@ bool bpf_opcode_in_insntable(u8 code) */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1268,36 +1268,36 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1553,7 +1553,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) 
(unsigned long) ctx; \ @@ -1566,7 +1566,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ From 9b73bfdd08e73231d6a90ae6db4b46b3fbf56c30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:29 +0100 Subject: [PATCH 49/57] bpf: enable access to ax register also from verifier rewrite Right now we are using BPF ax register in JIT for constant blinding as well as in interpreter as temporary variable. Verifier will not be able to use it simply because its use will get overridden from the former in bpf_jit_blind_insn(). However, it can be made to work in that blinding will be skipped if there is prior use in either source or destination register on the instruction. Taking constraints of ax into account, the verifier is then open to use it in rewrites under some constraints. Note, ax register already has mappings in every eBPF JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +------ kernel/bpf/core.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 84a6a98f8328..ad106d845b22 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -53,12 +53,7 @@ struct sock_reuseport; #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ -/* Kernel hidden auxiliary/helper register for hardening step. - * Only used by eBPF JITs. It's nothing more than a temporary - * register that JITs use internally, only that here it's part - * of eBPF instructions that have been rewritten for blinding - * constants. See JIT pre-step in bpf_jit_blind_constants(). - */ +/* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a34312a5eea2..f908b9356025 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -858,6 +858,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! 
+ */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { From 0d6303db7970e6f56ae700fa07e11eb510cda125 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:30 +0100 Subject: [PATCH 50/57] bpf: restrict map value pointer arithmetic for unprivileged Restrict map value pointer arithmetic for unprivileged users in that arithmetic itself must not go out of bounds as opposed to the actual access later on. Therefore after each adjust_ptr_min_max_vals() with a map value pointer as a destination it will simulate a check_map_access() of 1 byte on the destination and once that fails the program is rejected for unprivileged program loads. We use this later on for masking any pointer arithmetic with the remainder of the map value space. The likelihood of breaking any existing real-world unprivileged eBPF program is very small for this corner case. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index afa8515bbb34..4da8c73e168f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3249,6 +3249,17 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n", + dst); + return -EACCES; + } + return 0; } From e4298d25830a866cc0f427d4bccb858e76715859 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:31 +0100 Subject: [PATCH 51/57] bpf: restrict stack pointer arithmetic for unprivileged Restrict stack pointer arithmetic for unprivileged users in that arithmetic itself must not go out of bounds as opposed to the actual access later on. Therefore after each adjust_ptr_min_max_vals() with a stack pointer as a destination we simulate a check_stack_access() of 1 byte on the destination and once that fails the program is rejected for unprivileged program loads. This is analog to map value pointer arithmetic and needed for masking later on. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 63 ++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4da8c73e168f..9ac205d1b8b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1387,6 +1387,31 @@ static int check_stack_read(struct bpf_verifier_env *env, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). 
+ */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1954,24 +1979,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). - */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, - size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; state = func(env, reg); err = update_stack_depth(env, state, off); @@ -3253,11 +3264,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE && - check_map_access(env, dst, dst_reg->off, 1, false)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n", - dst); - return -EACCES; + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } } return 0; From 9d7eceede769f90b66cfa06ad5b357140d5141ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:32 +0100 Subject: [PATCH 52/57] bpf: restrict unknown scalars of mixed signed bounds for unprivileged For unknown scalars of mixed signed bounds, meaning their smin_value is negative and their smax_value is positive, we need to reject arithmetic with pointer to map value. For unprivileged the goal is to mask every map pointer arithmetic and this cannot reliably be done when it is unknown at verification time whether the scalar value is negative or positive. Given this is a corner case, the likelihood of breaking should be very small. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9ac205d1b8b7..eebbc03e5af2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3081,8 +3081,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; dst_reg = ®s[dst]; @@ -3115,6 +3115,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: break; } From b7137c4eab85c1cf3d46acdde90ce1163b28c873 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:33 +0100 Subject: [PATCH 53/57] bpf: fix check_map_access smin_value test when pointer contains offset In check_map_access() we probe actual bounds through __check_map_access() with offset of reg->smin_value + off for lower bound and offset of reg->umax_value + off for the upper bound. However, even though the reg->smin_value could have a negative value, the final result of the sum with off could be positive when pointer arithmetic with known and unknown scalars is combined. In this case we reject the program with an error such as "R min value is negative, either use unsigned index or do a if (index >=0) check." even though the access itself would be fine. Therefore extend the check to probe whether the actual resulting reg->smin_value + off is less than zero. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eebbc03e5af2..8e5da1ce5da4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1443,13 +1443,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (env->log.level) print_verifier_state(env, state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. 
*/ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; From 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:34 +0100 Subject: [PATCH 54/57] bpf: prevent out of bounds speculation on pointer arithmetic Jann reported that the original commit back in b2157399cc98 ("bpf: prevent out-of-bounds speculation") was not sufficient to stop CPU from speculating out of bounds memory access: While b2157399cc98 only focussed on masking array map access for unprivileged users for tail calls and data access such that the user provided index gets sanitized from BPF program and syscall side, there is still a more generic form affected from BPF programs that applies to most maps that hold user data in relation to dynamic map access when dealing with unknown scalars or "slow" known scalars as access offset, for example: - Load a map value pointer into R6 - Load an index into R7 - Do a slow computation (e.g. with a memory dependency) that loads a limit into R8 (e.g. load the limit from a map for high latency, then mask it to make the verifier happy) - Exit if R7 >= R8 (mispredicted branch) - Load R0 = R6[R7] - Load R0 = R6[R0] For unknown scalars there are two options in the BPF verifier where we could derive knowledge from in order to guarantee safe access to the memory: i) While /<=/>= variants won't allow to derive any lower or upper bounds from the unknown scalar where it would be safe to add it to the map value pointer, it is possible through ==/!= test however. ii) another option is to transform the unknown scalar into a known scalar, for example, through ALU ops combination such as R &= followed by R |= or any similar combination where the original information from the unknown scalar would be destroyed entirely leaving R with a constant. The initial slow load still precedes the latter ALU ops on that register, so the CPU executes speculatively from that point. Once we have the known scalar, any compare operation would work then. A third option only involving registers with known scalars could be crafted as described in [0] where a CPU port (e.g. Slow Int unit) would be filled with many dependent computations such that the subsequent condition depending on its outcome has to wait for evaluation on its execution port and thereby executing speculatively if the speculated code can be scheduled on a different execution port, or any other form of mistraining as described in [1], for example. Given this is not limited to only unknown scalars, not only map but also stack access is affected since both is accessible for unprivileged users and could potentially be used for out of bounds access under speculation. In order to prevent any of these cases, the verifier is now sanitizing pointer arithmetic on the offset such that any out of bounds speculation would be masked in a way where the pointer arithmetic result in the destination register will stay unchanged, meaning offset masked into zero similar as in array_index_nospec() case. With regards to implementation, there are three options that were considered: i) new insn for sanitation, ii) push/pop insn and sanitation as inlined BPF, iii) reuse of ax register and sanitation as inlined BPF. 
Option i) has the downside that we end up using from reserved bits in the opcode space, but also that we would require each JIT to emit masking as native arch opcodes meaning mitigation would have slow adoption till everyone implements it eventually which is counter-productive. Option ii) and iii) have both in common that a temporary register is needed in order to implement the sanitation as inlined BPF since we are not allowed to modify the source register. While a push / pop insn in ii) would be useful to have in any case, it requires once again that every JIT needs to implement it first. While possible, amount of changes needed would also be unsuitable for a -stable patch. Therefore, the path which has fewer changes, less BPF instructions for the mitigation and does not require anything to be changed in the JITs is option iii) which this work is pursuing. The ax register is already mapped to a register in all JITs (modulo arm32 where it's mapped to stack as various other BPF registers there) and used in constant blinding for JITs-only so far. It can be reused for verifier rewrites under certain constraints. The interpreter's tmp "register" has therefore been remapped into extending the register set with hidden ax register and reusing that for a number of instructions that needed the prior temporary variable internally (e.g. div, mod). This allows for zero increase in stack space usage in the interpreter, and enables (restricted) generic use in rewrites otherwise as long as such a patchlet does not make use of these instructions. The sanitation mask is dynamic and relative to the offset the map value or stack pointer currently holds. There are various cases that need to be taken under consideration for the masking, e.g. such operation could look as follows: ptr += val or val += ptr or ptr -= val. Thus, the value to be sanitized could reside either in source or in destination register, and the limit is different depending on whether the ALU op is addition or subtraction and depending on the current known and bounded offset. The limit is derived as follows: limit := max_value_size - (smin_value + off). For subtraction: limit := umax_value + off. This holds because we do not allow any pointer arithmetic that would temporarily go out of bounds or would have an unknown value with mixed signed bounds where it is unclear at verification time whether the actual runtime value would be either negative or positive. For example, we have a derived map pointer value with constant offset and bounded one, so limit based on smin_value works because the verifier requires that statically analyzed arithmetic on the pointer must be in bounds, and thus it checks if resulting smin_value + off and umax_value + off is still within map value bounds at time of arithmetic in addition to time of access. Similarly, for the case of stack access we derive the limit as follows: MAX_BPF_STACK + off for subtraction and -off for the case of addition where off := ptr_reg->off + ptr_reg->var_off.value. Subtraction is a special case for the masking which can be in form of ptr += -val, ptr -= -val, or ptr -= val. In the first two cases where we know that the value is negative, we need to temporarily negate the value in order to do the sanitation on a positive value where we later swap the ALU op, and restore original source register if the value was in source. The sanitation of pointer arithmetic alone is still not fully sufficient as is, since a scenario like the following could happen ... PTR += 0x1000 (e.g. 
K-based imm) PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON PTR += 0x1000 PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON [...] ... which under speculation could end up as ... PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] [...] ... and therefore still access out of bounds. To prevent such case, the verifier is also analyzing safety for potential out of bounds access under speculative execution. Meaning, it is also simulating pointer access under truncation. We therefore "branch off" and push the current verification state after the ALU operation with known 0 to the verification stack for later analysis. Given the current path analysis succeeded it is likely that the one under speculation can be pruned. In any case, it is also subject to existing complexity limits and therefore anything beyond this point will be rejected. In terms of pruning, it needs to be ensured that the verification state from speculative execution simulation must never prune a non-speculative execution path, therefore, we mark verifier state accordingly at the time of push_stack(). If verifier detects out of bounds access under speculative execution from one of the possible paths that includes a truncation, it will reject such program. Given we mask every reg-based pointer arithmetic for unprivileged programs, we've been looking into how it could affect real-world programs in terms of size increase. As the majority of programs are targeted for privileged-only use case, we've unconditionally enabled masking (with its alu restrictions on top of it) for privileged programs for the sake of testing in order to check i) whether they get rejected in its current form, and ii) by how much the number of instructions and size will increase. We've tested this by using Katran, Cilium and test_l4lb from the kernel selftests. For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb we've used test_l4lb.o as well as test_l4lb_noinline.o. We found that none of the programs got rejected by the verifier with this change, and that impact is rather minimal to none. balancer_kern.o had 13,904 bytes (1,738 insns) xlated and 7,797 bytes JITed before and after the change. Most complex program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated and 18,538 bytes JITed before and after and none of the other tail call programs in bpf_lxc.o had any changes either. For the older bpf_lxc_opt_-DUNKNOWN.o object we found a small increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed after the change. Other programs from that object file had similar small increase. Both test_l4lb.o had no change and remained at 6,544 bytes (817 insns) xlated and 3,401 bytes JITed and for test_l4lb_noinline.o constant at 5,080 bytes (634 insns) xlated and 3,313 bytes JITed. This can be explained in that LLVM typically optimizes stack based pointer arithmetic by using K-based operations and that use of dynamic map access is not overly frequent. However, in future we may decide to optimize the algorithm further under known guarantees from branch and value speculation. Latter seems also unclear in terms of prediction heuristics that today's CPUs apply as well as whether there could be collisions in e.g. the predictor's Value History/Pattern Table for triggering out of bounds access, thus masking is performed unconditionally at this point but could be subject to relaxation later on. 
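To make the runtime effect of the rewrite concrete, here is a small user space C model of the inlined sanitation sequence. This is an editorial sketch, not kernel code; the function name sanitize_off and the example limit of 48 bytes are made up for illustration, and alu_limit corresponds to the aux->alu_limit value derived as described above:

	#include <stdint.h>
	#include <stdio.h>

	/* Branchless model of the patched-in sequence
	 * (MOV32 limit-1, SUB, OR, NEG, ARSH 63, AND):
	 * returns off unchanged if off < alu_limit, otherwise 0.
	 */
	static uint64_t sanitize_off(uint64_t off, uint32_t alu_limit)
	{
		/* MOV32 zero-extends, so force 32-bit wrap of limit - 1 */
		uint64_t ax = (uint32_t)(alu_limit - 1);

		ax -= off;	/* for verifier-bounded offsets: sign bit set iff off >= alu_limit */
		ax |= off;
		ax = -ax;
		ax = (uint64_t)((int64_t)ax >> 63);	/* all ones if in range, else 0 */
		return ax & off;
	}

	int main(void)
	{
		printf("%llu\n", (unsigned long long)sanitize_off(13, 48));	/* 13 */
		printf("%llu\n", (unsigned long long)sanitize_off(48, 48));	/* 0  */
		return 0;
	}

The point of the SUB/OR/NEG/ARSH combination is that the result is computed without any data-dependent branch, so there is nothing left to mispredict: an in-range offset passes through unchanged and an out-of-range one collapses to zero, analogous to array_index_nospec().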
We were generally also brainstorming various other approaches for mitigation, but the blocker was always lack of available registers at runtime and/or overhead for runtime tracking of limits belonging to a specific pointer. Thus, we found this to be minimally intrusive under given constraints. With that in place, a simple example with sanitized access on unprivileged load at post-verification time looks as follows: # bpftool prog dump xlated id 282 [...] 28: (79) r1 = *(u64 *)(r7 +0) 29: (79) r2 = *(u64 *)(r7 +8) 30: (57) r1 &= 15 31: (79) r3 = *(u64 *)(r0 +4608) 32: (57) r3 &= 1 33: (47) r3 |= 1 34: (2d) if r2 > r3 goto pc+19 35: (b4) (u32) r11 = (u32) 20479 | 36: (1f) r11 -= r2 | Dynamic sanitation for pointer 37: (4f) r11 |= r2 | arithmetic with registers 38: (87) r11 = -r11 | containing bounded or known 39: (c7) r11 s>>= 63 | scalars in order to prevent 40: (5f) r11 &= r2 | out of bounds speculation. 41: (0f) r4 += r11 | 42: (71) r4 = *(u8 *)(r4 +0) 43: (6f) r4 <<= r1 [...] For the case where the scalar sits in the destination register as opposed to the source register, the following code is emitted for the above example: [...] 16: (b4) (u32) r11 = (u32) 20479 17: (1f) r11 -= r2 18: (4f) r11 |= r2 19: (87) r11 = -r11 20: (c7) r11 s>>= 63 21: (5f) r2 &= r11 22: (0f) r2 += r0 23: (61) r0 = *(u32 *)(r2 +0) [...] JIT blinding example with non-conflicting use of r10: [...] d5: je 0x0000000000000106 _ d7: mov 0x0(%rax),%edi | da: mov $0xf153246,%r10d | Index load from map value and e0: xor $0xf153259,%r10 | (const blinded) mask with 0x1f. e7: and %r10,%rdi |_ ea: mov $0x2f,%r10d | f0: sub %rdi,%r10 | Sanitized addition. Both use r10 f3: or %rdi,%r10 | but do not interfere with each f6: neg %r10 | other. (Neither do these instructions f9: sar $0x3f,%r10 | interfere with the use of ax as temp fd: and %r10,%rdi | in interpreter.) 100: add %rax,%rdi |_ 103: mov 0x0(%rdi),%eax [...] Tested that it fixes Jann's reproducer, and also checked that test_verifier and test_progs suite with interpreter, JIT and JIT with hardening enabled on x86-64 and arm64 runs successfully. [0] Speculose: Analyzing the Security Implications of Speculative Execution in CPUs, Giorgi Maisuradze and Christian Rossow, https://arxiv.org/pdf/1801.04084.pdf [1] A Systematic Evaluation of Transient Execution Attacks and Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz, Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens, Dmitry Evtyushkin, Daniel Gruss, https://arxiv.org/pdf/1811.05441.pdf Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 10 ++ kernel/bpf/verifier.c | 185 +++++++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3f84f3e87704..27b74947cd2b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -148,6 +148,7 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; u32 curframe; + bool speculative; }; #define bpf_get_spilled_reg(slot, frame) \ @@ -167,15 +168,24 @@ struct bpf_verifier_state_list { struct bpf_verifier_state_list *next; }; +/* Possible states for alu_state member. 
*/ +#define BPF_ALU_SANITIZE_SRC 1U +#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ + BPF_ALU_SANITIZE_DST) + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ + u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + u8 alu_state; /* used in combination with alu_limit */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e5da1ce5da4..f6bc62a9ee8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; @@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -3067,6 +3070,102 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? 
BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + + /* If we arrived here from different branches with different + * limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -3087,6 +3186,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); + int ret; dst_reg = ®s[dst]; @@ -3142,6 +3242,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -3192,6 +3297,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -4389,7 +4499,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; @@ -5499,6 +5610,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. 
+ */ + if (old->speculative && !cur->speculative) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5700,6 +5817,7 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; + state->speculative = false; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5739,8 +5857,10 @@ static int do_check(struct bpf_verifier_env *env) /* found equivalent state, can prune the search */ if (env->log.level) { if (do_print_state) - verbose(env, "\nfrom %d to %d: safe\n", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else verbose(env, "%d: safe\n", env->insn_idx); } @@ -5757,8 +5877,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1) verbose(env, "%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d:", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -6750,6 +6872,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) From 80c9b2fae87bb5c5698940da1a981f14f89518d1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:35 +0100 Subject: [PATCH 55/57] bpf: add various test cases to selftests Add various map value pointer related test cases to test_verifier kselftest to reflect recent changes and improve test coverage. 
The tests include basic masking functionality, unprivileged behavior on pointer arithmetic which goes oob, mixed bounds tests, negative unknown scalar but resulting positive offset for access and helper range, handling of arithmetic from multiple maps, various masking scenarios with subsequent map value access and others including two test cases from Jann Horn for prior fixes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 1108 ++++++++++++++++++- 1 file changed, 1105 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 33f7d38849b8..10d44446e801 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -2577,6 +2578,7 @@ static struct bpf_test tests[] = { }, .result = REJECT, .errstr = "invalid stack off=-79992 size=8", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", }, { "PTR_TO_STACK store/load - out of bounds high", @@ -3104,6 +3106,8 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -3205,6 +3209,243 @@ static struct bpf_test tests[] = { /* Verifier rewrite for unpriv skips tail call here. */ .retval_unpriv = 2, }, + { + "PTR_TO_STACK check high 1", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check high 2", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ST_MEM(BPF_B, BPF_REG_1, -1, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check high 3", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_ST_MEM(BPF_B, BPF_REG_1, -1, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, -1), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result_unpriv = REJECT, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check high 4", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid stack off=0 size=1", + .result = REJECT, + }, + { + "PTR_TO_STACK check high 5", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check high 6", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MAX, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MAX), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check high 7", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 
(1 << 29) - 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MAX, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MAX), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "fp pointer offset", + }, + { + "PTR_TO_STACK check low 1", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -512), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check low 2", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -513), + BPF_ST_MEM(BPF_B, BPF_REG_1, 1, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 1), + BPF_EXIT_INSN(), + }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check low 3", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -513), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid stack off=-513 size=1", + .result = REJECT, + }, + { + "PTR_TO_STACK check low 4", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, INT_MIN), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "math between fp pointer", + }, + { + "PTR_TO_STACK check low 5", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check low 6", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MIN, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MIN), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check low 7", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MIN, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MIN), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "fp pointer offset", + }, + { + "PTR_TO_STACK mixed reg/k, 1", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK mixed reg/k, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_10), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_5, -6), + BPF_EXIT_INSN(), + }, + 
.result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK mixed reg/k, 3", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = -3, + }, + { + "PTR_TO_STACK reg", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result_unpriv = REJECT, + .errstr_unpriv = "invalid stack off=0 size=1", + .result = ACCEPT, + .retval = 42, + }, { "stack pointer arithmetic", .insns = { @@ -6609,6 +6850,232 @@ static struct bpf_test tests[] = { .errstr = "R1 min value is negative", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "map access: known scalar += value_ptr from different maps", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps", + .retval = 1, + }, + { + "map access: value_ptr -= known scalar from different maps", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 min value is outside of the array range", + .retval = 1, + }, + { + "map access: known scalar += value_ptr from different maps, but same value properties", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + 
BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: value_ptr += known scalar, upper oob arith, test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 48), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr += known scalar, upper oob arith, test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 49), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr += known scalar, upper oob arith, test 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr -= known scalar, lower oob arith, test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 48), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R0 min value is outside of the array range", + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + }, + { + "map access: value_ptr -= known scalar, lower oob arith, test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + 
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 48), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr -= known scalar, lower oob arith, test 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, { "map access: known scalar += value_ptr", .insns = { @@ -6630,7 +7097,7 @@ static struct bpf_test tests[] = { .retval = 1, }, { - "map access: value_ptr += known scalar", + "map access: value_ptr += known scalar, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -6650,7 +7117,113 @@ static struct bpf_test tests[] = { .retval = 1, }, { - "map access: unknown scalar += value_ptr", + "map access: value_ptr += known scalar, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 49), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "invalid access to map value", + }, + { + "map access: value_ptr += known scalar, 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "invalid access to map value", + }, + { + "map access: value_ptr += known scalar, 4", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, -2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, 
BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr += known scalar, 5", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, (6 + 1) * sizeof(int)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: value_ptr += known scalar, 6", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, (3 + 1) * sizeof(int)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 3 * sizeof(int)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: unknown scalar += value_ptr, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -6671,7 +7244,76 @@ static struct bpf_test tests[] = { .retval = 1, }, { - "map access: value_ptr += unknown scalar", + "map access: unknown scalar += value_ptr, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: unknown scalar += value_ptr, 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 0xabcdef12, + }, + { + "map access: unknown scalar += value_ptr, 4", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 
0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_IMM(BPF_REG_1, 19), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R1 max value is outside of the array range", + .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", + }, + { + "map access: value_ptr += unknown scalar, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -6691,6 +7333,54 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 1, }, + { + "map access: value_ptr += unknown scalar, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: value_ptr += unknown scalar, 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 8), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 16), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_OR, BPF_REG_3, 1), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_3, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JA, 0, 0, -3), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, { "map access: value_ptr += value_ptr", .insns = { @@ -6770,6 +7460,8 @@ static struct bpf_test tests[] = { }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -6837,6 +7529,8 @@ static struct bpf_test tests[] = { }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -8376,6 +9070,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8400,6 +9095,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8426,6 +9122,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8451,6 +9148,7 @@ static struct bpf_test tests[] = { }, 
.fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8499,6 +9197,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8570,6 +9269,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8621,6 +9321,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8648,6 +9349,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8674,6 +9376,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8703,6 +9406,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R7 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8733,6 +9437,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 4 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8761,6 +9466,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, .result_unpriv = REJECT, }, @@ -8813,8 +9519,38 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, + { + "check subtraction on pointers for unpriv", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_ARG2, 0, 9), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_FP), + BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_0), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_ARG2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 1, 9 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R9 pointer -= pointer prohibited", + }, { "bounds check based on zero-extended MOV", .insns = { @@ -9145,6 +9881,36 @@ static struct bpf_test tests[] = { .errstr = "R0 unbounded memory access", .result = REJECT }, + { + "bounds check after 32-bit right shift with 64-bit input", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 
0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + /* r1 = 2 */ + BPF_MOV64_IMM(BPF_REG_1, 2), + /* r1 = 1<<32 */ + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 31), + /* r1 = 0 (NOT 2!) */ + BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 31), + /* r1 = 0xffff'fffe (NOT 0!) */ + BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 2), + /* computes OOB pointer */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + /* OOB access */ + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + /* exit */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr = "R0 invalid mem access", + .result = REJECT, + }, { "bounds check map access with off+size signed 32bit overflow. test1", .insns = { @@ -9185,6 +9951,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "pointer offset 1073741822", + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .result = REJECT }, { @@ -9206,6 +9973,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "pointer offset -1073741822", + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .result = REJECT }, { @@ -9377,6 +10145,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN() }, .errstr = "fp pointer offset 1073741822", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", .result = REJECT }, { @@ -13718,6 +14487,328 @@ static struct bpf_test tests[] = { .result_unpriv = ACCEPT, .insn_processed = 15, }, + { + "masking, test out of bounds 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 5), + BPF_MOV32_IMM(BPF_REG_2, 5 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 3", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 4", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 5", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + 
BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 6", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_MOV32_IMM(BPF_REG_2, 5 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 9", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 10", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 11", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 12", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test in bounds 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 4), + BPF_MOV32_IMM(BPF_REG_2, 5 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, 
BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 4, + }, + { + "masking, test in bounds 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test in bounds 3", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xfffffffe), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0xfffffffe, + }, + { + "masking, test in bounds 4", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xabcde), + BPF_MOV32_IMM(BPF_REG_2, 0xabcdef - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0xabcde, + }, + { + "masking, test in bounds 5", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test in bounds 6", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 46), + BPF_MOV32_IMM(BPF_REG_2, 47 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 46, + }, + { + "masking, test in bounds 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, -46), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, -1), + BPF_MOV32_IMM(BPF_REG_2, 47 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_3), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_3, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 46, + }, + { + "masking, test in bounds 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, -47), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, -1), + BPF_MOV32_IMM(BPF_REG_2, 47 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_3), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_3, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, { "reference tracking in call: free reference in subprog 
and outside", .insns = { @@ -14413,6 +15504,16 @@ static int create_map(uint32_t type, uint32_t size_key, return fd; } +static void update_map(int fd, int index) +{ + struct test_val value = { + .index = (6 + 1) * sizeof(int), + .foo[6] = 0xabcdef12, + }; + + assert(!bpf_map_update_elem(fd, &index, &value, 0)); +} + static int create_prog_dummy1(enum bpf_prog_type prog_type) { struct bpf_insn prog[] = { @@ -14564,6 +15665,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, if (*fixup_map_array_48b) { map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(struct test_val), 1); + update_map(map_fds[3], 0); do { prog[*fixup_map_array_48b].imm = map_fds[3]; fixup_map_array_48b++; From 73155879b3c1ac3ace35208a54a3a160ec520bef Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 2 Jan 2019 18:26:13 -0800 Subject: [PATCH 56/57] ipv6: Fix dump of specific table with strict checking Dump of a specific table with strict checking enabled is looping. The problem is that the end of the table dump is not marked in the cb. When dumping a specific table, cb args 0 and 1 are not used (they are the hash index and entry with an hash table index when dumping all tables). Re-use args[0] to hold a 'done' flag for the specific table dump. Fixes: 13e38901d46ca ("net/ipv6: Plumb support for filtering route dumps") Reported-by: Jakub Kicinski Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ae3786132c23..6613d8dbb0e5 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -627,7 +627,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } - res = fib6_dump_table(tb, skb, cb); + if (!cb->args[0]) { + res = fib6_dump_table(tb, skb, cb); + if (!res) + cb->args[0] = 1; + } goto out; } From c5ee066333ebc322a24a00a743ed941a0c68617e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 2 Jan 2019 18:57:09 -0800 Subject: [PATCH 57/57] ipv6: Consider sk_bound_dev_if when binding a socket to an address IPv6 does not consider if the socket is bound to a device when binding to an address. The result is that a socket can be bound to eth0 and then bound to the address of eth1. If the device is a VRF, the result is that a socket can only be bound to an address in the default VRF. Resolve by considering the device if sk_bound_dev_if is set. This problem exists from the beginning of git history. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f0cd291034f0..0bfb6cc0a30a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -350,6 +350,9 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EINVAL; goto out_unlock; } + } + + if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV;