From dc4501ff287547dea7ca10f1c580c741291a8760 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 23 Dec 2018 21:45:56 -0800 Subject: [PATCH 01/57] tipc: fix a double free in tipc_enable_bearer() bearer_disable() already calls kfree_rcu() to free struct tipc_bearer, we don't need to call kfree() again. Fixes: cb30a63384bc ("tipc: refactor function tipc_enable_bearer()") Reported-by: syzbot+b981acf1fb240c0c128b@syzkaller.appspotmail.com Cc: Ying Xue Cc: Jon Maloy Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/bearer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index fb2c0d8f359f..d27f30a9a01d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -319,7 +319,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); - kfree(b); errstr = "failed to create discoverer"; goto rejected; } From f0fb9b288d0a7e9cc324ae362e2dfd2cc2217ded Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 24 Dec 2018 10:30:17 -0600 Subject: [PATCH 02/57] ipv6/route: Add a missing check on proc_dointvec While flushing the cache via ipv6_sysctl_rtcache_flush(), the call to proc_dointvec() may fail. The fix adds a check that returns the error, on failure. Signed-off-by: Aditya Pakki Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 194bc162866d..a94e0b02a8ac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5054,12 +5054,16 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, { struct net *net; int delay; + int ret; if (!write) return -EINVAL; net = (struct net *)ctl->extra1; delay = net->ipv6.sysctl.flush_delay; - proc_dointvec(ctl, write, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (ret) + return ret; + fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } From ca19fcb6285bfce1601c073bf4b9d2942e2df8d9 Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 24 Dec 2018 15:21:21 -0600 Subject: [PATCH 03/57] net: chelsio: Add a missing check on cudg_get_buffer cudbg_collect_hw_sched() could fail when the function cudg_get_buffer() returns an error. The fix adds a check to the latter function returning error on failure Signed-off-by: Aditya Pakki Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c index 7c49681407ad..127b1f624413 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -1229,6 +1229,10 @@ int cudbg_collect_hw_sched(struct cudbg_init *pdbg_init, rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(struct cudbg_hw_sched), &temp_buff); + + if (rc) + return rc; + hw_sched_buff = (struct cudbg_hw_sched *)temp_buff.data; hw_sched_buff->map = t4_read_reg(padap, TP_TX_MOD_QUEUE_REQ_MAP_A); hw_sched_buff->mode = TIMERMODE_G(t4_read_reg(padap, TP_MOD_CONFIG_A)); From 26fd962bde0b15e54234fe762d86bc0349df1de4 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 01:56:14 -0600 Subject: [PATCH 04/57] niu: fix missing checks of niu_pci_eeprom_read niu_pci_eeprom_read() may fail, so we should check its return value before using the read data. Signed-off-by: Kangjie Lu Acked-by: Shannon Nelson Signed-off-by: David S. 
Miller --- drivers/net/ethernet/sun/niu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index 9319d84bf49f..d84501441edd 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -8100,6 +8100,8 @@ static int niu_pci_vpd_scan_props(struct niu *np, u32 start, u32 end) start += 3; prop_len = niu_pci_eeprom_read(np, start + 4); + if (prop_len < 0) + return prop_len; err = niu_pci_vpd_get_propname(np, start + 5, namebuf, 64); if (err < 0) return err; @@ -8144,8 +8146,12 @@ static int niu_pci_vpd_scan_props(struct niu *np, u32 start, u32 end) netif_printk(np, probe, KERN_DEBUG, np->dev, "VPD_SCAN: Reading in property [%s] len[%d]\n", namebuf, prop_len); - for (i = 0; i < prop_len; i++) - *prop_buf++ = niu_pci_eeprom_read(np, off + i); + for (i = 0; i < prop_len; i++) { + err = niu_pci_eeprom_read(np, off + i); + if (err >= 0) + *prop_buf = err; + ++prop_buf; + } } start += len; From 2d822f2dbab7f4c820f72eb8570aacf3f35855bd Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 20:55:37 -0600 Subject: [PATCH 05/57] net: (cpts) fix a missing check of clk_prepare clk_prepare() could fail, so let's check its status, and if it fails, return its error code upstream. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 054f78295d1d..2a9ba4acd7fa 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -590,7 +590,9 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, return ERR_CAST(cpts->refclk); } - clk_prepare(cpts->refclk); + ret = clk_prepare(cpts->refclk); + if (ret) + return ERR_PTR(ret); cpts->cc.read = cpts_systim_read; cpts->cc.mask = CLOCKSOURCE_MASK(32); From f86a3b83833e7cfe558ca4d70b64ebc48903efec Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 20:57:14 -0600 Subject: [PATCH 06/57] net: stmicro: fix a missing check of clk_prepare clk_prepare() could fail, so let's check its status, and if it fails, return its error code upstream. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index d07520fb969e..62ccbd47c1db 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -59,7 +59,9 @@ static int sun7i_gmac_init(struct platform_device *pdev, void *priv) gmac->clk_enabled = 1; } else { clk_set_rate(gmac->tx_clk, SUN7I_GMAC_MII_RATE); - clk_prepare(gmac->tx_clk); + ret = clk_prepare(gmac->tx_clk); + if (ret) + return ret; } return 0; From e49505f7255be8ced695919c08a29bf2c3d79616 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 22:08:18 -0600 Subject: [PATCH 07/57] net: dsa: bcm_sf2: Propagate error value from mdio_write Both bcm_sf2_sw_indir_rw and mdiobus_write_nested could fail, so let's return their error codes upstream. Signed-off-by: Kangjie Lu Signed-off-by: David S. 
Miller --- drivers/net/dsa/bcm_sf2.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index aa4a1f5206f1..361fbde76654 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -303,11 +303,10 @@ static int bcm_sf2_sw_mdio_write(struct mii_bus *bus, int addr, int regnum, * send them to our master MDIO bus controller */ if (addr == BRCM_PSEUDO_PHY_ADDR && priv->indir_phy_mask & BIT(addr)) - bcm_sf2_sw_indir_rw(priv, 0, addr, regnum, val); + return bcm_sf2_sw_indir_rw(priv, 0, addr, regnum, val); else - mdiobus_write_nested(priv->master_mii_bus, addr, regnum, val); - - return 0; + return mdiobus_write_nested(priv->master_mii_bus, addr, + regnum, val); } static irqreturn_t bcm_sf2_switch_0_isr(int irq, void *dev_id) From ff07d48d7bc0974d4f96a85a4df14564fb09f1ef Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Tue, 25 Dec 2018 22:23:19 -0600 Subject: [PATCH 08/57] atl1e: checking the status of atl1e_write_phy_reg atl1e_write_phy_reg() could fail. The fix issues an error message when it fails. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 9dc6da039a6d..3164aad29bcf 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -473,7 +473,9 @@ static void atl1e_mdio_write(struct net_device *netdev, int phy_id, { struct atl1e_adapter *adapter = netdev_priv(netdev); - atl1e_write_phy_reg(&adapter->hw, reg_num & MDIO_REG_ADDR_MASK, val); + if (atl1e_write_phy_reg(&adapter->hw, + reg_num & MDIO_REG_ADDR_MASK, val)) + netdev_err(netdev, "write phy register failed\n"); } static int atl1e_mii_ioctl(struct net_device *netdev, From 46273cf7e009231d2b6bc10a926e82b8928a9fb2 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Wed, 26 Dec 2018 00:09:04 -0600 Subject: [PATCH 09/57] tipc: fix a missing check of genlmsg_put genlmsg_put could fail. The fix inserts a check of its return value, and if it fails, returns -EMSGSIZE. Signed-off-by: Kangjie Lu Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 21f6ccc89401..40f5cae623a7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -904,6 +904,8 @@ static int tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, u32 sock) hdr = genlmsg_put(args, 0, 0, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); + if (!hdr) + return -EMSGSIZE; nest = nla_nest_start(args, TIPC_NLA_SOCK); if (!nest) { From 92ee77d148bf06d8c52664be4d1b862583fd5c0e Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Wed, 26 Dec 2018 00:31:08 -0600 Subject: [PATCH 10/57] net: marvell: fix a missing check of acpi_match_device When acpi_match_device fails, its return value is NULL. Directly using the return value without a check may result in a NULL-pointer dereference. The fix checks if acpi_match_device fails, and if so, returns -EINVAL. Signed-off-by: Kangjie Lu Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index f1dab0b55769..c9444346475b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -5255,6 +5255,8 @@ static int mvpp2_probe(struct platform_device *pdev) if (has_acpi_companion(&pdev->dev)) { acpi_id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev); + if (!acpi_id) + return -EINVAL; priv->hw_version = (unsigned long)acpi_id->driver_data; } else { priv->hw_version = From 40752b3eae29f8ca2378e978a02bd6dbeeb06d16 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 26 Dec 2018 16:28:30 +0800 Subject: [PATCH 11/57] net/wan/fsl_ucc_hdlc: Avoid double free in ucc_hdlc_probe() This patch fixes potential double frees if register_hdlc_device() fails. Signed-off-by: Wen Yang Reviewed-by: Peng Hao CC: Zhao Qiang CC: "David S. Miller" CC: netdev@vger.kernel.org CC: linuxppc-dev@lists.ozlabs.org CC: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/wan/fsl_ucc_hdlc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 7a42336c8af8..839fa7715709 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1180,7 +1180,6 @@ static int ucc_hdlc_probe(struct platform_device *pdev) if (register_hdlc_device(dev)) { ret = -ENOBUFS; pr_err("ucc_hdlc: unable to register hdlc device\n"); - free_netdev(dev); goto free_dev; } From d8de01b763e0d8b3b418d3606d26f203983b6637 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 26 Dec 2018 06:35:23 -0600 Subject: [PATCH 12/57] phy.h: fix obvious errors in doc and kerneldoc content 1) note that gianfar_phy.c was removed years ago 2) fix obvious copy and paste error in regular doc 3) change regular doc into kerneldoc for phy_modes() Signed-off-by: Robert P. J. Day Signed-off-by: David S. Miller --- include/linux/phy.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/phy.h b/include/linux/phy.h index da039f211c22..3b051f761450 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1,6 +1,6 @@ /* * Framework and drivers for configuring and reading different PHYs - * Based on code in sungem_phy.c and gianfar_phy.c + * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * @@ -110,9 +110,9 @@ typedef enum { * @speeds: buffer to store supported speeds in. * @size: size of speeds buffer. * - * Description: Returns the number of supported speeds, and - * fills the speeds * buffer with the supported speeds. If speeds buffer is - * too small to contain * all currently supported speeds, will return as + * Description: Returns the number of supported speeds, and fills + * the speeds buffer with the supported speeds. If speeds buffer is + * too small to contain all currently supported speeds, will return as * many speeds as can fit. 
*/ unsigned int phy_supported_speeds(struct phy_device *phy, @@ -120,7 +120,10 @@ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int size); /** - * It maps 'enum phy_interface_t' found in include/linux/phy.h + * phy_modes - map phy_interface_t enum to device tree binding of phy-mode + * @interface: enum phy_interface_t value + * + * Description: maps 'enum phy_interface_t' defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get phy interface from device tree. */ From a3c9311f62b4943228ae90f769775dd3bcbfa7c0 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Thu, 27 Dec 2018 16:10:59 -0500 Subject: [PATCH 13/57] include/linux/phy/phy.h: fix minor kerneldoc errors Correct two minor kerneldoc errors: 1) missing reference to @mode in struct phy_ops 2) obsolete reference to @init_data in struct_phy_attrs, removed in dbc98635e0d42f0e62ea92813df1e0e4c90f8375 Signed-off-by: Robert P. J. Day Signed-off-by: David S. Miller --- include/linux/phy/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 03b319f89a34..66d1560f1a26 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -69,6 +69,7 @@ struct phy_ops { /** * struct phy_attrs - represents phy attributes * @bus_width: Data path width implemented by PHY + * @mode: PHY mode */ struct phy_attrs { u32 bus_width; @@ -80,7 +81,6 @@ struct phy_attrs { * @dev: phy device * @id: id of the phy device * @ops: function pointers for performing phy operations - * @init_data: list of PHY consumers (non-dt only) * @mutex: mutex to protect phy_ops * @init_count: used to protect when the PHY is used by multiple consumers * @power_count: used to protect when the PHY is used by multiple consumers From eb8950861c1bfd3eecc8f6faad213e3bca0dc395 Mon Sep 17 00:00:00 2001 From: Kangjie Lu Date: Fri, 21 Dec 2018 00:46:23 -0600 Subject: [PATCH 14/57] netfilter: nf_tables: fix a missing check of nla_put_failure If nla_nest_start() may fail. The fix checks its return value and goes to nla_put_failure if it fails. Signed-off-by: Kangjie Lu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fec814dace5a..2b0a93300dd7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5727,6 +5727,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK); + if (!nest) + goto nla_put_failure; if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) || nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority))) goto nla_put_failure; From c78e7818f16f687389174c4569243abbec8dc68f Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Fri, 28 Dec 2018 01:24:42 +0100 Subject: [PATCH 15/57] netfilter: nf_conncount: replace CONNCOUNT_LOCK_SLOTS with CONNCOUNT_SLOTS Most of the time these were the same value anyway, but when CONFIG_LOCKDEP was enabled we would use a smaller number of locks to reduce overhead. Unfortunately having two values is confusing and not worth the complexity. This fixes a bug where tree_gc_worker() would only GC up to CONNCOUNT_LOCK_SLOTS trees which meant when CONFIG_LOCKDEP was enabled not all trees would be GCed by tree_gc_worker(). 
Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Signed-off-by: Florian Westphal Signed-off-by: Shawn Bohrer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 9cd180bda092..3271a4e00500 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -33,12 +33,6 @@ #define CONNCOUNT_SLOTS 256U -#ifdef CONFIG_LOCKDEP -#define CONNCOUNT_LOCK_SLOTS 8U -#else -#define CONNCOUNT_LOCK_SLOTS 256U -#endif - #define CONNCOUNT_GC_MAX_NODES 8 #define MAX_KEYLEN 5 @@ -60,7 +54,7 @@ struct nf_conncount_rb { struct rcu_head rcu_head; }; -static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; +static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; struct nf_conncount_data { unsigned int keylen; @@ -353,7 +347,7 @@ insert_tree(struct net *net, unsigned int count = 0, gc_count = 0; bool node_found = false; - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + spin_lock_bh(&nf_conncount_locks[hash]); parent = NULL; rbnode = &(root->rb_node); @@ -430,7 +424,7 @@ insert_tree(struct net *net, rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); + spin_unlock_bh(&nf_conncount_locks[hash]); return count; } @@ -499,7 +493,7 @@ static void tree_gc_worker(struct work_struct *work) struct rb_node *node; unsigned int tree, next_tree, gc_count = 0; - tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; + tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; rcu_read_lock(); @@ -621,10 +615,7 @@ static int __init nf_conncount_modinit(void) { int i; - BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS); - BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0); - - for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i) + for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", From 4cd273bb91b3001f623f516ec726c49754571b1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:43 +0100 Subject: [PATCH 16/57] netfilter: nf_conncount: don't skip eviction when age is negative age is signed integer, so result can be negative when the timestamps have a large delta. In this case we want to discard the entry. Instead of using age >= 2 || age < 0, just make it unsigned. 
Fixes: b36e4523d4d56 ("netfilter: nf_conncount: fix garbage collection confirm race") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 3271a4e00500..8bb4ed85c262 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -155,7 +155,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); - __s32 age; + u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) From f7fcc98dfc2d136722007fec0debbed761679b94 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:44 +0100 Subject: [PATCH 17/57] netfilter: nf_conncount: split gc in two phases The lockless workqueue garbage collector can race with packet path garbage collector to delete list nodes, as it calls tree_nodes_free() with the addresses of nodes that might have been free'd already from another cpu. To fix this, split gc into two phases. One phase to perform gc on the connections: From a locking perspective, this is the same as count_tree(): we hold rcu lock, but we do not change the tree, we only change the nodes' contents. The second phase acquires the tree lock and reaps empty nodes. This avoids a race condition of the garbage collection vs. packet path: If a node has been free'd already, the second phase won't find it anymore. This second phase is, from locking perspective, same as insert_tree(). The former only modifies nodes (list content, count), latter modifies the tree itself (rb_erase or rb_insert). Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 8bb4ed85c262..753132e4afa8 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -500,16 +500,32 @@ static void tree_gc_worker(struct work_struct *work) for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); if (nf_conncount_gc_list(data->net, &rbconn->list)) - gc_nodes[gc_count++] = rbconn; + gc_count++; } rcu_read_unlock(); spin_lock_bh(&nf_conncount_locks[tree]); + if (gc_count < ARRAY_SIZE(gc_nodes)) + goto next; /* do not bother */ - if (gc_count) { - tree_nodes_free(root, gc_nodes, gc_count); + gc_count = 0; + node = rb_first(root); + while (node != NULL) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + node = rb_next(node); + + if (rbconn->list.count > 0) + continue; + + gc_nodes[gc_count++] = rbconn; + if (gc_count >= ARRAY_SIZE(gc_nodes)) { + tree_nodes_free(root, gc_nodes, gc_count); + gc_count = 0; + } } + tree_nodes_free(root, gc_nodes, gc_count); +next: clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; From e8cfb372b38a1b8979aa7f7631fb5e7b11c3793c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:45 +0100 Subject: [PATCH 18/57] netfilter: nf_conncount: restart search when nodes have been erased Shawn Bohrer reported a following crash: |RIP: 0010:rb_erase+0xae/0x360 [..] 
Call Trace: nf_conncount_destroy+0x59/0xc0 [nf_conncount] cleanup_match+0x45/0x70 [ip_tables] ... Shawn tracked this down to bogus 'parent' pointer: Problem is that when we insert a new node, then there is a chance that the 'parent' that we found was also passed to tree_nodes_free() (because that node was empty) for erase+free. Instead of trying to be clever and detect when this happens, restart the search if we have evicted one or more nodes. To prevent frequent restarts, do not perform gc on the second round. Also, unconditionally schedule the gc worker. The condition gc_count > ARRAY_SIZE(gc_nodes)) cannot be true unless tree grows very large, as the height of the tree will be low even with hundreds of nodes present. Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reported-by: Shawn Bohrer Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 753132e4afa8..0a83c694a8f1 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -346,9 +346,10 @@ insert_tree(struct net *net, struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; bool node_found = false; + bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); - +restart: parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { @@ -381,21 +382,16 @@ insert_tree(struct net *net, if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; - if (nf_conncount_gc_list(net, &rbconn->list)) + if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). - */ - - if (gc_count >= ARRAY_SIZE(gc_nodes)) - schedule_gc_worker(data, hash); + schedule_gc_worker(data, hash); + gc_count = 0; + do_gc = false; + goto restart; } if (node_found) From df4a902509766897f7371fdfa4c3bf8bc321b55d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:46 +0100 Subject: [PATCH 19/57] netfilter: nf_conncount: merge lookup and add functions 'lookup' is always followed by 'add'. Merge both and make the list-walk part of nf_conncount_add(). This also avoids one unneeded unlock/re-lock pair. Extra care needs to be taken in count_tree, as we only hold rcu read lock, i.e. we can only insert to an existing tree node after acquiring its lock and making sure it has a nonzero count. As a zero count should be rare, just fall back to insert_tree() (which acquires tree lock). This issue and its solution were pointed out by Shawn Bohrer during patch review. 
Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 18 +-- net/netfilter/nf_conncount.c | 146 ++++++++++----------- net/netfilter/nft_connlimit.c | 14 +- 3 files changed, 72 insertions(+), 106 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 4b2b2baf8ab4..aa66775c15f4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -5,12 +5,6 @@ struct nf_conncount_data; -enum nf_conncount_list_add { - NF_CONNCOUNT_ADDED, /* list add was ok */ - NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ - NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ -}; - struct nf_conncount_list { spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ @@ -29,18 +23,12 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit); +int nf_conncount_add(struct net *net, struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); void nf_conncount_list_init(struct nf_conncount_list *list); -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 0a83c694a8f1..ce7f7d1212a6 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -83,38 +83,6 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -enum nf_conncount_list_add -nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - struct nf_conncount_tuple *conn; - - if (WARN_ON_ONCE(list->count > INT_MAX)) - return NF_CONNCOUNT_ERR; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return NF_CONNCOUNT_ERR; - - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - conn->dead = false; - spin_lock_bh(&list->list_lock); - if (list->dead == true) { - kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_SKIP; - } - list_add_tail(&conn->node, &list->head); - list->count++; - spin_unlock_bh(&list->list_lock); - return NF_CONNCOUNT_ADDED; -} -EXPORT_SYMBOL_GPL(nf_conncount_add); - static void __conn_free(struct rcu_head *h) { struct nf_conncount_tuple *conn; @@ -177,11 +145,10 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } -void nf_conncount_lookup(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +static int __nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -189,9 +156,6 @@ void nf_conncount_lookup(struct net *net, unsigned int collect = 0; bool 
free_entry = false; - /* best effort only */ - *addit = tuple ? true : false; - /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) @@ -201,21 +165,19 @@ void nf_conncount_lookup(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (!tuple) - continue; - if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - *addit = false; - } else if (PTR_ERR(found) == -ENOENT) + return 0; /* already exists */ + } else { collect++; + } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); - if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -223,7 +185,8 @@ void nf_conncount_lookup(struct net *net, * * Attempt to avoid a re-add in this case. */ - *addit = false; + nf_ct_put(found_ct); + return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -237,8 +200,38 @@ void nf_conncount_lookup(struct net *net, nf_ct_put(found_ct); } + + if (WARN_ON_ONCE(list->count > INT_MAX)) + return -EOVERFLOW; + + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) + return -ENOMEM; + + conn->tuple = *tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + list_add_tail(&conn->node, &list->head); + list->count++; + return 0; } -EXPORT_SYMBOL_GPL(nf_conncount_lookup); + +int nf_conncount_add(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) +{ + int ret; + + /* check the saved connections */ + spin_lock_bh(&list->list_lock); + ret = __nf_conncount_add(net, list, tuple, zone); + spin_unlock_bh(&list->list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -339,13 +332,11 @@ insert_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; - bool node_found = false; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -363,20 +354,15 @@ insert_tree(struct net *net, } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { - /* unlikely: other cpu added node already */ - node_found = true; - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { + int ret; + + ret = nf_conncount_add(net, &rbconn->list, tuple, zone); + if (ret) count = 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { + else count = rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ - node_found = false; - } - break; + tree_nodes_free(root, gc_nodes, gc_count); + goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) @@ -394,9 +380,6 @@ insert_tree(struct net *net, goto restart; } - if (node_found) - goto out_unlock; - /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -431,7 +414,6 @@ count_tree(struct net *net, const struct 
nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - enum nf_conncount_list_add ret; struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; @@ -444,7 +426,6 @@ count_tree(struct net *net, parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; - bool addit; rbconn = rb_entry(parent, struct nf_conncount_rb, node); @@ -454,24 +435,29 @@ count_tree(struct net *net, } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { - /* same source network -> be counted! */ - nf_conncount_lookup(net, &rbconn->list, tuple, zone, - &addit); + int ret; - if (!addit) + if (!tuple) { + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } - ret = nf_conncount_add(&rbconn->list, tuple, zone); - if (ret == NF_CONNCOUNT_ERR) { - return 0; /* hotdrop */ - } else if (ret == NF_CONNCOUNT_ADDED) { - return rbconn->list.count; - } else { - /* NF_CONNCOUNT_SKIP, rbconn is already - * reclaimed by gc, insert a new tree node - */ + spin_lock_bh(&rbconn->list.list_lock); + /* Node might be about to be free'd. + * We need to defer to insert_tree() in this case. + */ + if (rbconn->list.count == 0) { + spin_unlock_bh(&rbconn->list.list_lock); break; } + + /* same source network -> be counted! */ + ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + spin_unlock_bh(&rbconn->list.list_lock); + if (ret) + return 0; /* hotdrop */ + else + return rbconn->list.count; } } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index b90d96ba4a12..af1497ab9464 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -30,7 +30,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int count; - bool addit; tuple_ptr = &tuple; @@ -44,19 +43,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, - &addit); - count = priv->list.count; - - if (!addit) - goto out; - - if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { + if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; return; } - count++; -out: + + count = priv->list.count; if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; From 2f971a8f425545da52ca0e6bee81f5b1ea0ccc5f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Dec 2018 01:24:47 +0100 Subject: [PATCH 20/57] netfilter: nf_conncount: move all list iterations under spinlock Two CPUs may race to remove a connection from the list, the existing conn->dead will result in a use-after-free. Use the per-list spinlock to protect list iterations. As all accesses to the list now happen while holding the per-list lock, we no longer need to delay free operations with rcu. Joint work with Florian. 
Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 46 ++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index ce7f7d1212a6..d0fd195b19a8 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -43,8 +43,6 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; - bool dead; - struct rcu_head rcu_head; }; struct nf_conncount_rb { @@ -83,36 +81,21 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -static void __conn_free(struct rcu_head *h) -{ - struct nf_conncount_tuple *conn; - - conn = container_of(h, struct nf_conncount_tuple, rcu_head); - kmem_cache_free(conncount_conn_cachep, conn); -} - static bool conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { bool free_entry = false; - spin_lock_bh(&list->list_lock); - - if (conn->dead) { - spin_unlock_bh(&list->list_lock); - return free_entry; - } + lockdep_assert_held(&list->list_lock); list->count--; - conn->dead = true; - list_del_rcu(&conn->node); + list_del(&conn->node); if (list->count == 0) { list->dead = true; free_entry = true; } - spin_unlock_bh(&list->list_lock); - call_rcu(&conn->rcu_head, __conn_free); + kmem_cache_free(conncount_conn_cachep, conn); return free_entry; } @@ -242,7 +225,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list) } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -/* Return true if the list is empty */ +/* Return true if the list is empty. Must be called with BH disabled. 
*/ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { @@ -253,12 +236,18 @@ bool nf_conncount_gc_list(struct net *net, bool free_entry = false; bool ret = false; + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock(&list->list_lock)) + return false; + list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn, &free_entry); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) { - if (free_entry) + if (free_entry) { + spin_unlock(&list->list_lock); return true; + } collected++; } continue; @@ -271,23 +260,24 @@ bool nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - if (conn_free(list, conn)) + if (conn_free(list, conn)) { + spin_unlock(&list->list_lock); return true; + } collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) - return false; + break; } - spin_lock_bh(&list->list_lock); if (!list->count) { list->dead = true; ret = true; } - spin_unlock_bh(&list->list_lock); + spin_unlock(&list->list_lock); return ret; } @@ -478,6 +468,7 @@ static void tree_gc_worker(struct work_struct *work) tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; + local_bh_disable(); rcu_read_lock(); for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); @@ -485,6 +476,9 @@ static void tree_gc_worker(struct work_struct *work) gc_count++; } rcu_read_unlock(); + local_bh_enable(); + + cond_resched(); spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count < ARRAY_SIZE(gc_nodes)) From c80f10bc973af2ace6b1414724eeff61eaa71837 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Dec 2018 01:24:48 +0100 Subject: [PATCH 21/57] netfilter: nf_conncount: speculative garbage collection on empty lists Instead of removing a empty list node that might be reintroduced soon thereafter, tentatively place the empty list node on the list passed to tree_nodes_free(), then re-check if the list is empty again before erasing it from the tree. 
[ Florian: rebase on top of pending nf_conncount fixes ] Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 1 - net/netfilter/nf_conncount.c | 47 +++++++--------------- 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index aa66775c15f4..f32fc8289473 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -9,7 +9,6 @@ struct nf_conncount_list { spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ - bool dead; }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index d0fd195b19a8..f0b05dfebc6e 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -81,27 +81,20 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -static bool conn_free(struct nf_conncount_list *list, +static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - bool free_entry = false; - lockdep_assert_held(&list->list_lock); list->count--; list_del(&conn->node); - if (list->count == 0) { - list->dead = true; - free_entry = true; - } kmem_cache_free(conncount_conn_cachep, conn); - return free_entry; } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn, bool *free_entry) + struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -121,7 +114,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, */ age = a - b; if (conn->cpu == cpu || age >= 2) { - *free_entry = conn_free(list, conn); + conn_free(list, conn); return ERR_PTR(-ENOENT); } @@ -137,14 +130,13 @@ static int __nf_conncount_add(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; - bool free_entry = false; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -221,7 +213,6 @@ void nf_conncount_list_init(struct nf_conncount_list *list) spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; - list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -233,7 +224,6 @@ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; - bool free_entry = false; bool ret = false; /* don't bother if other cpu is already doing GC */ @@ -241,15 +231,10 @@ bool nf_conncount_gc_list(struct net *net, return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn, &free_entry); + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) { - if (free_entry) { - spin_unlock(&list->list_lock); - return true; - } + if (PTR_ERR(found) == -ENOENT) collected++; - } 
continue; } @@ -260,10 +245,7 @@ bool nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - if (conn_free(list, conn)) { - spin_unlock(&list->list_lock); - return true; - } + conn_free(list, conn); collected++; continue; } @@ -273,10 +255,8 @@ bool nf_conncount_gc_list(struct net *net, break; } - if (!list->count) { - list->dead = true; + if (!list->count) ret = true; - } spin_unlock(&list->list_lock); return ret; @@ -291,6 +271,7 @@ static void __tree_nodes_free(struct rcu_head *h) kmem_cache_free(conncount_rb_cachep, rbconn); } +/* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -300,8 +281,10 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); + if (!rbconn->list.count) { + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } spin_unlock(&rbconn->list.list_lock); } } @@ -318,7 +301,6 @@ insert_tree(struct net *net, struct rb_root *root, unsigned int hash, const u32 *key, - u8 keylen, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -327,6 +309,7 @@ insert_tree(struct net *net, struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; + u8 keylen = data->keylen; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); @@ -454,7 +437,7 @@ count_tree(struct net *net, if (!tuple) return 0; - return insert_tree(net, data, root, hash, key, keylen, tuple, zone); + return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) From a007232066f6839d6f256bab21e825d968f1a163 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Dec 2018 01:24:49 +0100 Subject: [PATCH 22/57] netfilter: nf_conncount: fix argument order to find_next_bit Size and 'next bit' were swapped, this bug could cause worker to reschedule itself even if system was idle. Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") Reviewed-by: Shawn Bohrer Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index f0b05dfebc6e..7554c56b2e63 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -488,7 +488,7 @@ static void tree_gc_worker(struct work_struct *work) clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; - next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); + next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); if (next_tree < CONNCOUNT_SLOTS) { data->gc_tree = next_tree; From f9fc54d313fab2834f44f516459cdc8ac91d797f Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 26 Dec 2018 19:51:46 +0800 Subject: [PATCH 23/57] ethtool: check the return value of get_regs_len The return type for get_regs_len in struct ethtool_ops is int, the hns3 driver may return error when failing to get the regs len by sending cmd to firmware. Signed-off-by: Yunsheng Lin Signed-off-by: David S. 
Miller --- net/core/ethtool.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d05402868575..158264f7cfaf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -793,8 +793,13 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops->get_regs_len) - info.regdump_len = ops->get_regs_len(dev); + if (ops->get_regs_len) { + int ret = ops->get_regs_len(dev); + + if (ret > 0) + info.regdump_len = ret; + } + if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); @@ -1337,6 +1342,9 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) return -EFAULT; reglen = ops->get_regs_len(dev); + if (reglen <= 0) + return reglen; + if (regs.len > reglen) regs.len = reglen; From 7418e6520f22a2e35815122fa5a53d5bbfa2c10f Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 26 Dec 2018 22:09:34 +0800 Subject: [PATCH 24/57] isdn: hisax: hfc_pci: Fix a possible concurrency use-after-free bug in HFCPCI_l1hw() In drivers/isdn/hisax/hfc_pci.c, the functions hfcpci_interrupt() and HFCPCI_l1hw() may be concurrently executed. HFCPCI_l1hw() line 1173: if (!cs->tx_skb) hfcpci_interrupt() line 942: spin_lock_irqsave(); line 1066: dev_kfree_skb_irq(cs->tx_skb); Thus, a possible concurrency use-after-free bug may occur in HFCPCI_l1hw(). To fix these bugs, the calls to spin_lock_irqsave() and spin_unlock_irqrestore() are added in HFCPCI_l1hw(), to protect the access to cs->tx_skb. Signed-off-by: Jia-Ju Bai Signed-off-by: David S. Miller --- drivers/isdn/hisax/hfc_pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c index 5b719b561860..81dd465afcf4 100644 --- a/drivers/isdn/hisax/hfc_pci.c +++ b/drivers/isdn/hisax/hfc_pci.c @@ -1169,11 +1169,13 @@ HFCPCI_l1hw(struct PStack *st, int pr, void *arg) if (cs->debug & L1_DEB_LAPD) debugl1(cs, "-> PH_REQUEST_PULL"); #endif + spin_lock_irqsave(&cs->lock, flags); if (!cs->tx_skb) { test_and_clear_bit(FLG_L1_PULL_REQ, &st->l1.Flags); st->l1.l1l2(st, PH_PULL | CONFIRM, NULL); } else test_and_set_bit(FLG_L1_PULL_REQ, &st->l1.Flags); + spin_unlock_irqrestore(&cs->lock, flags); break; case (HW_RESET | REQUEST): spin_lock_irqsave(&cs->lock, flags); From 0d9c9a238faf925823bde866182c663b6d734f2e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 27 Dec 2018 18:29:09 -0600 Subject: [PATCH 25/57] fsl/fman: Use GFP_ATOMIC in {memac,tgec}_add_hash_mac_address() These functions are called from atomic context: [ 9.150239] BUG: sleeping function called from invalid context at /home/scott/git/linux/mm/slab.h:421 [ 9.158159] in_atomic(): 1, irqs_disabled(): 0, pid: 4432, name: ip [ 9.163128] CPU: 8 PID: 4432 Comm: ip Not tainted 4.20.0-rc2-00169-g63d86876f324 #29 [ 9.163130] Call Trace: [ 9.170701] [c0000002e899a980] [c0000000009c1068] .dump_stack+0xa8/0xec (unreliable) [ 9.177140] [c0000002e899aa10] [c00000000007a7b4] .___might_sleep+0x138/0x164 [ 9.184440] [c0000002e899aa80] [c0000000001d5bac] .kmem_cache_alloc_trace+0x238/0x30c [ 9.191216] [c0000002e899ab40] [c00000000065ea1c] .memac_add_hash_mac_address+0x104/0x198 [ 9.199464] [c0000002e899abd0] [c00000000065a788] .set_multi+0x1c8/0x218 [ 9.206242] [c0000002e899ac80] [c0000000006615ec] .dpaa_set_rx_mode+0xdc/0x17c [ 9.213544] [c0000002e899ad00] [c00000000083d2b0] .__dev_set_rx_mode+0x80/0xd4 [ 9.219535] [c0000002e899ad90] [c00000000083d334] .dev_set_rx_mode+0x30/0x54 [ 9.225271] [c0000002e899ae10] 
[c00000000083d4a0] .__dev_open+0x148/0x1c8 [ 9.230751] [c0000002e899aeb0] [c00000000083d934] .__dev_change_flags+0x19c/0x1e0 [ 9.230755] [c0000002e899af60] [c00000000083d9a4] .dev_change_flags+0x2c/0x80 [ 9.242752] [c0000002e899aff0] [c0000000008554ec] .do_setlink+0x350/0xf08 [ 9.248228] [c0000002e899b170] [c000000000857ad0] .rtnl_newlink+0x588/0x7e0 [ 9.253965] [c0000002e899b740] [c000000000852424] .rtnetlink_rcv_msg+0x3e0/0x498 [ 9.261440] [c0000002e899b820] [c000000000884790] .netlink_rcv_skb+0x134/0x14c [ 9.267607] [c0000002e899b8e0] [c000000000851840] .rtnetlink_rcv+0x18/0x2c [ 9.274558] [c0000002e899b950] [c000000000883c8c] .netlink_unicast+0x214/0x318 [ 9.281163] [c0000002e899ba00] [c000000000884220] .netlink_sendmsg+0x348/0x444 [ 9.287076] [c0000002e899bae0] [c00000000080d13c] .sock_sendmsg+0x2c/0x54 [ 9.287080] [c0000002e899bb50] [c0000000008106c0] .___sys_sendmsg+0x2d0/0x2d8 [ 9.298375] [c0000002e899bd30] [c000000000811a80] .__sys_sendmsg+0x5c/0xb0 [ 9.303939] [c0000002e899be20] [c0000000000006b0] system_call+0x60/0x6c Signed-off-by: Scott Wood Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fman/fman_memac.c | 2 +- drivers/net/ethernet/freescale/fman/fman_tgec.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.c b/drivers/net/ethernet/freescale/fman/fman_memac.c index bc6eb30aa20f..41c6fa200e74 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.c +++ b/drivers/net/ethernet/freescale/fman/fman_memac.c @@ -928,7 +928,7 @@ int memac_add_hash_mac_address(struct fman_mac *memac, enet_addr_t *eth_addr) hash = get_mac_addr_hash_code(addr) & HASH_CTRL_ADDR_MASK; /* Create element to be added to the driver hash table */ - hash_entry = kmalloc(sizeof(*hash_entry), GFP_KERNEL); + hash_entry = kmalloc(sizeof(*hash_entry), GFP_ATOMIC); if (!hash_entry) return -ENOMEM; hash_entry->addr = addr; diff --git a/drivers/net/ethernet/freescale/fman/fman_tgec.c b/drivers/net/ethernet/freescale/fman/fman_tgec.c index 40705938eecc..f75b9c11b2d2 100644 --- a/drivers/net/ethernet/freescale/fman/fman_tgec.c +++ b/drivers/net/ethernet/freescale/fman/fman_tgec.c @@ -553,7 +553,7 @@ int tgec_add_hash_mac_address(struct fman_mac *tgec, enet_addr_t *eth_addr) hash = (crc >> TGEC_HASH_MCAST_SHIFT) & TGEC_HASH_ADR_MSK; /* Create element to be added to the driver hash table */ - hash_entry = kmalloc(sizeof(*hash_entry), GFP_KERNEL); + hash_entry = kmalloc(sizeof(*hash_entry), GFP_ATOMIC); if (!hash_entry) return -ENOMEM; hash_entry->addr = addr; From f989d03ef25df3fc26d3ea0fe7c19c9830577166 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 30 Dec 2018 14:33:20 +0200 Subject: [PATCH 26/57] net: rtnetlink: address is mandatory for rtnl_fdb_get We must have an address to lookup otherwise we'll derefence a null pointer in the ndo_fdb_get callbacks. CC: Roopa Prabhu CC: David Ahern Reported-by: syzbot+017b1f61c82a1c3e7efd@syzkaller.appspotmail.com Fixes: 5b2f94b27622 ("net: rtnetlink: support for fdb get") Signed-off-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 48f61885fd6f..5ea1bed08ede 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4104,6 +4104,11 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (!addr) { + NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); + return -EINVAL; + } + if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { From 178fe94405bffbd1acd83b6ff3b40211185ae9c9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 28 Dec 2018 23:28:21 +0100 Subject: [PATCH 27/57] net/ipv6: Fix a test against 'ipv6_find_idev()' return value 'ipv6_find_idev()' returns NULL on error, not an error pointer. Update the test accordingly and return -ENOBUFS, as already done in 'addrconf_add_dev()', if NULL is returned. Fixes: ("ipv6: allow userspace to add IFA_F_OPTIMISTIC addresses") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 521e471f1cf9..8eeec6eb2bd3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4736,8 +4736,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (!idev) + return -ENOBUFS; if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; From 58075ff523af85002bfeace07304d57c59251605 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 29 Dec 2018 14:45:23 +0800 Subject: [PATCH 28/57] ipv4: fib_rules: Fix possible infinite loop in fib_empty_table gcc warn this: net/ipv4/fib_rules.c:203 fib_empty_table() warn: always true condition '(id <= 4294967295) => (0-u32max <= u32max)' 'id' is u32, which always not greater than RT_TABLE_MAX (0xFFFFFFFF), So add a check to break while wrap around. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f8eb78d042a4..cfec3af54c8d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -198,11 +198,15 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) static struct fib_table *fib_empty_table(struct net *net) { - u32 id; + u32 id = 1; - for (id = 1; id <= RT_TABLE_MAX; id++) + while (1) { if (!fib_get_table(net, id)) return fib_new_table(net, id); + + if (id++ == RT_TABLE_MAX) + break; + } return NULL; } From 7f334a7e1ae113212e39aafba51352ea9ab8c9f9 Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Sat, 29 Dec 2018 14:07:55 -0500 Subject: [PATCH 29/57] ipv6: fix typo in net/ipv6/reassembly.c Signed-off-by: Su Yanjun Signed-off-by: David S. 
Miller --- net/ipv6/reassembly.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a5bb59ee50ac..36a3d8dc61f5 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -210,7 +210,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (next && next->ip_defrag_offset < end) goto discard_fq; - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + /* Note : skb->ip_defrag_offset and skb->sk share the same location */ dev = skb->dev; if (dev) fq->iif = dev->ifindex; From c433570458e49bccea5c551df628d058b3526289 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Dec 2018 13:56:36 -0800 Subject: [PATCH 30/57] ax25: fix a use-after-free in ax25_fillin_cb() There are multiple issues here: 1. After freeing dev->ax25_ptr, we need to set it to NULL otherwise we may use a dangling pointer. 2. There is a race between ax25_setsockopt() and device notifier as reported by syzbot. Close it by holding RTNL lock. 3. We need to test if dev->ax25_ptr is NULL before using it. Reported-and-tested-by: syzbot+ae6bb869cbed29b29040@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 11 +++++++++-- net/ax25/ax25_dev.c | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c603d33d5410..5d01edf8d819 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -653,15 +653,22 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } - dev = dev_get_by_name(&init_net, devname); + rtnl_lock(); + dev = __dev_get_by_name(&init_net, devname); if (!dev) { + rtnl_unlock(); res = -ENODEV; break; } ax25->ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25->ax25_dev) { + rtnl_unlock(); + res = -ENODEV; + break; + } ax25_fillin_cb(ax25, ax25->ax25_dev); - dev_put(dev); + rtnl_unlock(); break; default: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 9a3a301e1e2f..d92195cd7834 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -116,6 +116,7 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; dev_put(dev); kfree(ax25_dev); return; @@ -125,6 +126,7 @@ void ax25_dev_device_down(struct net_device *dev) if (s->next == ax25_dev) { s->next = ax25_dev->next; spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; dev_put(dev); kfree(ax25_dev); return; From d5c7c745f254c6cb98b3b3f15fe789b8bd770c72 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Dec 2018 13:56:37 -0800 Subject: [PATCH 31/57] net/wan: fix a double free in x25_asy_open_tty() When x25_asy_open() fails, it already cleans up by itself, so its caller doesn't need to free the memory again. It seems we still have to call x25_asy_free() to clear the SLF_INUSE bit, so just set these pointers to NULL after kfree(). Reported-and-tested-by: syzbot+5e5e969e525129229052@syzkaller.appspotmail.com Fixes: 3b780bed3138 ("x25_asy: Free x25_asy on x25_asy_open() failure.") Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- drivers/net/wan/x25_asy.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index 1098263ab862..46c3d983b7b7 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -485,8 +485,10 @@ static int x25_asy_open(struct net_device *dev) /* Cleanup */ kfree(sl->xbuff); + sl->xbuff = NULL; noxbuff: kfree(sl->rbuff); + sl->rbuff = NULL; norbuff: return -ENOMEM; } From 7314f5480f3e37e570104dc5e0f28823ef849e72 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 29 Dec 2018 13:56:38 -0800 Subject: [PATCH 32/57] netrom: fix locking in nr_find_socket() nr_find_socket(), nr_find_peer() and nr_find_listener() lock the sock after finding it in the global list. However, the call path requires BH disabled for the sock lock consistently. Actually the locking is unnecessary at this point, we can just hold the sock refcnt to make sure it is not gone after we unlock the global list, and lock it later only when needed. Reported-and-tested-by: syzbot+f621cda8b7e598908efa@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 03f37c4e64fe..1d3144d19903 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -153,7 +153,7 @@ static struct sock *nr_find_listener(ax25_address *addr) sk_for_each(s, &nr_list) if (!ax25cmp(&nr_sk(s)->source_addr, addr) && s->sk_state == TCP_LISTEN) { - bh_lock_sock(s); + sock_hold(s); goto found; } s = NULL; @@ -174,7 +174,7 @@ static struct sock *nr_find_socket(unsigned char index, unsigned char id) struct nr_sock *nr = nr_sk(s); if (nr->my_index == index && nr->my_id == id) { - bh_lock_sock(s); + sock_hold(s); goto found; } } @@ -198,7 +198,7 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id, if (nr->your_index == index && nr->your_id == id && !ax25cmp(&nr->dest_addr, dest)) { - bh_lock_sock(s); + sock_hold(s); goto found; } } @@ -224,7 +224,7 @@ static unsigned short nr_find_next_circuit(void) if (i != 0 && j != 0) { if ((sk=nr_find_socket(i, j)) == NULL) break; - bh_unlock_sock(sk); + sock_put(sk); } id++; @@ -920,6 +920,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) } if (sk != NULL) { + bh_lock_sock(sk); skb_reset_transport_header(skb); if (frametype == NR_CONNACK && skb->len == 22) @@ -929,6 +930,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) ret = nr_process_rx_frame(sk, skb); bh_unlock_sock(sk); + sock_put(sk); return ret; } @@ -960,10 +962,12 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) (make = nr_make_new(sk)) == NULL) { nr_transmit_refusal(skb, 0); if (sk) - bh_unlock_sock(sk); + sock_put(sk); return 0; } + bh_lock_sock(sk); + window = skb->data[20]; skb->sk = make; @@ -1016,6 +1020,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) sk->sk_data_ready(sk); bh_unlock_sock(sk); + sock_put(sk); nr_insert_socket(make); From 3bd8264511035dc97c902f03fa9f1d07f95f8f62 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 30 Dec 2018 13:16:12 +0100 Subject: [PATCH 33/57] r8169: fix WoL device wakeup enable In rtl8169_runtime_resume() we configure WoL but don't set the device to wakeup-enabled. This prevents PME generation once the cable is re-plugged. Fix this by moving the call to device_set_wakeup_enable() to __rtl8169_set_wol(). 
Fixes: 433f9d0ddcc6 ("r8169: improve saved_wolopts handling") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 99bc3de906e2..298930d39b79 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -1477,6 +1477,8 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) } RTL_W8(tp, Cfg9346, Cfg9346_Lock); + + device_set_wakeup_enable(tp_to_dev(tp), wolopts); } static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) @@ -1498,8 +1500,6 @@ static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) rtl_unlock_work(tp); - device_set_wakeup_enable(d, tp->saved_wolopts); - pm_runtime_put_noidle(d); return 0; From 756af9c642329d54f048bac2a62f829b391f6944 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Mon, 31 Dec 2018 15:43:01 -0600 Subject: [PATCH 34/57] ibmveth: fix DMA unmap error in ibmveth_xmit_start error path Commit 33a48ab105a7 ("ibmveth: Fix DMA unmap error") fixed an issue in the normal code path of ibmveth_xmit_start() that was originally introduced by Commit 6e8ab30ec677 ("ibmveth: Add scatter-gather support"). This original fix missed the error path where dma_unmap_page is wrongly called on the header portion in descs[0] which was mapped with dma_map_single. As a result a failure to DMA map any of the frags results in a dmesg warning when CONFIG_DMA_API_DEBUG is enabled. ------------[ cut here ]------------ DMA-API: ibmveth 30000002: device driver frees DMA memory with wrong function [device address=0x000000000a430000] [size=172 bytes] [mapped as page] [unmapped as single] WARNING: CPU: 1 PID: 8426 at kernel/dma/debug.c:1085 check_unmap+0x4fc/0xe10 ... ... DMA-API: Mapped at: ibmveth_start_xmit+0x30c/0xb60 dev_hard_start_xmit+0x100/0x450 sch_direct_xmit+0x224/0x490 __qdisc_run+0x20c/0x980 __dev_queue_xmit+0x1bc/0xf20 This fixes the API misuse by unmapping descs[0] with dma_unmap_single. Fixes: 6e8ab30ec677 ("ibmveth: Add scatter-gather support") Signed-off-by: Tyrel Datwyler Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmveth.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index a4681780a55d..098d8764c0ea 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1171,11 +1171,15 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb, map_failed_frags: last = i+1; - for (i = 0; i < last; i++) + for (i = 1; i < last; i++) dma_unmap_page(&adapter->vdev->dev, descs[i].fields.address, descs[i].fields.flags_len & IBMVETH_BUF_LEN_MASK, DMA_TO_DEVICE); + dma_unmap_single(&adapter->vdev->dev, + descs[0].fields.address, + descs[0].fields.flags_len & IBMVETH_BUF_LEN_MASK, + DMA_TO_DEVICE); map_failed: if (!firmware_has_feature(FW_FEATURE_CMO)) netdev_err(netdev, "tx: unable to map xmit buffer\n"); From 8b6b25cf93b73e8e399adc55b0ffb9db32b881e0 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Fri, 21 Dec 2018 17:35:11 +0800 Subject: [PATCH 35/57] selftests/bpf: fix error printing in test_devmap() As a simple fix, just print the correct map type. 
Signed-off-by: Xiaozhou Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 9c79ee017df3..e2b9eee37187 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -510,7 +510,7 @@ static void test_devmap(int task, void *data) fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), 2, 0); if (fd < 0) { - printf("Failed to create arraymap '%s'!\n", strerror(errno)); + printf("Failed to create devmap '%s'!\n", strerror(errno)); exit(1); } From 3a0ed3e9619738067214871e9cb826fa23b2ddb9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 27 Dec 2018 18:55:09 -0800 Subject: [PATCH 36/57] sock: Make sock->sk_stamp thread-safe Al Viro mentioned (Message-ID <20170626041334.GZ10672@ZenIV.linux.org.uk>) that there is probably a race condition lurking in accesses of sk_stamp on 32-bit machines. sock->sk_stamp is of type ktime_t which is always an s64. On a 32 bit architecture, we might run into situations of unsafe access as the access to the field becomes non atomic. Use seqlocks for synchronization. This allows us to avoid using spinlocks for readers as readers do not need mutual exclusion. Another approach to solve this is to require sk_lock for all modifications of the timestamps. The current approach allows for timestamps to have their own lock: sk_stamp_lock. This allows for the patch to not compete with already existing critical sections, and side effects are limited to the paths in the patch. The addition of the new field maintains the data locality optimizations from commit 9115e8cd2a0c ("net: reorganize struct sock for better data locality") Note that all the instances of the sk_stamp accesses are either through the ioctl or the syscall recvmsg. Signed-off-by: Deepa Dinamani Signed-off-by: David S. 
Miller --- include/net/sock.h | 38 +++++++++++++++++++++++++++++++++++--- net/compat.c | 15 +++++++++------ net/core/sock.c | 15 ++++++++++----- net/sunrpc/svcsock.c | 2 +- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index a6235c286ef9..2b229f7be8eb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -298,6 +298,7 @@ struct sock_common { * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received + * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications @@ -474,6 +475,9 @@ struct sock { const struct cred *sk_peer_cred; long sk_rcvtimeo; ktime_t sk_stamp; +#if BITS_PER_LONG==32 + seqlock_t sk_stamp_seq; +#endif u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; @@ -2297,6 +2301,34 @@ static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) atomic_add(segs, &sk->sk_drops); } +static inline ktime_t sock_read_timestamp(struct sock *sk) +{ +#if BITS_PER_LONG==32 + unsigned int seq; + ktime_t kt; + + do { + seq = read_seqbegin(&sk->sk_stamp_seq); + kt = sk->sk_stamp; + } while (read_seqretry(&sk->sk_stamp_seq, seq)); + + return kt; +#else + return sk->sk_stamp; +#endif +} + +static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) +{ +#if BITS_PER_LONG==32 + write_seqlock(&sk->sk_stamp_seq); + sk->sk_stamp = kt; + write_sequnlock(&sk->sk_stamp_seq); +#else + sk->sk_stamp = kt; +#endif +} + void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, @@ -2321,7 +2353,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else - sk->sk_stamp = kt; + sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) __sock_recv_wifi_status(msg, sk, skb); @@ -2342,9 +2374,9 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) - sk->sk_stamp = skb->tstamp; + sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) - sk->sk_stamp = 0; + sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); diff --git a/net/compat.c b/net/compat.c index 47a614b370cd..d1f3a8a0b3ef 100644 --- a/net/compat.c +++ b/net/compat.c @@ -467,12 +467,14 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) ctv = (struct compat_timeval __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); + if (tv.tv_sec == -1) return err; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } err = 0; if (put_user(tv.tv_sec, &ctv->tv_sec) || @@ -494,12 +496,13 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta ctv = (struct compat_timespec __user *) userstamp; err = -ENOENT; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts 
= ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return err; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - ts = ktime_to_timespec(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + ts = ktime_to_timespec(kt); } err = 0; if (put_user(ts.tv_sec, &ctv->tv_sec) || diff --git a/net/core/sock.c b/net/core/sock.c index f00902c532cc..6aa2e7e0b4fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2751,6 +2751,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; +#if BITS_PER_LONG==32 + seqlock_init(&sk->sk_stamp_seq); +#endif atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL @@ -2850,12 +2853,13 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) struct timeval tv; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - tv = ktime_to_timeval(sk->sk_stamp); + tv = ktime_to_timeval(sock_read_timestamp(sk)); if (tv.tv_sec == -1) return -ENOENT; if (tv.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); - tv = ktime_to_timeval(sk->sk_stamp); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); + tv = ktime_to_timeval(kt); } return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; } @@ -2866,11 +2870,12 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) struct timespec ts; sock_enable_timestamp(sk, SOCK_TIMESTAMP); - ts = ktime_to_timespec(sk->sk_stamp); + ts = ktime_to_timespec(sock_read_timestamp(sk)); if (ts.tv_sec == -1) return -ENOENT; if (ts.tv_sec == 0) { - sk->sk_stamp = ktime_get_real(); + ktime_t kt = ktime_get_real(); + sock_write_timestamp(sk, kt); ts = ktime_to_timespec(sk->sk_stamp); } return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 986f3ed7d1a2..b7e67310ec37 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -549,7 +549,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - svsk->sk_sk->sk_stamp = skb->tstamp; + sock_write_timestamp(svsk->sk_sk, skb->tstamp); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ len = skb->len; From 2b96547223e3a036efb51733dbf37a1e4be92068 Mon Sep 17 00:00:00 2001 From: yupeng Date: Sat, 29 Dec 2018 21:46:38 -0800 Subject: [PATCH 37/57] add document for TCP OFO, PAWS and skip ACK counters add document and examples for below counters: TcpExtTCPOFOQueue TcpExtTCPOFODrop TcpExtTCPOFOMerge TcpExtPAWSActive TcpExtPAWSEstab TcpExtTCPACKSkippedSynRecv TcpExtTCPACKSkippedPAWS TcpExtTCPACKSkippedSeq TcpExtTCPACKSkippedFinWait2 TcpExtTCPACKSkippedTimeWait TcpExtTCPACKSkippedChallenge Signed-off-by: yupeng Signed-off-by: David S. Miller --- Documentation/networking/snmp_counter.rst | 240 +++++++++++++++++++++- 1 file changed, 239 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index f8eb77ddbd44..b0dfdaaca512 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -571,7 +571,97 @@ duplicate packet is received. * TcpExtTCPDSACKOfoRecv The TCP stack receives a DSACK, which indicate an out of order -duplciate packet is received. +duplicate packet is received. + +TCP out of order +=============== +* TcpExtTCPOFOQueue +The TCP layer receives an out of order packet and has enough memory +to queue it. 
+ +* TcpExtTCPOFODrop +The TCP layer receives an out of order packet but doesn't have enough +memory, so drops it. Such packets won't be counted into +TcpExtTCPOFOQueue. + +* TcpExtTCPOFOMerge +The received out of order packet has an overlap with the previous +packet. The overlapping part will be dropped. All of TcpExtTCPOFOMerge +packets will also be counted into TcpExtTCPOFOQueue. + +TCP PAWS +======= +PAWS (Protection Against Wrapped Sequence numbers) is an algorithm +which is used to drop old packets. It depends on the TCP +timestamps. For detailed information, please refer to the `timestamp wiki`_ +and the `RFC of PAWS`_. + +.. _RFC of PAWS: https://tools.ietf.org/html/rfc1323#page-17 +.. _timestamp wiki: https://en.wikipedia.org/wiki/Transmission_Control_Protocol#TCP_timestamps + +* TcpExtPAWSActive +Packets are dropped by PAWS in Syn-Sent status. + +* TcpExtPAWSEstab +Packets are dropped by PAWS in any status other than Syn-Sent. + +TCP ACK skip +=========== +In some scenarios, the kernel would avoid sending duplicate ACKs too +frequently. Please find more details in the tcp_invalid_ratelimit +section of the `sysctl document`_. When the kernel decides to skip an ACK +due to tcp_invalid_ratelimit, the kernel would update one of the below +counters to indicate in which scenario the ACK was skipped. The ACK +would only be skipped if the received packet is either a SYN packet or +it has no data. + +.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt + +* TcpExtTCPACKSkippedSynRecv +The ACK is skipped in Syn-Recv status. The Syn-Recv status means the +TCP stack receives a SYN and replies SYN+ACK. Now the TCP stack is +waiting for an ACK. Generally, the TCP stack doesn't need to send an ACK +in the Syn-Recv status. But in several scenarios, the TCP stack needs +to send an ACK. E.g., the TCP stack receives the same SYN packet +repeatedly, the received packet does not pass the PAWS check, or the +received packet sequence number is out of window. In these scenarios, +the TCP stack needs to send an ACK. If the ACK sending frequency is higher than +tcp_invalid_ratelimit allows, the TCP stack will skip sending ACK and +increase TcpExtTCPACKSkippedSynRecv. + + +* TcpExtTCPACKSkippedPAWS +The ACK is skipped because the PAWS (Protect Against Wrapped Sequence +numbers) check fails. If the PAWS check fails in Syn-Recv, Fin-Wait-2 +or Time-Wait statuses, the skipped ACK would be counted to +TcpExtTCPACKSkippedSynRecv, TcpExtTCPACKSkippedFinWait2 or +TcpExtTCPACKSkippedTimeWait. In all other statuses, the skipped ACK +would be counted to TcpExtTCPACKSkippedPAWS. + +* TcpExtTCPACKSkippedSeq +The sequence number is out of window and the timestamp passes the PAWS +check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait. + +* TcpExtTCPACKSkippedFinWait2 +The ACK is skipped in Fin-Wait-2 status, the reason would be either +PAWS check fails or the received sequence number is out of window. + +* TcpExtTCPACKSkippedTimeWait +The ACK is skipped in Time-Wait status, the reason would be either +PAWS check failed or the received sequence number is out of window. + +* TcpExtTCPACKSkippedChallenge +The ACK is skipped if the ACK is a challenge ACK. RFC 5961 defines +3 kinds of challenge ACKs, please refer to `RFC 5961 section 3.2`_, +`RFC 5961 section 4.2`_ and `RFC 5961 section 5.2`_. Besides these +three scenarios, in some TCP statuses, the Linux TCP stack would also +send challenge ACKs if the ACK number is before the first +unacknowledged number (more strict than `RFC 5961 section 5.2`_). + +.. 
_RFC 5961 section 3.2: https://tools.ietf.org/html/rfc5961#page-7 +.. _RFC 5961 section 4.2: https://tools.ietf.org/html/rfc5961#page-9 +.. _RFC 5961 section 5.2: https://tools.ietf.org/html/rfc5961#page-11 + examples ======= @@ -1188,3 +1278,151 @@ Run nstat on server B:: We have deleted the default route on server B. Server B couldn't find a route for the 8.8.8.8 IP address, so server B increased IpOutNoRoutes. + +TcpExtTCPACKSkippedSynRecv +------------------------ +In this test, we send 3 identical SYN packets from client to server. The +first SYN will let the server create a socket, set it to Syn-Recv status, +and reply a SYN/ACK. The second SYN will let the server reply the SYN/ACK +again, and record the reply time (the duplicate ACK reply time). The +third SYN will let the server check the previous duplicate ACK reply time, +and decide to skip the duplicate ACK, then increase the +TcpExtTCPACKSkippedSynRecv counter. + +Run tcpdump to capture a SYN packet:: + + nstatuser@nstat-a:~$ sudo tcpdump -c 1 -w /tmp/syn.pcap port 9000 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +Open another terminal, run the nc command:: + + nstatuser@nstat-a:~$ nc nstat-b 9000 + +As nstat-b didn't listen on port 9000, it should reply an RST, and +the nc command exited immediately. It was enough for the tcpdump +command to capture a SYN packet. A Linux server might use hardware +offload for the TCP checksum, so the checksum in the /tmp/syn.pcap +might not be correct. We call tcprewrite to fix it:: + + nstatuser@nstat-a:~$ tcprewrite --infile=/tmp/syn.pcap --outfile=/tmp/syn_fixcsum.pcap --fixcsum + +On nstat-b, we run nc to listen on port 9000:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + +On nstat-a, we blocked packets from port 9000, otherwise nstat-a would send +RST to nstat-b:: + + nstatuser@nstat-a:~$ sudo iptables -A INPUT -p tcp --sport 9000 -j DROP + +Send 3 SYNs repeatedly to nstat-b:: + + nstatuser@nstat-a:~$ for i in {1..3}; do sudo tcpreplay -i ens3 /tmp/syn_fixcsum.pcap; done + +Check the snmp counter on nstat-b:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedSynRecv 1 0.0 + +As we expected, TcpExtTCPACKSkippedSynRecv is 1. + +TcpExtTCPACKSkippedPAWS +---------------------- +To trigger PAWS, we could send an old SYN. + +On nstat-b, let nc listen on port 9000:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + +On nstat-a, run tcpdump to capture a SYN:: + + nstatuser@nstat-a:~$ sudo tcpdump -w /tmp/paws_pre.pcap -c 1 port 9000 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +On nstat-a, run nc as a client to connect nstat-b:: + + nstatuser@nstat-a:~$ nc -v nstat-b 9000 + Connection to nstat-b 9000 port [tcp/*] succeeded! + +Now the tcpdump has captured the SYN and exited. We should fix the +checksum:: + + nstatuser@nstat-a:~$ tcprewrite --infile /tmp/paws_pre.pcap --outfile /tmp/paws.pcap --fixcsum + +Send the SYN packet twice:: + + nstatuser@nstat-a:~$ for i in {1..2}; do sudo tcpreplay -i ens3 /tmp/paws.pcap; done + +On nstat-b, check the snmp counter:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedPAWS 1 0.0 + +We sent two SYNs via tcpreplay, both of them would make the PAWS check +fail; nstat-b replied an ACK for the first SYN, skipped the ACK +for the second SYN, and updated TcpExtTCPACKSkippedPAWS. 
+ +TcpExtTCPACKSkippedSeq +-------------------- +To trigger TcpExtTCPACKSkippedSeq, we send packets which have a valid +timestamp (to pass the PAWS check) but a sequence number that is out of +window. The Linux TCP stack would avoid skipping if the packet has +data, so we need a pure ACK packet. To generate such a packet, we +could create two sockets: one on port 9000, another on port 9001. Then +we capture an ACK on port 9001, change the source/destination port +numbers to match the port 9000 socket. Then we could trigger +TcpExtTCPACKSkippedSeq via this packet. + +On nstat-b, open two terminals, run two nc commands to listen on both +port 9000 and port 9001:: + + nstatuser@nstat-b:~$ nc -lkv 9000 + Listening on [0.0.0.0] (family 0, port 9000) + + nstatuser@nstat-b:~$ nc -lkv 9001 + Listening on [0.0.0.0] (family 0, port 9001) + +On nstat-a, run two nc clients:: + + nstatuser@nstat-a:~$ nc -v nstat-b 9000 + Connection to nstat-b 9000 port [tcp/*] succeeded! + + nstatuser@nstat-a:~$ nc -v nstat-b 9001 + Connection to nstat-b 9001 port [tcp/*] succeeded! + +On nstat-a, run tcpdump to capture an ACK:: + + nstatuser@nstat-a:~$ sudo tcpdump -w /tmp/seq_pre.pcap -c 1 dst port 9001 + tcpdump: listening on ens3, link-type EN10MB (Ethernet), capture size 262144 bytes + +On nstat-b, send a packet via the port 9001 socket. E.g. we sent a +string 'foo' in our example:: + + nstatuser@nstat-b:~$ nc -lkv 9001 + Listening on [0.0.0.0] (family 0, port 9001) + Connection from nstat-a 42132 received! + foo + +On nstat-a, the tcpdump should have captured the ACK. We should check +the source port numbers of the two nc clients:: + + nstatuser@nstat-a:~$ ss -ta '( dport = :9000 || dport = :9001 )' | tee + State Recv-Q Send-Q Local Address:Port Peer Address:Port + ESTAB 0 0 192.168.122.250:50208 192.168.122.251:9000 + ESTAB 0 0 192.168.122.250:42132 192.168.122.251:9001 + +Run tcprewrite, change port 9001 to port 9000, change port 42132 to +port 50208:: + + nstatuser@nstat-a:~$ tcprewrite --infile /tmp/seq_pre.pcap --outfile /tmp/seq.pcap -r 9001:9000 -r 42132:50208 --fixcsum + +Now the /tmp/seq.pcap is the packet we need. Send it to nstat-b:: + + nstatuser@nstat-a:~$ for i in {1..2}; do sudo tcpreplay -i ens3 /tmp/seq.pcap; done + +Check TcpExtTCPACKSkippedSeq on nstat-b:: + + nstatuser@nstat-b:~$ nstat | grep -i skip + TcpExtTCPACKSkippedSeq 1 0.0 From 4087d2bc0d9469835f8d19d63a4a56739e5b8c5b Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 30 Dec 2018 23:24:11 +0800 Subject: [PATCH 38/57] net: rds: remove unnecessary NULL check The NULL check is already done in kfree(). Signed-off-by: Zhu Yanjun Signed-off-by: David S. Miller --- net/rds/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index b9bbcf3d6c63..c16f0a362c32 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -623,7 +623,7 @@ static void __net_exit rds_tcp_exit_net(struct net *net) if (rtn->rds_tcp_sysctl) unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - if (net != &init_net && rtn->ctl_table) + if (net != &init_net) kfree(rtn->ctl_table); } From aff6db454599d62191aabc208930e891748e4322 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 30 Dec 2018 12:43:42 -0800 Subject: [PATCH 39/57] ptr_ring: wrap back ->producer in __ptr_ring_swap_queue() __ptr_ring_swap_queue() tries to move pointers from the old ring to the new one, but it forgets to check if ->producer is beyond the new size at the end of the operation. This leads to an out-of-bounds access in __ptr_ring_produce() as reported by syzbot. 
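As a rough sketch (simplified structure and function names invented for illustration, not the actual ptr_ring code), the problem and the fix look like this:

    /* Toy ring: 'queue' has 'size' slots; produce writes at queue[producer]. */
    struct toy_ring {
            void **queue;
            int size;
            int producer;
            int consumer;
    };

    /* Drain the old array into a new, possibly smaller, one. */
    static void toy_swap_queue(struct toy_ring *r, void **queue, int size)
    {
            int producer = 0;

            while (r->consumer < r->size && r->queue[r->consumer]) {
                    void *ptr = r->queue[r->consumer++];

                    if (producer < size)
                            queue[producer++] = ptr;
                    /* else: the extra entry is dropped */
            }

            /*
             * If the old ring held at least 'size' entries, producer now
             * equals 'size', so the next produce would write queue[size],
             * one slot past the end of the new array.  Wrapping it back
             * to 0 first (the fix) keeps the index in range.
             */
            if (producer >= size)
                    producer = 0;

            r->queue = queue;
            r->size = size;
            r->producer = producer;
            r->consumer = 0;
    }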
Reported-by: syzbot+8993c0fa96d57c399735@syzkaller.appspotmail.com Fixes: 5d49de532002 ("ptr_ring: resize support") Cc: "Michael S. Tsirkin" Cc: John Fastabend Cc: Jason Wang Signed-off-by: Cong Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6894976b54e3..186cd8e970c7 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -573,6 +573,8 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); + if (producer >= size) + producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; From 8c76e77f9069f10505c08e02646c3ee11ad79038 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Dec 2018 17:21:05 -0500 Subject: [PATCH 40/57] tap: call skb_probe_transport_header after setting skb->dev The BPF flow dissector expects either skb->sk or skb->dev set on all skbs. Delay flow dissection until after skb->dev is set. This requires calling from within an rcu read-side critical section. That is fine, see also the call from tun_xdp_one. Fixes: d0e13a1488ad ("flow_dissector: lookup netns by skb->sk if skb->dev is NULL") Reported-by: Christian Borntraeger Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 443b2694130c..c0b52e48f0e6 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1177,8 +1177,6 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) goto err_kfree; } - skb_probe_transport_header(skb, ETH_HLEN); - /* Move network header to the right position for VLAN tagged packets */ if ((skb->protocol == htons(ETH_P_8021Q) || skb->protocol == htons(ETH_P_8021AD)) && @@ -1189,6 +1187,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; + skb_probe_transport_header(skb, ETH_HLEN); dev_queue_xmit(skb); } else { kfree_skb(skb); From cb9f1b783850b14cbd7f87d061d784a666dfba1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Dec 2018 17:24:36 -0500 Subject: [PATCH 41/57] ip: validate header length on virtual device xmit KMSAN detected read beyond end of buffer in vti and sit devices when passing truncated packets with PF_PACKET. The issue affects additional ip tunnel devices. Extend commit 76c0ddd8c3a6 ("ip6_tunnel: be careful when accessing the inner header") and commit ccfec9e5cb2d ("ip_tunnel: be careful when accessing the inner header"). Move the check to a separate helper and call at the start of each ndo_start_xmit function in net/ipv4 and net/ipv6. Minor changes: - convert dev_kfree_skb to kfree_skb on error path, as dev_kfree_skb calls consume_skb which is not for error paths. - use pskb_network_may_pull even though that is pedantic here, as the same as pskb_may_pull for devices without llheaders. - do not cache ipv6 hdrs if used only once (unsafe across pskb_may_pull, was more relevant to earlier patch) Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 20 ++++++++++++++++++++ net/ipv4/ip_gre.c | 9 +++++++++ net/ipv4/ip_tunnel.c | 9 --------- net/ipv4/ip_vti.c | 12 +++++++++--- net/ipv6/ip6_gre.c | 10 +++++++--- net/ipv6/ip6_tunnel.c | 10 +++------- net/ipv6/ip6_vti.c | 8 ++++---- net/ipv6/ip6mr.c | 17 +++++++++++------ net/ipv6/sit.c | 3 +++ 9 files changed, 66 insertions(+), 32 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cbcf35ce1b14..34f019650941 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -308,6 +308,26 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + int nhlen; + + switch (skb->protocol) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + default: + nhlen = 0; + } + + return pskb_network_may_pull(skb, nhlen); +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c7a7bd58a23c..d1d09f3e5f9e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -676,6 +676,9 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -719,6 +722,9 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { erspan_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; @@ -762,6 +768,9 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (!pskb_inet_may_pull(skb)) + goto free_skb; + if (tunnel->collect_md) { gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 284a22154b4e..c4f5602308ed 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,7 +627,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); - unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -637,14 +636,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; - /* ensure we can access the inner net header, for several users below */ - if (skb->protocol == htons(ETH_P_IP)) - inner_nhdr_len = sizeof(struct iphdr); - else if (skb->protocol == htons(ETH_P_IPV6)) - inner_nhdr_len = sizeof(struct ipv6hdr); - if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) - goto tx_error; - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index de31b302d69c..d7b43e700023 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -241,6 +241,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { @@ -253,15 +256,18 @@ static netdev_tx_t 
vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); break; default: - dev->stats.tx_errors++; - dev_kfree_skb(skb); - return NETDEV_TX_OK; + goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 229e55c99021..09d0826742f8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -881,6 +881,9 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -923,6 +926,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, int nhoff; int thoff; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -995,8 +1001,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1004,7 +1008,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) + if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 99179b9c8384..0c6403cf8b52 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1243,10 +1243,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - /* ensure we can access the full inner ip header */ - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return -1; - iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1321,9 +1317,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; - if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) - return -1; - ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || @@ -1405,6 +1398,9 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device_stats *stats = &t->dev->stats; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): ret = ip4ip6_tnl_xmit(skb, dev); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 706fe42e4928..8b6eefff2f7e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -522,18 +522,18 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h; struct flowi fl; int ret; + if (!pskb_inet_may_pull(skb)) + goto tx_err; + memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): - ipv6h = ipv6_hdr(skb); - if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || - vti6_addr_conflict(t, ipv6h)) + vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; xfrm_decode_session(skb, &fl, AF_INET6); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8276f1224f16..30337b38274b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -51,6 +51,7 @@ #include 
#include #include +#include #include @@ -599,13 +600,12 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; - int err; - err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (!pskb_inet_may_pull(skb)) + goto tx_err; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto tx_err; read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; @@ -614,6 +614,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, read_unlock(&mrt_lock); kfree_skb(skb); return NETDEV_TX_OK; + +tx_err: + dev->stats.tx_errors++; + kfree_skb(skb); + return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 51c9f75f34b9..1e03305c0549 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1021,6 +1021,9 @@ static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { + if (!pskb_inet_may_pull(skb)) + goto tx_err; + switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); From e888402789b9db5de4fcda361331d66dbf0cd9fd Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 31 Dec 2018 10:58:29 +0800 Subject: [PATCH 42/57] net: hns3: call hns3_nic_net_open() while doing HNAE3_UP_CLIENT Since HNAE3_DOWN_CLIENT calls hns3_nic_net_stop(), HNAE3_UP_CLIENT should call hns3_nic_net_open(); if the number of queues or the TC mapping has changed before HNAE3_UP_CLIENT is called, it will cause problems. Also, the HNS3_NIC_STATE_RESETTING flag needs to be cleared before hns3_nic_net_open() is called, and set back if hns3_nic_net_open() fails. Fixes: bb6b94a896d4 ("net: hns3: Add reset interface implementation in client") Signed-off-by: Huazhong Tan Signed-off-by: Yunsheng Lin Signed-off-by: Peng Li Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index d3b9aaf96c1c..07cd58798083 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -3995,17 +3995,18 @@ static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); int ret = 0; + clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); + if (netif_running(kinfo->netdev)) { - ret = hns3_nic_net_up(kinfo->netdev); + ret = hns3_nic_net_open(kinfo->netdev); if (ret) { + set_bit(HNS3_NIC_STATE_RESETTING, &priv->state); netdev_err(kinfo->netdev, "hns net up fail, ret=%d!\n", ret); return ret; } } - clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); - return ret; } From 53fe3ed19df0bca6ce752fae8e483910b6f112f6 Mon Sep 17 00:00:00 2001 From: Xue Chaojing Date: Tue, 1 Jan 2019 19:39:33 +0000 Subject: [PATCH 43/57] net-next/hinic:add shutdown callback If there is no shutdown callback, our board will report pcie UNF errors after restarting. This patch adds a shutdown callback for hinic. Signed-off-by: Xue Chaojing Signed-off-by: David S. 
Miller --- drivers/net/ethernet/huawei/hinic/hinic_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 6d48dc62a44b..da323b9e1f62 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -1106,6 +1106,11 @@ static void hinic_remove(struct pci_dev *pdev) dev_info(&pdev->dev, "HiNIC driver - removed\n"); } +static void hinic_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + static const struct pci_device_id hinic_pci_table[] = { { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_QUAD_PORT_25GE), 0}, { PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_25GE), 0}, @@ -1119,6 +1124,7 @@ static struct pci_driver hinic_driver = { .id_table = hinic_pci_table, .probe = hinic_probe, .remove = hinic_remove, + .shutdown = hinic_shutdown, }; module_pci_driver(hinic_driver); From 202700e30740c6568b5a6943662f3829566dd533 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jan 2019 04:24:20 -0800 Subject: [PATCH 44/57] net/hamradio/6pack: use mod_timer() to rearm timers Using del_timer() + add_timer() is generally unsafe on SMP, as noticed by syzbot. Use mod_timer() instead. kernel BUG at kernel/time/timer.c:1136! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 1026 Comm: kworker/u4:4 Not tainted 4.20.0+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events_unbound flush_to_ldisc RIP: 0010:add_timer kernel/time/timer.c:1136 [inline] RIP: 0010:add_timer+0xa81/0x1470 kernel/time/timer.c:1134 Code: 4d 89 7d 40 48 c7 85 70 fe ff ff 00 00 00 00 c7 85 7c fe ff ff ff ff ff ff 48 89 85 90 fe ff ff e9 e6 f7 ff ff e8 cf 42 12 00 <0f> 0b e8 c8 42 12 00 0f 0b e8 c1 42 12 00 4c 89 bd 60 fe ff ff e9 RSP: 0018:ffff8880a7fdf5a8 EFLAGS: 00010293 RAX: ffff8880a7846340 RBX: dffffc0000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff816f3ee1 RDI: ffff88808a514ff8 RBP: ffff8880a7fdf760 R08: 0000000000000007 R09: ffff8880a7846c58 R10: ffff8880a7846340 R11: 0000000000000000 R12: ffff88808a514ff8 R13: ffff88808a514ff8 R14: ffff88808a514dc0 R15: 0000000000000030 FS: 0000000000000000(0000) GS:ffff8880ae700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000061c500 CR3: 00000000994d9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: decode_prio_command drivers/net/hamradio/6pack.c:903 [inline] sixpack_decode drivers/net/hamradio/6pack.c:971 [inline] sixpack_receive_buf drivers/net/hamradio/6pack.c:457 [inline] sixpack_receive_buf+0xf9c/0x1470 drivers/net/hamradio/6pack.c:434 tty_ldisc_receive_buf+0x164/0x1c0 drivers/tty/tty_buffer.c:465 tty_port_default_receive_buf+0x114/0x190 drivers/tty/tty_port.c:38 receive_buf drivers/tty/tty_buffer.c:481 [inline] flush_to_ldisc+0x3b2/0x590 drivers/tty/tty_buffer.c:533 process_one_work+0xd0c/0x1ce0 kernel/workqueue.c:2153 worker_thread+0x143/0x14a0 kernel/workqueue.c:2296 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Andreas Koensgen Signed-off-by: David S. 
Miller --- drivers/net/hamradio/6pack.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 28c749980359..a19868cba48c 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -523,10 +523,7 @@ static void resync_tnc(struct timer_list *t) /* Start resync timer again -- the TNC might be still absent */ - - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); + mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); } static inline int tnc_init(struct sixpack *sp) @@ -537,9 +534,7 @@ static inline int tnc_init(struct sixpack *sp) sp->tty->ops->write(sp->tty, &inbyte, 1); - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); + mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); return 0; } @@ -897,11 +892,8 @@ static void decode_prio_command(struct sixpack *sp, unsigned char cmd) /* if the state byte has been received, the TNC is present, so the resync timer can be reset. */ - if (sp->tnc_state == TNC_IN_SYNC) { - del_timer(&sp->resync_t); - sp->resync_t.expires = jiffies + SIXP_INIT_RESYNC_TIMEOUT; - add_timer(&sp->resync_t); - } + if (sp->tnc_state == TNC_IN_SYNC) + mod_timer(&sp->resync_t, jiffies + SIXP_INIT_RESYNC_TIMEOUT); sp->status1 = cmd & SIXP_PRIO_DATA_MASK; } From 7adf3246092f5e87ed0fa610e8088fae416c581f Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 2 Jan 2019 13:29:27 +0100 Subject: [PATCH 45/57] ipv6: route: Fix return value of ip6_neigh_lookup() on neigh_create() error In ip6_neigh_lookup(), we must not return errors coming from neigh_create(): if creation of a neighbour entry fails, the lookup should return NULL, in the same way as it's done in __neigh_lookup(). Otherwise, callers legitimately checking for a non-NULL return value of the lookup function might dereference an invalid pointer. For instance, on neighbour table overflow, ndisc_router_discovery() crashes ndisc_update() by passing ERR_PTR(-ENOBUFS) as 'neigh' argument. Reported-by: Jianlin Shi Fixes: f8a1b43b709d ("net/ipv6: Create a neigh_lookup for FIB entries") Signed-off-by: Stefano Brivio Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a94e0b02a8ac..40b225f87d5e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -210,7 +210,9 @@ struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dev); + + n = neigh_create(&nd_tbl, daddr, dev); + return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, From d63967e475ae10f286dbd35e189cb241e0b1f284 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jan 2019 09:20:27 -0800 Subject: [PATCH 46/57] isdn: fix kernel-infoleak in capi_unlocked_ioctl Since capi_ioctl() copies 64 bytes after calling capi20_get_manufacturer() we need to ensure to not leak information to user. 
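A minimal sketch of the leak (hypothetical helper, not the driver code): strlcpy() writes only the string and its NUL terminator and leaves the rest of the on-stack buffer uninitialized, while strncpy() zero-pads up to the requested length, so copying the full 64 bytes to userspace no longer exposes stale stack data:

    /* Hypothetical example for illustration only. */
    static int example_get_manufacturer(void __user *ubuf)
    {
            char buf[CAPI_MANUFACTURER_LEN];        /* 64 bytes, not zeroed */

            /*
             * With strlcpy() only strlen(...) + 1 bytes of buf would be
             * written; the copy_to_user() below would then push the
             * remaining uninitialized stack bytes to userspace.  strncpy()
             * pads the tail with zeroes, so the same copy is safe.
             */
            strncpy(buf, "example manufacturer", sizeof(buf));

            if (copy_to_user(ubuf, buf, sizeof(buf)))
                    return -EFAULT;
            return 0;
    }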
BUG: KMSAN: kernel-infoleak in _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 CPU: 0 PID: 11245 Comm: syz-executor633 Not tainted 4.20.0-rc7+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x12e/0x2a0 mm/kmsan/kmsan.c:613 kmsan_internal_check_memory+0x9d4/0xb00 mm/kmsan/kmsan.c:704 kmsan_copy_to_user+0xab/0xc0 mm/kmsan/kmsan_hooks.c:601 _copy_to_user+0x16b/0x1f0 lib/usercopy.c:32 capi_ioctl include/linux/uaccess.h:177 [inline] capi_unlocked_ioctl+0x1a0b/0x1bf0 drivers/isdn/capi/capi.c:939 do_vfs_ioctl+0xebd/0x2bf0 fs/ioctl.c:46 ksys_ioctl fs/ioctl.c:713 [inline] __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl+0x1da/0x270 fs/ioctl.c:718 __x64_sys_ioctl+0x4a/0x70 fs/ioctl.c:718 do_syscall_64+0xbc/0xf0 arch/x86/entry/common.c:291 entry_SYSCALL_64_after_hwframe+0x63/0xe7 RIP: 0033:0x440019 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffdd4659fb8 EFLAGS: 00000213 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440019 RDX: 0000000020000080 RSI: 00000000c0044306 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000213 R12: 00000000004018a0 R13: 0000000000401930 R14: 0000000000000000 R15: 0000000000000000 Local variable description: ----data.i@capi_unlocked_ioctl Variable was created at: capi_ioctl drivers/isdn/capi/capi.c:747 [inline] capi_unlocked_ioctl+0x82/0x1bf0 drivers/isdn/capi/capi.c:939 do_vfs_ioctl+0xebd/0x2bf0 fs/ioctl.c:46 Bytes 12-63 of 64 are uninitialized Memory access of size 64 starts at ffff88807ac5fce8 Data copied to user address 0000000020000080 Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/capi/kcapi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 0ff517d3c98f..a4ceb61c5b60 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -852,7 +852,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) u16 ret; if (contr == 0) { - strlcpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); + strncpy(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); return CAPI_NOERROR; } @@ -860,7 +860,7 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf) ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { - strlcpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN); + strncpy(buf, ctr->manu, CAPI_MANUFACTURER_LEN); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; From c08435ec7f2bc8f4109401f696fd55159b4b40cb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:27 +0100 Subject: [PATCH 47/57] bpf: move {prev_,}insn_idx into verifier env Move prev_insn_idx and insn_idx from the do_check() function into the verifier environment, so they can be read inside the various helper functions for handling the instructions. It's easier to put this into the environment rather than changing all call-sites only to pass it along. insn_idx is useful in particular since this later on allows to hold state in env->insn_aux_data[env->insn_idx]. 
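In outline (a simplified sketch with invented names, not the verifier code itself), the change follows the usual pattern of hoisting loop-local indices into the shared environment so helpers can read them without extra parameters:

    /* Toy version of the pattern; 'u32' as in kernel code. */
    struct toy_env {
            u32 insn_idx;           /* current instruction */
            u32 prev_insn_idx;      /* previous instruction */
            /* ... per-instruction aux state indexed by insn_idx ... */
    };

    static void toy_helper(struct toy_env *env)
    {
            /* Can use env->insn_idx directly, e.g. to address per-insn
             * auxiliary data, instead of taking the index as a parameter. */
    }

    static int toy_check(struct toy_env *env, u32 insn_cnt)
    {
            for (env->insn_idx = 0; env->insn_idx < insn_cnt; env->insn_idx++)
                    toy_helper(env);
            return 0;
    }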
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c | 76 ++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c233efc106c6..3f84f3e87704 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,8 @@ struct bpf_subprog_info { * one verifier_env per bpf_check() call */ struct bpf_verifier_env { + u32 insn_idx; + u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71d86e3024ae..afa8515bbb34 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5650,7 +5650,6 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len, i; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -5670,19 +5669,19 @@ static int do_check(struct bpf_verifier_env *env) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, 0 /* subprogno, zero == main subprog */); - insn_idx = 0; + for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -5692,7 +5691,7 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { @@ -5700,9 +5699,9 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { if (do_print_state) verbose(env, "\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); else - verbose(env, "%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -5715,10 +5714,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1 || (env->log.level && do_print_state)) { if (env->log.level > 1) - verbose(env, "%d:", insn_idx); + verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d:", - prev_insn_idx, insn_idx); + env->prev_insn_idx, env->insn_idx); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -5729,20 +5728,20 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; - verbose_linfo(env, insn_idx, "; "); - verbose(env, "%d: ", insn_idx); + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, insn_idx, - prev_insn_idx); + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); - env->insn_aux_data[insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -5768,13 +5767,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is 
readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -5799,10 +5798,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -5818,13 +5817,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -5852,9 +5851,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -5872,9 +5871,9 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &insn_idx); + err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, insn_idx); + err = check_helper_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -5887,7 +5886,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -5901,8 +5900,8 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - prev_insn_idx = insn_idx; - err = prepare_func_exit(env, &insn_idx); + env->prev_insn_idx = env->insn_idx; + err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; @@ -5932,7 +5931,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &prev_insn_idx, &insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -5942,7 +5942,7 @@ static int do_check(struct bpf_verifier_env *env) continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -5959,8 +5959,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -5970,7 +5970,7 @@ static int do_check(struct 
bpf_verifier_env *env) return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose(env, "processed %d insns (limit %d), stack depth ", From 144cd91c4c2bced6eb8a7e25e590f6618a11e854 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:28 +0100 Subject: [PATCH 48/57] bpf: move tmp variable into ax register in interpreter This change moves the on-stack 64 bit tmp variable in ___bpf_prog_run() into the hidden ax register. The latter is currently only used in JITs for constant blinding as a temporary scratch register, meaning the BPF interpreter will never see the use of ax. Therefore it is safe to use it for the cases where tmp has been used earlier. This is needed to later on allow restricted hidden use of ax in both interpreter and JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 ++- kernel/bpf/core.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 8c8544b375eb..84a6a98f8328 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -60,7 +60,8 @@ struct sock_reuseport; * constants. See JIT pre-step in bpf_jit_blind_constants(). */ #define BPF_REG_AX MAX_BPF_REG -#define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +#define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 38de580abcc2..a34312a5eea2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -54,6 +54,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -1188,7 +1189,6 @@ bool bpf_opcode_in_insntable(u8 code) */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1268,36 +1268,36 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1553,7 +1553,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) 
(unsigned long) ctx; \ @@ -1566,7 +1566,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ From 9b73bfdd08e73231d6a90ae6db4b46b3fbf56c30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:29 +0100 Subject: [PATCH 49/57] bpf: enable access to ax register also from verifier rewrite Right now we are using BPF ax register in JIT for constant blinding as well as in interpreter as temporary variable. Verifier will not be able to use it simply because its use will get overridden from the former in bpf_jit_blind_insn(). However, it can be made to work in that blinding will be skipped if there is prior use in either source or destination register on the instruction. Taking constraints of ax into account, the verifier is then open to use it in rewrites under some constraints. Note, ax register already has mappings in every eBPF JIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 7 +------ kernel/bpf/core.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 84a6a98f8328..ad106d845b22 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -53,12 +53,7 @@ struct sock_reuseport; #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ -/* Kernel hidden auxiliary/helper register for hardening step. - * Only used by eBPF JITs. It's nothing more than a temporary - * register that JITs use internally, only that here it's part - * of eBPF instructions that have been rewritten for blinding - * constants. See JIT pre-step in bpf_jit_blind_constants(). - */ +/* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a34312a5eea2..f908b9356025 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -858,6 +858,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! 
+ */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { From 0d6303db7970e6f56ae700fa07e11eb510cda125 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:30 +0100 Subject: [PATCH 50/57] bpf: restrict map value pointer arithmetic for unprivileged Restrict map value pointer arithmetic for unprivileged users in that arithmetic itself must not go out of bounds as opposed to the actual access later on. Therefore after each adjust_ptr_min_max_vals() with a map value pointer as a destination it will simulate a check_map_access() of 1 byte on the destination and once that fails the program is rejected for unprivileged program loads. We use this later on for masking any pointer arithmetic with the remainder of the map value space. The likelihood of breaking any existing real-world unprivileged eBPF program is very small for this corner case. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index afa8515bbb34..4da8c73e168f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3249,6 +3249,17 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n", + dst); + return -EACCES; + } + return 0; } From e4298d25830a866cc0f427d4bccb858e76715859 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:31 +0100 Subject: [PATCH 51/57] bpf: restrict stack pointer arithmetic for unprivileged Restrict stack pointer arithmetic for unprivileged users in that arithmetic itself must not go out of bounds as opposed to the actual access later on. Therefore after each adjust_ptr_min_max_vals() with a stack pointer as a destination we simulate a check_stack_access() of 1 byte on the destination and once that fails the program is rejected for unprivileged program loads. This is analog to map value pointer arithmetic and needed for masking later on. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 63 ++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4da8c73e168f..9ac205d1b8b7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1387,6 +1387,31 @@ static int check_stack_read(struct bpf_verifier_env *env, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). 
+ */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1954,24 +1979,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). - */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, - size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; state = func(env, reg); err = update_stack_depth(env, state, off); @@ -3253,11 +3264,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks && dst_reg->type == PTR_TO_MAP_VALUE && - check_map_access(env, dst, dst_reg->off, 1, false)) { - verbose(env, "R%d pointer arithmetic of map value goes out of range, prohibited for !root\n", - dst); - return -EACCES; + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } } return 0; From 9d7eceede769f90b66cfa06ad5b357140d5141ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:32 +0100 Subject: [PATCH 52/57] bpf: restrict unknown scalars of mixed signed bounds for unprivileged For unknown scalars of mixed signed bounds, meaning their smin_value is negative and their smax_value is positive, we need to reject arithmetic with pointer to map value. For unprivileged the goal is to mask every map pointer arithmetic and this cannot reliably be done when it is unknown at verification time whether the scalar value is negative or positive. Given this is a corner case, the likelihood of breaking should be very small. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9ac205d1b8b7..eebbc03e5af2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3081,8 +3081,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; dst_reg = ®s[dst]; @@ -3115,6 +3115,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: break; } From b7137c4eab85c1cf3d46acdde90ce1163b28c873 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:33 +0100 Subject: [PATCH 53/57] bpf: fix check_map_access smin_value test when pointer contains offset In check_map_access() we probe actual bounds through __check_map_access() with offset of reg->smin_value + off for lower bound and offset of reg->umax_value + off for the upper bound. However, even though the reg->smin_value could have a negative value, the final result of the sum with off could be positive when pointer arithmetic with known and unknown scalars is combined. In this case we reject the program with an error such as "R min value is negative, either use unsigned index or do a if (index >=0) check." even though the access itself would be fine. Therefore extend the check to probe whether the actual resulting reg->smin_value + off is less than zero. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eebbc03e5af2..8e5da1ce5da4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1443,13 +1443,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (env->log.level) print_verifier_state(env, state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. 
*/ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; From 979d63d50c0c0f7bc537bf821e056cc9fe5abd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:34 +0100 Subject: [PATCH 54/57] bpf: prevent out of bounds speculation on pointer arithmetic Jann reported that the original commit back in b2157399cc98 ("bpf: prevent out-of-bounds speculation") was not sufficient to stop CPU from speculating out of bounds memory access: While b2157399cc98 only focussed on masking array map access for unprivileged users for tail calls and data access such that the user provided index gets sanitized from BPF program and syscall side, there is still a more generic form affected from BPF programs that applies to most maps that hold user data in relation to dynamic map access when dealing with unknown scalars or "slow" known scalars as access offset, for example: - Load a map value pointer into R6 - Load an index into R7 - Do a slow computation (e.g. with a memory dependency) that loads a limit into R8 (e.g. load the limit from a map for high latency, then mask it to make the verifier happy) - Exit if R7 >= R8 (mispredicted branch) - Load R0 = R6[R7] - Load R0 = R6[R0] For unknown scalars there are two options in the BPF verifier where we could derive knowledge from in order to guarantee safe access to the memory: i) While /<=/>= variants won't allow to derive any lower or upper bounds from the unknown scalar where it would be safe to add it to the map value pointer, it is possible through ==/!= test however. ii) another option is to transform the unknown scalar into a known scalar, for example, through ALU ops combination such as R &= followed by R |= or any similar combination where the original information from the unknown scalar would be destroyed entirely leaving R with a constant. The initial slow load still precedes the latter ALU ops on that register, so the CPU executes speculatively from that point. Once we have the known scalar, any compare operation would work then. A third option only involving registers with known scalars could be crafted as described in [0] where a CPU port (e.g. Slow Int unit) would be filled with many dependent computations such that the subsequent condition depending on its outcome has to wait for evaluation on its execution port and thereby executing speculatively if the speculated code can be scheduled on a different execution port, or any other form of mistraining as described in [1], for example. Given this is not limited to only unknown scalars, not only map but also stack access is affected since both is accessible for unprivileged users and could potentially be used for out of bounds access under speculation. In order to prevent any of these cases, the verifier is now sanitizing pointer arithmetic on the offset such that any out of bounds speculation would be masked in a way where the pointer arithmetic result in the destination register will stay unchanged, meaning offset masked into zero similar as in array_index_nospec() case. With regards to implementation, there are three options that were considered: i) new insn for sanitation, ii) push/pop insn and sanitation as inlined BPF, iii) reuse of ax register and sanitation as inlined BPF. 
Option i) has the downside that we end up using from reserved bits in the opcode space, but also that we would require each JIT to emit masking as native arch opcodes meaning mitigation would have slow adoption till everyone implements it eventually which is counter-productive. Option ii) and iii) have both in common that a temporary register is needed in order to implement the sanitation as inlined BPF since we are not allowed to modify the source register. While a push / pop insn in ii) would be useful to have in any case, it requires once again that every JIT needs to implement it first. While possible, amount of changes needed would also be unsuitable for a -stable patch. Therefore, the path which has fewer changes, less BPF instructions for the mitigation and does not require anything to be changed in the JITs is option iii) which this work is pursuing. The ax register is already mapped to a register in all JITs (modulo arm32 where it's mapped to stack as various other BPF registers there) and used in constant blinding for JITs-only so far. It can be reused for verifier rewrites under certain constraints. The interpreter's tmp "register" has therefore been remapped into extending the register set with hidden ax register and reusing that for a number of instructions that needed the prior temporary variable internally (e.g. div, mod). This allows for zero increase in stack space usage in the interpreter, and enables (restricted) generic use in rewrites otherwise as long as such a patchlet does not make use of these instructions. The sanitation mask is dynamic and relative to the offset the map value or stack pointer currently holds. There are various cases that need to be taken under consideration for the masking, e.g. such operation could look as follows: ptr += val or val += ptr or ptr -= val. Thus, the value to be sanitized could reside either in source or in destination register, and the limit is different depending on whether the ALU op is addition or subtraction and depending on the current known and bounded offset. The limit is derived as follows: limit := max_value_size - (smin_value + off). For subtraction: limit := umax_value + off. This holds because we do not allow any pointer arithmetic that would temporarily go out of bounds or would have an unknown value with mixed signed bounds where it is unclear at verification time whether the actual runtime value would be either negative or positive. For example, we have a derived map pointer value with constant offset and bounded one, so limit based on smin_value works because the verifier requires that statically analyzed arithmetic on the pointer must be in bounds, and thus it checks if resulting smin_value + off and umax_value + off is still within map value bounds at time of arithmetic in addition to time of access. Similarly, for the case of stack access we derive the limit as follows: MAX_BPF_STACK + off for subtraction and -off for the case of addition where off := ptr_reg->off + ptr_reg->var_off.value. Subtraction is a special case for the masking which can be in form of ptr += -val, ptr -= -val, or ptr -= val. In the first two cases where we know that the value is negative, we need to temporarily negate the value in order to do the sanitation on a positive value where we later swap the ALU op, and restore original source register if the value was in source. The sanitation of pointer arithmetic alone is still not fully sufficient as is, since a scenario like the following could happen ... PTR += 0x1000 (e.g. 
K-based imm) PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON PTR += 0x1000 PTR -= BIG_NUMBER_WITH_SLOW_COMPARISON [...] ... which under speculation could end up as ... PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] PTR += 0x1000 PTR -= 0 [ truncated by mitigation ] [...] ... and therefore still access out of bounds. To prevent such case, the verifier is also analyzing safety for potential out of bounds access under speculative execution. Meaning, it is also simulating pointer access under truncation. We therefore "branch off" and push the current verification state after the ALU operation with known 0 to the verification stack for later analysis. Given the current path analysis succeeded it is likely that the one under speculation can be pruned. In any case, it is also subject to existing complexity limits and therefore anything beyond this point will be rejected. In terms of pruning, it needs to be ensured that the verification state from speculative execution simulation must never prune a non-speculative execution path, therefore, we mark verifier state accordingly at the time of push_stack(). If verifier detects out of bounds access under speculative execution from one of the possible paths that includes a truncation, it will reject such program. Given we mask every reg-based pointer arithmetic for unprivileged programs, we've been looking into how it could affect real-world programs in terms of size increase. As the majority of programs are targeted for privileged-only use case, we've unconditionally enabled masking (with its alu restrictions on top of it) for privileged programs for the sake of testing in order to check i) whether they get rejected in its current form, and ii) by how much the number of instructions and size will increase. We've tested this by using Katran, Cilium and test_l4lb from the kernel selftests. For Katran we've evaluated balancer_kern.o, Cilium bpf_lxc.o and an older test object bpf_lxc_opt_-DUNKNOWN.o and l4lb we've used test_l4lb.o as well as test_l4lb_noinline.o. We found that none of the programs got rejected by the verifier with this change, and that impact is rather minimal to none. balancer_kern.o had 13,904 bytes (1,738 insns) xlated and 7,797 bytes JITed before and after the change. Most complex program in bpf_lxc.o had 30,544 bytes (3,817 insns) xlated and 18,538 bytes JITed before and after and none of the other tail call programs in bpf_lxc.o had any changes either. For the older bpf_lxc_opt_-DUNKNOWN.o object we found a small increase from 20,616 bytes (2,576 insns) and 12,536 bytes JITed before to 20,664 bytes (2,582 insns) and 12,558 bytes JITed after the change. Other programs from that object file had similar small increase. Both test_l4lb.o had no change and remained at 6,544 bytes (817 insns) xlated and 3,401 bytes JITed and for test_l4lb_noinline.o constant at 5,080 bytes (634 insns) xlated and 3,313 bytes JITed. This can be explained in that LLVM typically optimizes stack based pointer arithmetic by using K-based operations and that use of dynamic map access is not overly frequent. However, in future we may decide to optimize the algorithm further under known guarantees from branch and value speculation. Latter seems also unclear in terms of prediction heuristics that today's CPUs apply as well as whether there could be collisions in e.g. the predictor's Value History/Pattern Table for triggering out of bounds access, thus masking is performed unconditionally at this point but could be subject to relaxation later on. 
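To make the runtime effect of the rewrite concrete, here is a small user space C model of the inlined sanitation sequence. This is an editorial sketch, not kernel code; the function name sanitize_off and the example limit of 48 bytes are made up for illustration, and alu_limit corresponds to the aux->alu_limit value derived as described above:

	#include <stdint.h>
	#include <stdio.h>

	/* Branchless model of the patched-in sequence
	 * (MOV32 limit-1, SUB, OR, NEG, ARSH 63, AND):
	 * returns off unchanged if off < alu_limit, otherwise 0.
	 */
	static uint64_t sanitize_off(uint64_t off, uint32_t alu_limit)
	{
		/* MOV32 zero-extends, so force 32-bit wrap of limit - 1 */
		uint64_t ax = (uint32_t)(alu_limit - 1);

		ax -= off;	/* for verifier-bounded offsets: sign bit set iff off >= alu_limit */
		ax |= off;
		ax = -ax;
		ax = (uint64_t)((int64_t)ax >> 63);	/* all ones if in range, else 0 */
		return ax & off;
	}

	int main(void)
	{
		printf("%llu\n", (unsigned long long)sanitize_off(13, 48));	/* 13 */
		printf("%llu\n", (unsigned long long)sanitize_off(48, 48));	/* 0  */
		return 0;
	}

The point of the SUB/OR/NEG/ARSH combination is that the result is computed without any data-dependent branch, so there is nothing left to mispredict: an in-range offset passes through unchanged and an out-of-range one collapses to zero, analogous to array_index_nospec().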
We were generally also brainstorming various other approaches for mitigation, but the blocker was always lack of available registers at runtime and/or overhead for runtime tracking of limits belonging to a specific pointer. Thus, we found this to be minimally intrusive under given constraints. With that in place, a simple example with sanitized access on unprivileged load at post-verification time looks as follows: # bpftool prog dump xlated id 282 [...] 28: (79) r1 = *(u64 *)(r7 +0) 29: (79) r2 = *(u64 *)(r7 +8) 30: (57) r1 &= 15 31: (79) r3 = *(u64 *)(r0 +4608) 32: (57) r3 &= 1 33: (47) r3 |= 1 34: (2d) if r2 > r3 goto pc+19 35: (b4) (u32) r11 = (u32) 20479 | 36: (1f) r11 -= r2 | Dynamic sanitation for pointer 37: (4f) r11 |= r2 | arithmetic with registers 38: (87) r11 = -r11 | containing bounded or known 39: (c7) r11 s>>= 63 | scalars in order to prevent 40: (5f) r11 &= r2 | out of bounds speculation. 41: (0f) r4 += r11 | 42: (71) r4 = *(u8 *)(r4 +0) 43: (6f) r4 <<= r1 [...] For the case where the scalar sits in the destination register as opposed to the source register, the following code is emitted for the above example: [...] 16: (b4) (u32) r11 = (u32) 20479 17: (1f) r11 -= r2 18: (4f) r11 |= r2 19: (87) r11 = -r11 20: (c7) r11 s>>= 63 21: (5f) r2 &= r11 22: (0f) r2 += r0 23: (61) r0 = *(u32 *)(r2 +0) [...] JIT blinding example with non-conflicting use of r10: [...] d5: je 0x0000000000000106 _ d7: mov 0x0(%rax),%edi | da: mov $0xf153246,%r10d | Index load from map value and e0: xor $0xf153259,%r10 | (const blinded) mask with 0x1f. e7: and %r10,%rdi |_ ea: mov $0x2f,%r10d | f0: sub %rdi,%r10 | Sanitized addition. Both use r10 f3: or %rdi,%r10 | but do not interfere with each f6: neg %r10 | other. (Neither do these instructions f9: sar $0x3f,%r10 | interfere with the use of ax as temp fd: and %r10,%rdi | in interpreter.) 100: add %rax,%rdi |_ 103: mov 0x0(%rdi),%eax [...] Tested that it fixes Jann's reproducer, and also checked that test_verifier and test_progs suite with interpreter, JIT and JIT with hardening enabled on x86-64 and arm64 runs successfully. [0] Speculose: Analyzing the Security Implications of Speculative Execution in CPUs, Giorgi Maisuradze and Christian Rossow, https://arxiv.org/pdf/1801.04084.pdf [1] A Systematic Evaluation of Transient Execution Attacks and Defenses, Claudio Canella, Jo Van Bulck, Michael Schwarz, Moritz Lipp, Benjamin von Berg, Philipp Ortner, Frank Piessens, Dmitry Evtyushkin, Daniel Gruss, https://arxiv.org/pdf/1811.05441.pdf Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation") Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 10 ++ kernel/bpf/verifier.c | 185 +++++++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3f84f3e87704..27b74947cd2b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -148,6 +148,7 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; u32 curframe; + bool speculative; }; #define bpf_get_spilled_reg(slot, frame) \ @@ -167,15 +168,24 @@ struct bpf_verifier_state_list { struct bpf_verifier_state_list *next; }; +/* Possible states for alu_state member. 
*/ +#define BPF_ALU_SANITIZE_SRC 1U +#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_NEG_VALUE (1U << 2) +#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ + BPF_ALU_SANITIZE_DST) + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ + u32 alu_limit; /* limit for add/sub register with pointer */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ bool seen; /* this insn was processed by the verifier */ + u8 alu_state; /* used in combination with alu_limit */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e5da1ce5da4..f6bc62a9ee8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; @@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -3067,6 +3070,102 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? 
BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + + /* If we arrived here from different branches with different + * limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -3087,6 +3186,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); + int ret; dst_reg = ®s[dst]; @@ -3142,6 +3242,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -3192,6 +3297,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -4389,7 +4499,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; @@ -5499,6 +5610,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. 
+ */ + if (old->speculative && !cur->speculative) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5700,6 +5817,7 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; + state->speculative = false; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5739,8 +5857,10 @@ static int do_check(struct bpf_verifier_env *env) /* found equivalent state, can prune the search */ if (env->log.level) { if (do_print_state) - verbose(env, "\nfrom %d to %d: safe\n", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else verbose(env, "%d: safe\n", env->insn_idx); } @@ -5757,8 +5877,10 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1) verbose(env, "%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d:", - env->prev_insn_idx, env->insn_idx); + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -6750,6 +6872,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) From 80c9b2fae87bb5c5698940da1a981f14f89518d1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Jan 2019 00:58:35 +0100 Subject: [PATCH 55/57] bpf: add various test cases to selftests Add various map value pointer related test cases to test_verifier kselftest to reflect recent changes and improve test coverage. 
The tests include basic masking functionality, unprivileged behavior on pointer arithmetic which goes oob, mixed bounds tests, negative unknown scalar but resulting positive offset for access and helper range, handling of arithmetic from multiple maps, various masking scenarios with subsequent map value access and others including two test cases from Jann Horn for prior fixes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_verifier.c | 1108 ++++++++++++++++++- 1 file changed, 1105 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 33f7d38849b8..10d44446e801 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -2577,6 +2578,7 @@ static struct bpf_test tests[] = { }, .result = REJECT, .errstr = "invalid stack off=-79992 size=8", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", }, { "PTR_TO_STACK store/load - out of bounds high", @@ -3104,6 +3106,8 @@ static struct bpf_test tests[] = { BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -3205,6 +3209,243 @@ static struct bpf_test tests[] = { /* Verifier rewrite for unpriv skips tail call here. */ .retval_unpriv = 2, }, + { + "PTR_TO_STACK check high 1", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check high 2", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ST_MEM(BPF_B, BPF_REG_1, -1, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check high 3", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_ST_MEM(BPF_B, BPF_REG_1, -1, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, -1), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result_unpriv = REJECT, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check high 4", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid stack off=0 size=1", + .result = REJECT, + }, + { + "PTR_TO_STACK check high 5", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check high 6", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MAX, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MAX), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check high 7", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 
(1 << 29) - 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MAX, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MAX), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "fp pointer offset", + }, + { + "PTR_TO_STACK check low 1", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -512), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check low 2", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -513), + BPF_ST_MEM(BPF_B, BPF_REG_1, 1, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 1), + BPF_EXIT_INSN(), + }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK check low 3", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -513), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "invalid stack off=-513 size=1", + .result = REJECT, + }, + { + "PTR_TO_STACK check low 4", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, INT_MIN), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "math between fp pointer", + }, + { + "PTR_TO_STACK check low 5", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check low 6", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MIN, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MIN), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off", + }, + { + "PTR_TO_STACK check low 7", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), + BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MIN, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MIN), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", + .errstr = "fp pointer offset", + }, + { + "PTR_TO_STACK mixed reg/k, 1", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK mixed reg/k, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_10), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_5, -6), + BPF_EXIT_INSN(), + }, + 
.result = ACCEPT, + .retval = 42, + }, + { + "PTR_TO_STACK mixed reg/k, 3", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = -3, + }, + { + "PTR_TO_STACK reg", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_MOV64_IMM(BPF_REG_2, -3), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), + BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .result_unpriv = REJECT, + .errstr_unpriv = "invalid stack off=0 size=1", + .result = ACCEPT, + .retval = 42, + }, { "stack pointer arithmetic", .insns = { @@ -6609,6 +6850,232 @@ static struct bpf_test tests[] = { .errstr = "R1 min value is negative", .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "map access: known scalar += value_ptr from different maps", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps", + .retval = 1, + }, + { + "map access: value_ptr -= known scalar from different maps", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_16b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 min value is outside of the array range", + .retval = 1, + }, + { + "map access: known scalar += value_ptr from different maps, but same value properties", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, len)), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + 
BPF_EXIT_INSN(), + }, + .fixup_map_hash_48b = { 5 }, + .fixup_map_array_48b = { 8 }, + .result = ACCEPT, + .retval = 1, + }, + { + "map access: value_ptr += known scalar, upper oob arith, test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 48), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr += known scalar, upper oob arith, test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 49), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr += known scalar, upper oob arith, test 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr -= known scalar, lower oob arith, test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 48), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R0 min value is outside of the array range", + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + }, + { + "map access: value_ptr -= known scalar, lower oob arith, test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + 
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 48), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr -= known scalar, lower oob arith, test 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 47), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, { "map access: known scalar += value_ptr", .insns = { @@ -6630,7 +7097,7 @@ static struct bpf_test tests[] = { .retval = 1, }, { - "map access: value_ptr += known scalar", + "map access: value_ptr += known scalar, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -6650,7 +7117,113 @@ static struct bpf_test tests[] = { .retval = 1, }, { - "map access: unknown scalar += value_ptr", + "map access: value_ptr += known scalar, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, 49), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "invalid access to map value", + }, + { + "map access: value_ptr += known scalar, 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "invalid access to map value", + }, + { + "map access: value_ptr += known scalar, 4", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, -2), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, 
BPF_REG_1), + BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 1, + }, + { + "map access: value_ptr += known scalar, 5", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_1, (6 + 1) * sizeof(int)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: value_ptr += known scalar, 6", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_IMM(BPF_REG_1, (3 + 1) * sizeof(int)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 3 * sizeof(int)), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: unknown scalar += value_ptr, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -6671,7 +7244,76 @@ static struct bpf_test tests[] = { .retval = 1, }, { - "map access: value_ptr += unknown scalar", + "map access: unknown scalar += value_ptr, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: unknown scalar += value_ptr, 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", + .retval = 0xabcdef12, + }, + { + "map access: unknown scalar += value_ptr, 4", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 
0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV64_IMM(BPF_REG_1, 19), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = REJECT, + .errstr = "R1 max value is outside of the array range", + .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", + }, + { + "map access: value_ptr += unknown scalar, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -6691,6 +7333,54 @@ static struct bpf_test tests[] = { .result = ACCEPT, .retval = 1, }, + { + "map access: value_ptr += unknown scalar, 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0xabcdef12, + }, + { + "map access: value_ptr += unknown scalar, 3", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 8), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 16), + BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), + BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 1), + BPF_ALU64_IMM(BPF_OR, BPF_REG_3, 1), + BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_3, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_JMP_IMM(BPF_JA, 0, 0, -3), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 1, + }, { "map access: value_ptr += value_ptr", .insns = { @@ -6770,6 +7460,8 @@ static struct bpf_test tests[] = { }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -6837,6 +7529,8 @@ static struct bpf_test tests[] = { }, .fixup_map_array_48b = { 3 }, .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .retval = 1, }, { @@ -8376,6 +9070,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8400,6 +9095,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8426,6 +9122,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8451,6 +9148,7 @@ static struct bpf_test tests[] = { }, 
.fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R8 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8499,6 +9197,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8570,6 +9269,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8621,6 +9321,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8648,6 +9349,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8674,6 +9376,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8703,6 +9406,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R7 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8733,6 +9437,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 4 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, { @@ -8761,6 +9466,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "unbounded min value", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, .result_unpriv = REJECT, }, @@ -8813,8 +9519,38 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.", + .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds", .result = REJECT, }, + { + "check subtraction on pointers for unpriv", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_ARG2, 0, 9), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_9, BPF_REG_FP), + BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_0), + BPF_LD_MAP_FD(BPF_REG_ARG1, 0), + BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_ARG2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 1, 9 }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R9 pointer -= pointer prohibited", + }, { "bounds check based on zero-extended MOV", .insns = { @@ -9145,6 +9881,36 @@ static struct bpf_test tests[] = { .errstr = "R0 unbounded memory access", .result = REJECT }, + { + "bounds check after 32-bit right shift with 64-bit input", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 
0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + /* r1 = 2 */ + BPF_MOV64_IMM(BPF_REG_1, 2), + /* r1 = 1<<32 */ + BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 31), + /* r1 = 0 (NOT 2!) */ + BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 31), + /* r1 = 0xffff'fffe (NOT 0!) */ + BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 2), + /* computes OOB pointer */ + BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), + /* OOB access */ + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), + /* exit */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr = "R0 invalid mem access", + .result = REJECT, + }, { "bounds check map access with off+size signed 32bit overflow. test1", .insns = { @@ -9185,6 +9951,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "pointer offset 1073741822", + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .result = REJECT }, { @@ -9206,6 +9973,7 @@ static struct bpf_test tests[] = { }, .fixup_map_hash_8b = { 3 }, .errstr = "pointer offset -1073741822", + .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .result = REJECT }, { @@ -9377,6 +10145,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN() }, .errstr = "fp pointer offset 1073741822", + .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", .result = REJECT }, { @@ -13718,6 +14487,328 @@ static struct bpf_test tests[] = { .result_unpriv = ACCEPT, .insn_processed = 15, }, + { + "masking, test out of bounds 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 5), + BPF_MOV32_IMM(BPF_REG_2, 5 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 3", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 4", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 5", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + 
BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 6", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 5), + BPF_MOV32_IMM(BPF_REG_2, 5 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 9", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 10", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0xffffffff), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 11", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test out of bounds 12", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, -1), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test in bounds 1", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 4), + BPF_MOV32_IMM(BPF_REG_2, 5 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, 
BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 4, + }, + { + "masking, test in bounds 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test in bounds 3", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xfffffffe), + BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0xfffffffe, + }, + { + "masking, test in bounds 4", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0xabcde), + BPF_MOV32_IMM(BPF_REG_2, 0xabcdef - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0xabcde, + }, + { + "masking, test in bounds 5", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 0), + BPF_MOV32_IMM(BPF_REG_2, 1 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, + { + "masking, test in bounds 6", + .insns = { + BPF_MOV32_IMM(BPF_REG_1, 46), + BPF_MOV32_IMM(BPF_REG_2, 47 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 46, + }, + { + "masking, test in bounds 7", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, -46), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, -1), + BPF_MOV32_IMM(BPF_REG_2, 47 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_3), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_3, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 46, + }, + { + "masking, test in bounds 8", + .insns = { + BPF_MOV64_IMM(BPF_REG_3, -47), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, -1), + BPF_MOV32_IMM(BPF_REG_2, 47 - 1), + BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_3), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3), + BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0), + BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63), + BPF_ALU64_REG(BPF_AND, BPF_REG_3, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, { "reference tracking in call: free reference in subprog 
and outside", .insns = { @@ -14413,6 +15504,16 @@ static int create_map(uint32_t type, uint32_t size_key, return fd; } +static void update_map(int fd, int index) +{ + struct test_val value = { + .index = (6 + 1) * sizeof(int), + .foo[6] = 0xabcdef12, + }; + + assert(!bpf_map_update_elem(fd, &index, &value, 0)); +} + static int create_prog_dummy1(enum bpf_prog_type prog_type) { struct bpf_insn prog[] = { @@ -14564,6 +15665,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, if (*fixup_map_array_48b) { map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(struct test_val), 1); + update_map(map_fds[3], 0); do { prog[*fixup_map_array_48b].imm = map_fds[3]; fixup_map_array_48b++; From 73155879b3c1ac3ace35208a54a3a160ec520bef Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 2 Jan 2019 18:26:13 -0800 Subject: [PATCH 56/57] ipv6: Fix dump of specific table with strict checking Dump of a specific table with strict checking enabled is looping. The problem is that the end of the table dump is not marked in the cb. When dumping a specific table, cb args 0 and 1 are not used (they are the hash index and entry with an hash table index when dumping all tables). Re-use args[0] to hold a 'done' flag for the specific table dump. Fixes: 13e38901d46ca ("net/ipv6: Plumb support for filtering route dumps") Reported-by: Jakub Kicinski Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ae3786132c23..6613d8dbb0e5 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -627,7 +627,11 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } - res = fib6_dump_table(tb, skb, cb); + if (!cb->args[0]) { + res = fib6_dump_table(tb, skb, cb); + if (!res) + cb->args[0] = 1; + } goto out; } From c5ee066333ebc322a24a00a743ed941a0c68617e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 2 Jan 2019 18:57:09 -0800 Subject: [PATCH 57/57] ipv6: Consider sk_bound_dev_if when binding a socket to an address IPv6 does not consider if the socket is bound to a device when binding to an address. The result is that a socket can be bound to eth0 and then bound to the address of eth1. If the device is a VRF, the result is that a socket can only be bound to an address in the default VRF. Resolve by considering the device if sk_bound_dev_if is set. This problem exists from the beginning of git history. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f0cd291034f0..0bfb6cc0a30a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -350,6 +350,9 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EINVAL; goto out_unlock; } + } + + if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { err = -ENODEV;