From 07bf7908950a8b14e81aa1807e3c667eab39287a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 1 Aug 2018 13:45:11 +0200 Subject: [PATCH 001/499] xfrm: Validate address prefix lengths in the xfrm selector. We don't validate the address prefix lengths in the xfrm selector we got from userspace. This can lead to undefined behaviour in the address matching functions if the prefix is too big for the given address family. Fix this by checking the prefixes and refuse SA/policy insertation when a prefix is invalid. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Air Icy Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 33878e6e0d0a..5151b3ebf068 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -151,10 +151,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, err = -EINVAL; switch (p->family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + goto out; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + goto out; + break; #else err = -EAFNOSUPPORT; @@ -1359,10 +1365,16 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) switch (p->sel.family) { case AF_INET: + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + return -EINVAL; + break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + return -EINVAL; + break; #else return -EAFNOSUPPORT; From 0aebe40bae6cf5652fdc3d05ecee15fbf5748194 Mon Sep 17 00:00:00 2001 From: Shreyas NC Date: Fri, 27 Jul 2018 14:44:08 +0530 Subject: [PATCH 002/499] soundwire: Fix duplicate stream state assignment For a SoundWire stream it is expected that a Slave is added to the stream before Master is added. So, move the stream state to CONFIGURED after the first Slave is added and remove the stream state assignment for Master add. Along with these changes, add additional comments to explain the same. Signed-off-by: Shreyas NC Acked-by: Pierre-Louis Bossart Signed-off-by: Vinod Koul --- drivers/soundwire/stream.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c index 4b5e250e8615..7ba6d4d8cd03 100644 --- a/drivers/soundwire/stream.c +++ b/drivers/soundwire/stream.c @@ -1123,8 +1123,6 @@ int sdw_stream_add_master(struct sdw_bus *bus, if (ret) goto stream_error; - stream->state = SDW_STREAM_CONFIGURED; - stream_error: sdw_release_master_stream(stream); error: @@ -1141,6 +1139,10 @@ EXPORT_SYMBOL(sdw_stream_add_master); * @stream: SoundWire stream * @port_config: Port configuration for audio stream * @num_ports: Number of ports + * + * It is expected that Slave is added before adding Master + * to the Stream. + * */ int sdw_stream_add_slave(struct sdw_slave *slave, struct sdw_stream_config *stream_config, @@ -1186,6 +1188,12 @@ int sdw_stream_add_slave(struct sdw_slave *slave, if (ret) goto stream_error; + /* + * Change stream state to CONFIGURED on first Slave add. + * Bus is not aware of number of Slave(s) in a stream at this + * point so cannot depend on all Slave(s) to be added in order to + * change stream state to CONFIGURED. 
+ */ stream->state = SDW_STREAM_CONFIGURED; goto error; From 3fef1a2259c556cce34df2791688cb3001f81c92 Mon Sep 17 00:00:00 2001 From: Shreyas NC Date: Fri, 27 Jul 2018 14:44:09 +0530 Subject: [PATCH 003/499] soundwire: Fix incorrect exit after configuring stream In sdw_stream_add_master() after the Master ports are configured, the stream is released incorrectly. So, fix it by avoiding stream release after configuring the Master for the stream. While at it, rename the label appropriately. Signed-off-by: Shreyas NC Acked-by: Pierre-Louis Bossart Signed-off-by: Vinod Koul --- drivers/soundwire/stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c index 7ba6d4d8cd03..b2682272503e 100644 --- a/drivers/soundwire/stream.c +++ b/drivers/soundwire/stream.c @@ -1112,7 +1112,7 @@ int sdw_stream_add_master(struct sdw_bus *bus, "Master runtime config failed for stream:%s", stream->name); ret = -ENOMEM; - goto error; + goto unlock; } ret = sdw_config_stream(bus->dev, stream, stream_config, false); @@ -1123,9 +1123,11 @@ int sdw_stream_add_master(struct sdw_bus *bus, if (ret) goto stream_error; + goto unlock; + stream_error: sdw_release_master_stream(stream); -error: +unlock: mutex_unlock(&bus->bus_lock); return ret; } From 8d6ccf5cebbc7ed1dee9986e36853a78dfb64084 Mon Sep 17 00:00:00 2001 From: Sanyog Kale Date: Fri, 27 Jul 2018 14:44:10 +0530 Subject: [PATCH 004/499] soundwire: Fix acquiring bus lock twice during master release As part of sdw_stream_remove_master(), sdw_stream_remove_slave() is called which results in bus lock being acquired twice. So, fix it by performing specific Slave remove operations in sdw_release_master_stream() instead of calling sdw_stream_remove_slave(). Signed-off-by: Sanyog Kale Signed-off-by: Shreyas NC Acked-by: Pierre-Louis Bossart Signed-off-by: Vinod Koul --- drivers/soundwire/stream.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c index b2682272503e..e5c7e1ef6318 100644 --- a/drivers/soundwire/stream.c +++ b/drivers/soundwire/stream.c @@ -899,9 +899,10 @@ static void sdw_release_master_stream(struct sdw_stream_runtime *stream) struct sdw_master_runtime *m_rt = stream->m_rt; struct sdw_slave_runtime *s_rt, *_s_rt; - list_for_each_entry_safe(s_rt, _s_rt, - &m_rt->slave_rt_list, m_rt_node) - sdw_stream_remove_slave(s_rt->slave, stream); + list_for_each_entry_safe(s_rt, _s_rt, &m_rt->slave_rt_list, m_rt_node) { + sdw_slave_port_release(s_rt->slave->bus, s_rt->slave, stream); + sdw_release_slave_stream(s_rt->slave, stream); + } list_del(&m_rt->bus_node); } From 823f18f8b860526fc099c222619a126d57d2ad8c Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Wed, 29 Aug 2018 15:36:10 +0300 Subject: [PATCH 005/499] regulator: bd71837: Disable voltage monitoring for LDO3/4 There is a HW quirk in BD71837. The shutdown sequence timings for bucks/LDOs which are enabled via register interface are changed. At PMIC poweroff the voltage for BUCK6/7 is cut immediately at the beginning of shut-down sequence. This causes LDO5/6 voltage monitoring to detect under voltage and force PMIC to emergency state instead of poweroff. Disable voltage monitoring for LDO5 and LDO6 at probe to avoid this. 
Signed-off-by: Matti Vaittinen Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/regulator/bd71837-regulator.c | 19 +++++++++++++++ include/linux/mfd/rohm-bd718x7.h | 33 ++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/bd71837-regulator.c b/drivers/regulator/bd71837-regulator.c index 0f8ac8dec3e1..a1bd8aaf4d98 100644 --- a/drivers/regulator/bd71837-regulator.c +++ b/drivers/regulator/bd71837-regulator.c @@ -569,6 +569,25 @@ static int bd71837_probe(struct platform_device *pdev) BD71837_REG_REGLOCK); } + /* + * There is a HW quirk in BD71837. The shutdown sequence timings for + * bucks/LDOs which are controlled via register interface are changed. + * At PMIC poweroff the voltage for BUCK6/7 is cut immediately at the + * beginning of shut-down sequence. As bucks 6 and 7 are parent + * supplies for LDO5 and LDO6 - this causes LDO5/6 voltage + * monitoring to errorneously detect under voltage and force PMIC to + * emergency state instead of poweroff. In order to avoid this we + * disable voltage monitoring for LDO5 and LDO6 + */ + err = regmap_update_bits(pmic->mfd->regmap, BD718XX_REG_MVRFLTMASK2, + BD718XX_LDO5_VRMON80 | BD718XX_LDO6_VRMON80, + BD718XX_LDO5_VRMON80 | BD718XX_LDO6_VRMON80); + if (err) { + dev_err(&pmic->pdev->dev, + "Failed to disable voltage monitoring\n"); + goto err; + } + for (i = 0; i < ARRAY_SIZE(pmic_regulator_inits); i++) { struct regulator_desc *desc; diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h index a528747f8aed..e8338e5dc10b 100644 --- a/include/linux/mfd/rohm-bd718x7.h +++ b/include/linux/mfd/rohm-bd718x7.h @@ -78,9 +78,9 @@ enum { BD71837_REG_TRANS_COND0 = 0x1F, BD71837_REG_TRANS_COND1 = 0x20, BD71837_REG_VRFAULTEN = 0x21, - BD71837_REG_MVRFLTMASK0 = 0x22, - BD71837_REG_MVRFLTMASK1 = 0x23, - BD71837_REG_MVRFLTMASK2 = 0x24, + BD718XX_REG_MVRFLTMASK0 = 0x22, + BD718XX_REG_MVRFLTMASK1 = 0x23, + BD718XX_REG_MVRFLTMASK2 = 0x24, BD71837_REG_RCVCFG = 0x25, BD71837_REG_RCVNUM = 0x26, BD71837_REG_PWRONCONFIG0 = 0x27, @@ -159,6 +159,33 @@ enum { #define BUCK8_MASK 0x3F #define BUCK8_DEFAULT 0x1E +/* BD718XX Voltage monitoring masks */ +#define BD718XX_BUCK1_VRMON80 0x1 +#define BD718XX_BUCK1_VRMON130 0x2 +#define BD718XX_BUCK2_VRMON80 0x4 +#define BD718XX_BUCK2_VRMON130 0x8 +#define BD718XX_1ST_NODVS_BUCK_VRMON80 0x1 +#define BD718XX_1ST_NODVS_BUCK_VRMON130 0x2 +#define BD718XX_2ND_NODVS_BUCK_VRMON80 0x4 +#define BD718XX_2ND_NODVS_BUCK_VRMON130 0x8 +#define BD718XX_3RD_NODVS_BUCK_VRMON80 0x10 +#define BD718XX_3RD_NODVS_BUCK_VRMON130 0x20 +#define BD718XX_4TH_NODVS_BUCK_VRMON80 0x40 +#define BD718XX_4TH_NODVS_BUCK_VRMON130 0x80 +#define BD718XX_LDO1_VRMON80 0x1 +#define BD718XX_LDO2_VRMON80 0x2 +#define BD718XX_LDO3_VRMON80 0x4 +#define BD718XX_LDO4_VRMON80 0x8 +#define BD718XX_LDO5_VRMON80 0x10 +#define BD718XX_LDO6_VRMON80 0x20 + +/* BD71837 specific voltage monitoring masks */ +#define BD71837_BUCK3_VRMON80 0x10 +#define BD71837_BUCK3_VRMON130 0x20 +#define BD71837_BUCK4_VRMON80 0x40 +#define BD71837_BUCK4_VRMON130 0x80 +#define BD71837_LDO7_VRMON80 0x40 + /* BD71837_REG_IRQ bits */ #define IRQ_SWRST 0x40 #define IRQ_PWRON_S 0x20 From 4fbf51ee6e822ac28be28ff25883745131e8c4a3 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 23 Aug 2018 09:55:43 -0400 Subject: [PATCH 006/499] media: video_function_calls.rst: drop obsolete video-set-attributes reference This fixes this warning: Documentation/media/uapi/dvb/video_function_calls.rst:9: WARNING: toctree 
contains reference to nonexisting document 'uapi/dvb/video-set-attributes' Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/dvb/video_function_calls.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/media/uapi/dvb/video_function_calls.rst b/Documentation/media/uapi/dvb/video_function_calls.rst index 3f4f6c9ffad7..a4222b6cd2d3 100644 --- a/Documentation/media/uapi/dvb/video_function_calls.rst +++ b/Documentation/media/uapi/dvb/video_function_calls.rst @@ -33,4 +33,3 @@ Video Function Calls video-clear-buffer video-set-streamtype video-set-format - video-set-attributes From 312f73b648626a0526a3aceebb0a3192aaba05ce Mon Sep 17 00:00:00 2001 From: Jozef Balga Date: Tue, 21 Aug 2018 05:01:04 -0400 Subject: [PATCH 007/499] media: af9035: prevent buffer overflow on write When less than 3 bytes are written to the device, memcpy is called with negative array size which leads to buffer overflow and kernel panic. This patch adds a condition and returns -EOPNOTSUPP instead. Fixes bugzilla issue 64871 [mchehab+samsung@kernel.org: fix a merge conflict and changed the condition to match the patch's comment, e. g. len == 3 could also be valid] Signed-off-by: Jozef Balga Signed-off-by: Mauro Carvalho Chehab --- drivers/media/usb/dvb-usb-v2/af9035.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/usb/dvb-usb-v2/af9035.c b/drivers/media/usb/dvb-usb-v2/af9035.c index 666d319d3d1a..1f6c1eefe389 100644 --- a/drivers/media/usb/dvb-usb-v2/af9035.c +++ b/drivers/media/usb/dvb-usb-v2/af9035.c @@ -402,8 +402,10 @@ static int af9035_i2c_master_xfer(struct i2c_adapter *adap, if (msg[0].addr == state->af9033_i2c_addr[1]) reg |= 0x100000; - ret = af9035_wr_regs(d, reg, &msg[0].buf[3], - msg[0].len - 3); + ret = (msg[0].len >= 3) ? af9035_wr_regs(d, reg, + &msg[0].buf[3], + msg[0].len - 3) + : -EOPNOTSUPP; } else { /* I2C write */ u8 buf[MAX_XFER_SIZE]; From 44a9ffd4eb99ada6c1496a735a414414fef30696 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Aug 2018 05:53:42 -0400 Subject: [PATCH 008/499] media: camss: mark PM functions as __maybe_unused The empty suspend/resume functions cause a build warning when they are unused: drivers/media/platform/qcom/camss/camss.c:1001:12: error: 'camss_runtime_resume' defined but not used [-Werror=unused-function] drivers/media/platform/qcom/camss/camss.c:996:12: error: 'camss_runtime_suspend' defined but not used [-Werror=unused-function] Mark them as __maybe_unused so the compiler can silently drop them. 
Fixes: 02afa816dbbf ("media: camss: Add basic runtime PM support") Signed-off-by: Arnd Bergmann Acked-by: Todor Tomov Acked-by: Geert Uytterhoeven Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/camss/camss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss.c b/drivers/media/platform/qcom/camss/camss.c index dcc0c30ef1b1..9f19d5f5966b 100644 --- a/drivers/media/platform/qcom/camss/camss.c +++ b/drivers/media/platform/qcom/camss/camss.c @@ -993,12 +993,12 @@ static const struct of_device_id camss_dt_match[] = { MODULE_DEVICE_TABLE(of, camss_dt_match); -static int camss_runtime_suspend(struct device *dev) +static int __maybe_unused camss_runtime_suspend(struct device *dev) { return 0; } -static int camss_runtime_resume(struct device *dev) +static int __maybe_unused camss_runtime_resume(struct device *dev) { return 0; } From 55b518998996cc935d294bbf1f07ddba58b8b89e Mon Sep 17 00:00:00 2001 From: Todor Tomov Date: Tue, 14 Aug 2018 11:57:35 -0400 Subject: [PATCH 009/499] media: camss: Use managed memory allocations Use managed memory allocations for structs which are used until the driver is removed. Signed-off-by: Todor Tomov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/camss/camss-ispif.c | 4 ++-- drivers/media/platform/qcom/camss/camss.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss-ispif.c b/drivers/media/platform/qcom/camss/camss-ispif.c index 7f269021d08c..ca7b0917500d 100644 --- a/drivers/media/platform/qcom/camss/camss-ispif.c +++ b/drivers/media/platform/qcom/camss/camss-ispif.c @@ -1076,8 +1076,8 @@ int msm_ispif_subdev_init(struct ispif_device *ispif, else return -EINVAL; - ispif->line = kcalloc(ispif->line_num, sizeof(*ispif->line), - GFP_KERNEL); + ispif->line = devm_kcalloc(dev, ispif->line_num, sizeof(*ispif->line), + GFP_KERNEL); if (!ispif->line) return -ENOMEM; diff --git a/drivers/media/platform/qcom/camss/camss.c b/drivers/media/platform/qcom/camss/camss.c index 9f19d5f5966b..669615fff6a0 100644 --- a/drivers/media/platform/qcom/camss/camss.c +++ b/drivers/media/platform/qcom/camss/camss.c @@ -848,17 +848,18 @@ static int camss_probe(struct platform_device *pdev) return -EINVAL; } - camss->csiphy = kcalloc(camss->csiphy_num, sizeof(*camss->csiphy), - GFP_KERNEL); + camss->csiphy = devm_kcalloc(dev, camss->csiphy_num, + sizeof(*camss->csiphy), GFP_KERNEL); if (!camss->csiphy) return -ENOMEM; - camss->csid = kcalloc(camss->csid_num, sizeof(*camss->csid), - GFP_KERNEL); + camss->csid = devm_kcalloc(dev, camss->csid_num, sizeof(*camss->csid), + GFP_KERNEL); if (!camss->csid) return -ENOMEM; - camss->vfe = kcalloc(camss->vfe_num, sizeof(*camss->vfe), GFP_KERNEL); + camss->vfe = devm_kcalloc(dev, camss->vfe_num, sizeof(*camss->vfe), + GFP_KERNEL); if (!camss->vfe) return -ENOMEM; From 3799eca51c5be3cd76047a582ac52087373b54b3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2018 06:27:28 -0400 Subject: [PATCH 010/499] media: camss: add missing includes Multiple files in this driver fail to build because of missing header inclusions: drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c: In function 'csiphy_hw_version_read': drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c:31:18: error: implicit declaration of function 'readl_relaxed'; did you mean 'xchg_relaxed'? 
[-Werror=implicit-function-declaration] drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c: In function 'csiphy_hw_version_read': drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c:52:2: error: implicit declaration of function 'writel' [-Werror=implicit-function-declaration] Add linux/io.h there and in all other files that call readl/writel and related interfaces. Signed-off-by: Arnd Bergmann Signed-off-by: Todor Tomov Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/qcom/camss/camss-csid.c | 1 + drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c | 1 + drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c | 1 + drivers/media/platform/qcom/camss/camss-csiphy.c | 1 + drivers/media/platform/qcom/camss/camss-ispif.c | 1 + drivers/media/platform/qcom/camss/camss-vfe-4-1.c | 1 + drivers/media/platform/qcom/camss/camss-vfe-4-7.c | 1 + 7 files changed, 7 insertions(+) diff --git a/drivers/media/platform/qcom/camss/camss-csid.c b/drivers/media/platform/qcom/camss/camss-csid.c index 729b31891466..a5ae85674ffb 100644 --- a/drivers/media/platform/qcom/camss/camss-csid.c +++ b/drivers/media/platform/qcom/camss/camss-csid.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c b/drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c index c832539397d7..12bce391d71f 100644 --- a/drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c +++ b/drivers/media/platform/qcom/camss/camss-csiphy-2ph-1-0.c @@ -12,6 +12,7 @@ #include #include +#include #define CAMSS_CSI_PHY_LNn_CFG2(n) (0x004 + 0x40 * (n)) #define CAMSS_CSI_PHY_LNn_CFG3(n) (0x008 + 0x40 * (n)) diff --git a/drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c b/drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c index bcd0dfd33618..2e65caf1ecae 100644 --- a/drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c +++ b/drivers/media/platform/qcom/camss/camss-csiphy-3ph-1-0.c @@ -12,6 +12,7 @@ #include #include +#include #define CSIPHY_3PH_LNn_CFG1(n) (0x000 + 0x100 * (n)) #define CSIPHY_3PH_LNn_CFG1_SWI_REC_DLY_PRG (BIT(7) | BIT(6)) diff --git a/drivers/media/platform/qcom/camss/camss-csiphy.c b/drivers/media/platform/qcom/camss/camss-csiphy.c index 4559f3b1b38c..008afb85023b 100644 --- a/drivers/media/platform/qcom/camss/camss-csiphy.c +++ b/drivers/media/platform/qcom/camss/camss-csiphy.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/platform/qcom/camss/camss-ispif.c b/drivers/media/platform/qcom/camss/camss-ispif.c index ca7b0917500d..1f33b4eb198c 100644 --- a/drivers/media/platform/qcom/camss/camss-ispif.c +++ b/drivers/media/platform/qcom/camss/camss-ispif.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/platform/qcom/camss/camss-vfe-4-1.c b/drivers/media/platform/qcom/camss/camss-vfe-4-1.c index da3a9fed9f2d..174a36be6f5d 100644 --- a/drivers/media/platform/qcom/camss/camss-vfe-4-1.c +++ b/drivers/media/platform/qcom/camss/camss-vfe-4-1.c @@ -9,6 +9,7 @@ */ #include +#include #include #include "camss-vfe.h" diff --git a/drivers/media/platform/qcom/camss/camss-vfe-4-7.c b/drivers/media/platform/qcom/camss/camss-vfe-4-7.c index 4c584bffd179..0dca8bf9281e 100644 --- a/drivers/media/platform/qcom/camss/camss-vfe-4-7.c +++ b/drivers/media/platform/qcom/camss/camss-vfe-4-7.c @@ -9,6 +9,7 @@ */ #include +#include #include #include "camss-vfe.h" From 
ef394f3fbecbe61d69450ad8cf0fa8f713c8ce8f Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 31 Aug 2018 14:34:02 +0200 Subject: [PATCH 011/499] regulator: da9063: fix DT probing with constraints Commit 1c892e38ce59 ("regulator: da9063: Handle less LDOs on DA9063L") reordered the da9063_regulator_info[] array, but not the DA9063_ID_* regulator ids and not the da9063_matches[] array, because ids are used as indices in the array initializer. This mismatch between regulator id and da9063_regulator_info[] array index causes the driver probe to fail because constraints from DT are not applied to the correct regulator: da9063 0-0058: Device detected (chip-ID: 0x61, var-ID: 0x50) DA9063_BMEM: Bringing 900000uV into 3300000-3300000uV DA9063_LDO9: Bringing 3300000uV into 2500000-2500000uV DA9063_LDO1: Bringing 900000uV into 3300000-3300000uV DA9063_LDO1: failed to apply 3300000-3300000uV constraint(-22) This patch reorders the DA9063_ID_* as apparently intended, and with them the entries in the da90630_matches[] array. Fixes: 1c892e38ce59 ("regulator: da9063: Handle less LDOs on DA9063L") Signed-off-by: Philipp Zabel Reviewed-by: Geert Uytterhoeven Reviewed-by: Marek Vasut Signed-off-by: Mark Brown --- include/linux/mfd/da9063/pdata.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h index 8a125701ef7b..50bed4f89c1a 100644 --- a/include/linux/mfd/da9063/pdata.h +++ b/include/linux/mfd/da9063/pdata.h @@ -21,7 +21,7 @@ /* * Regulator configuration */ -/* DA9063 regulator IDs */ +/* DA9063 and DA9063L regulator IDs */ enum { /* BUCKs */ DA9063_ID_BCORE1, @@ -37,18 +37,20 @@ enum { DA9063_ID_BMEM_BIO_MERGED, /* When two BUCKs are merged, they cannot be reused separately */ - /* LDOs */ - DA9063_ID_LDO1, - DA9063_ID_LDO2, + /* LDOs on both DA9063 and DA9063L */ DA9063_ID_LDO3, - DA9063_ID_LDO4, - DA9063_ID_LDO5, - DA9063_ID_LDO6, DA9063_ID_LDO7, DA9063_ID_LDO8, DA9063_ID_LDO9, - DA9063_ID_LDO10, DA9063_ID_LDO11, + + /* DA9063-only LDOs */ + DA9063_ID_LDO1, + DA9063_ID_LDO2, + DA9063_ID_LDO4, + DA9063_ID_LDO5, + DA9063_ID_LDO6, + DA9063_ID_LDO10, }; /* Regulators platform data */ From 215ab0f021c9fea3c18b75e7d522400ee6a49990 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 31 Aug 2018 08:38:49 -0300 Subject: [PATCH 012/499] xfrm6: call kfree_skb when skb is toobig After commit d6990976af7c5d8f55903bfb4289b6fb030bf754 ("vti6: fix PMTU caching and reporting on xmit"), some too big skbs might be potentially passed down to __xfrm6_output, causing it to fail to transmit but not free the skb, causing a leak of skb, and consequentially a leak of dst references. After running pmtu.sh, that shows as failure to unregister devices in a namespace: [ 311.397671] unregister_netdevice: waiting for veth_b to become free. Usage count = 1 The fix is to call kfree_skb in case of transmit failures. 
Fixes: dd767856a36e ("xfrm6: Don't call icmpv6_send on local error") Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5959ce9620eb..6a74080005cf 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -170,9 +170,11 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } else if (!skb->ignore_df && toobig && skb->sk) { xfrm_local_error(skb, mtu); + kfree_skb(skb); return -EMSGSIZE; } From 7001cab1dabc0b72b2b672ef58a90ab64f5e2343 Mon Sep 17 00:00:00 2001 From: Marcel Ziswiler Date: Wed, 29 Aug 2018 08:47:57 +0200 Subject: [PATCH 013/499] spi: tegra20-slink: explicitly enable/disable clock Depending on the SPI instance one may get an interrupt storm upon requesting resp. interrupt unless the clock is explicitly enabled beforehand. This has been observed trying to bring up instance 4 on T20. Signed-off-by: Marcel Ziswiler Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-tegra20-slink.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/spi/spi-tegra20-slink.c b/drivers/spi/spi-tegra20-slink.c index 6f7b946b5ced..1427f343b39a 100644 --- a/drivers/spi/spi-tegra20-slink.c +++ b/drivers/spi/spi-tegra20-slink.c @@ -1063,6 +1063,24 @@ static int tegra_slink_probe(struct platform_device *pdev) goto exit_free_master; } + /* disabled clock may cause interrupt storm upon request */ + tspi->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(tspi->clk)) { + ret = PTR_ERR(tspi->clk); + dev_err(&pdev->dev, "Can not get clock %d\n", ret); + goto exit_free_master; + } + ret = clk_prepare(tspi->clk); + if (ret < 0) { + dev_err(&pdev->dev, "Clock prepare failed %d\n", ret); + goto exit_free_master; + } + ret = clk_enable(tspi->clk); + if (ret < 0) { + dev_err(&pdev->dev, "Clock enable failed %d\n", ret); + goto exit_free_master; + } + spi_irq = platform_get_irq(pdev, 0); tspi->irq = spi_irq; ret = request_threaded_irq(tspi->irq, tegra_slink_isr, @@ -1071,14 +1089,7 @@ static int tegra_slink_probe(struct platform_device *pdev) if (ret < 0) { dev_err(&pdev->dev, "Failed to register ISR for IRQ %d\n", tspi->irq); - goto exit_free_master; - } - - tspi->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(tspi->clk)) { - dev_err(&pdev->dev, "can not get clock\n"); - ret = PTR_ERR(tspi->clk); - goto exit_free_irq; + goto exit_clk_disable; } tspi->rst = devm_reset_control_get_exclusive(&pdev->dev, "spi"); @@ -1138,6 +1149,8 @@ static int tegra_slink_probe(struct platform_device *pdev) tegra_slink_deinit_dma_param(tspi, true); exit_free_irq: free_irq(spi_irq, tspi); +exit_clk_disable: + clk_disable(tspi->clk); exit_free_master: spi_master_put(master); return ret; @@ -1150,6 +1163,8 @@ static int tegra_slink_remove(struct platform_device *pdev) free_irq(tspi->irq, tspi); + clk_disable(tspi->clk); + if (tspi->tx_dma_chan) tegra_slink_deinit_dma_param(tspi, false); From 3edd79cf5a44b12dbb13bc320f5788aed6562b36 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 3 Sep 2018 16:49:37 +0200 Subject: [PATCH 014/499] regulator: Fix 'do-nothing' value for regulators without suspend state Some regulators don't have all states defined and in such cases regulator core should not assume anything. 
However in current implementation of of_get_regulation_constraints() DO_NOTHING_IN_SUSPEND enable value was set only for regulators which had suspend node defined, otherwise the default 0 value was used, what means DISABLE_IN_SUSPEND. This lead to broken system suspend/resume on boards, which had simple regulator constraints definition (without suspend state nodes). To avoid further mismatches between the default and uninitialized values of the suspend enabled/disabled states, change the values of the them, so default '0' means DO_NOTHING_IN_SUSPEND. Fixes: 72069f9957a1: regulator: leave one item to record whether regulator is enabled Signed-off-by: Marek Szyprowski Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/regulator/core.c | 2 +- drivers/regulator/of_regulator.c | 2 -- include/linux/regulator/machine.h | 6 +++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index bb1324f93143..90215f57270f 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3161,7 +3161,7 @@ static inline int regulator_suspend_toggle(struct regulator_dev *rdev, if (!rstate->changeable) return -EPERM; - rstate->enabled = en; + rstate->enabled = (en) ? ENABLE_IN_SUSPEND : DISABLE_IN_SUSPEND; return 0; } diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 638f17d4c848..210fc20f7de7 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -213,8 +213,6 @@ static void of_get_regulation_constraints(struct device_node *np, else if (of_property_read_bool(suspend_np, "regulator-off-in-suspend")) suspend_state->enabled = DISABLE_IN_SUSPEND; - else - suspend_state->enabled = DO_NOTHING_IN_SUSPEND; if (!of_property_read_u32(np, "regulator-suspend-min-microvolt", &pval)) diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 3468703d663a..a459a5e973a7 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -48,9 +48,9 @@ struct regulator; * DISABLE_IN_SUSPEND - turn off regulator in suspend states * ENABLE_IN_SUSPEND - keep regulator on in suspend states */ -#define DO_NOTHING_IN_SUSPEND (-1) -#define DISABLE_IN_SUSPEND 0 -#define ENABLE_IN_SUSPEND 1 +#define DO_NOTHING_IN_SUSPEND 0 +#define DISABLE_IN_SUSPEND 1 +#define ENABLE_IN_SUSPEND 2 /* Regulator active discharge flags */ enum regulator_active_discharge { From bfc0698bebcb16d19ecfc89574ad4d696955e5d3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 3 Sep 2018 04:36:52 -0700 Subject: [PATCH 015/499] xfrm: reset transport header back to network header after all input transforms ahave been applied A policy may have been set up with multiple transforms (e.g., ESP and ipcomp). In this situation, the ingress IPsec processing iterates in xfrm_input() and applies each transform in turn, processing the nexthdr to find any additional xfrm that may apply. This patch resets the transport header back to network header only after the last transformation so that subsequent xfrms can find the correct transport header. 
Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Suggested-by: Steffen Klassert Signed-off-by: Sowmini Varadhan Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_input.c | 1 + net/ipv4/xfrm4_mode_transport.c | 4 +--- net/ipv6/xfrm6_input.c | 1 + net/ipv6/xfrm6_mode_transport.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index bcfc00e88756..f8de2482a529 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 3d36644890bb..1ad2c2c4e250 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841f4a07438e..9ef490dddcea 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -59,6 +59,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); + skb_reset_transport_header(skb); return -1; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 9ad07a91708e..3c29da5defe6 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -51,7 +51,6 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); - struct xfrm_offload *xo = xfrm_offload(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), @@ -60,8 +59,7 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); - if (!xo || !(xo->flags & XFRM_GRO)) - skb_reset_transport_header(skb); + skb_reset_transport_header(skb); return 0; } From 782710e333a526780d65918d669cb96646983ba2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 3 Sep 2018 04:36:53 -0700 Subject: [PATCH 016/499] xfrm: reset crypto_done when iterating over multiple input xfrms We only support one offloaded xfrm (we do not have devices that can handle more than one offload), so reset crypto_done in xfrm_input() when iterating over multiple transforms in xfrm_input, so that we can invoke the appropriate x->type->input for the non-offloaded transforms Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Sowmini Varadhan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 352abca2605f..86f5afbd0a0c 100644 --- 
a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -453,6 +453,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); From 1723c3155f117ee6e00f28fadf6e9eda4fc85806 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 4 Sep 2018 15:39:30 +0200 Subject: [PATCH 017/499] spi: gpio: Fix copy-and-paste error This fixes an embarrassing copy-and-paste error in the errorpath of spi_gpio_request(): we were checking the wrong struct member for error code right after retrieveing the sck GPIO. Fixes: 9b00bc7b901ff672 ("spi: spi-gpio: Rewrite to use GPIO descriptors") Reviewed-by: Geert Uytterhoeven Signed-off-by: Linus Walleij Signed-off-by: Mark Brown --- drivers/spi/spi-gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c index 0626e6e3ea0c..421bfc7dda67 100644 --- a/drivers/spi/spi-gpio.c +++ b/drivers/spi/spi-gpio.c @@ -300,8 +300,8 @@ static int spi_gpio_request(struct device *dev, *mflags |= SPI_MASTER_NO_RX; spi_gpio->sck = devm_gpiod_get(dev, "sck", GPIOD_OUT_LOW); - if (IS_ERR(spi_gpio->mosi)) - return PTR_ERR(spi_gpio->mosi); + if (IS_ERR(spi_gpio->sck)) + return PTR_ERR(spi_gpio->sck); for (i = 0; i < num_chipselects; i++) { spi_gpio->cs_gpios[i] = devm_gpiod_get_index(dev, "cs", From 8682250b3c1b75a45feb7452bc413d004cfe3778 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:13 +0300 Subject: [PATCH 018/499] mac80211: Always report TX status If a frame is dropped for any reason, mac80211 wouldn't report the TX status back to user space. As the user space may rely on the TX_STATUS to kick its state machines, resends etc, it's better to just report this frame as not acked instead. Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/status.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 9a6d7208bf4f..001a869c059c 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -479,11 +479,6 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (!skb) return; - if (dropped) { - dev_kfree_skb_any(skb); - return; - } - if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; @@ -506,6 +501,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, } rcu_read_unlock(); + dev_kfree_skb_any(skb); + } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ From 94a5b3acd0aef83c0e38b5117eda7b2abf4a05a4 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:14 +0300 Subject: [PATCH 019/499] mac80211: Don't wake up from PS for offchannel TX Otherwise the offchannel frame might be queued due to IEEE80211_QUEUE_STOP_REASON_PS and later dropped (in ieee80211_tx_frags()). Anyway, it doesn't make much sense to wake up the device during ROC. 
Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f353d9db54bc..131542513c8f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -214,6 +214,7 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) { struct ieee80211_local *local = tx->local; struct ieee80211_if_managed *ifmgd; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); /* driver doesn't support power save */ if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS)) @@ -242,6 +243,9 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type != NL80211_IFTYPE_STATION) return TX_CONTINUE; + if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK)) + return TX_CONTINUE; + ifmgd = &tx->sdata->u.mgd; /* From 24f33e64fcd0d50a4b1a8e5b41bd0257aa66b0e8 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 5 Sep 2018 08:06:12 +0300 Subject: [PATCH 020/499] cfg80211: reg: Init wiphy_idx in regulatory_hint_core() Core regulatory hints didn't set wiphy_idx to WIPHY_IDX_INVALID. Since the regulatory request is zeroed, wiphy_idx was always implicitly set to 0. This resulted in updating only phy #0. Fix that. Fixes: 806a9e39670b ("cfg80211: make regulatory_request use wiphy_idx instead of wiphy") Signed-off-by: Andrei Otcheretianski Signed-off-by: Luca Coelho [add fixes tag] Signed-off-by: Johannes Berg --- net/wireless/reg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 2f702adf2912..765dedb12361 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2867,6 +2867,7 @@ static int regulatory_hint_core(const char *alpha2) request->alpha2[0] = alpha2[0]; request->alpha2[1] = alpha2[1]; request->initiator = NL80211_REGDOM_SET_BY_CORE; + request->wiphy_idx = WIPHY_IDX_INVALID; queue_regulatory_request(request); From 6eae4a6c2be387fec41b0d2782c4fffb57159498 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Wed, 5 Sep 2018 06:22:59 -0400 Subject: [PATCH 021/499] mac80211: fix pending queue hang due to TX_DROP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In our environment running lots of mesh nodes, we are seeing the pending queue hang periodically, with the debugfs queues file showing lines such as: 00: 0x00000000/348 i.e. there are a large number of frames but no stop reason set. One way this could happen is if queue processing from the pending tasklet exited early without processing all frames, and without having some future event (incoming frame, stop reason flag, ...) to reschedule it. Exactly this can occur today if ieee80211_tx() returns false due to packet drops or power-save buffering in the tx handlers. In the past, this function would return true in such cases, and the change to false doesn't seem to be intentional. Fix this case by reverting to the previous behavior. 
Fixes: bb42f2d13ffc ("mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue") Signed-off-by: Bob Copeland Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 131542513c8f..25ba24bef8f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1894,7 +1894,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; if (invoke_tx_handlers_early(&tx)) - return false; + return true; if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) return true; From ffa69d6a16f686efe45269342474e421f2aa58b2 Mon Sep 17 00:00:00 2001 From: Gaku Inami Date: Wed, 5 Sep 2018 10:49:36 +0200 Subject: [PATCH 022/499] spi: sh-msiof: Fix invalid SPI use during system suspend If the SPI queue is running during system suspend, the system may lock up. Fix this by stopping/restarting the queue during system suspend/resume by calling spi_master_suspend()/spi_master_resume() from the PM callbacks. In-kernel users will receive an -ESHUTDOWN error while system suspend/resume is in progress. Signed-off-by: Gaku Inami Signed-off-by: Hiromitsu Yamasaki [geert: Cleanup, reword] Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-sh-msiof.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 539d6d1a277a..bfe4e6d4f7bf 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -1426,12 +1426,37 @@ static const struct platform_device_id spi_driver_ids[] = { }; MODULE_DEVICE_TABLE(platform, spi_driver_ids); +#ifdef CONFIG_PM_SLEEP +static int sh_msiof_spi_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct sh_msiof_spi_priv *p = platform_get_drvdata(pdev); + + return spi_master_suspend(p->master); +} + +static int sh_msiof_spi_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct sh_msiof_spi_priv *p = platform_get_drvdata(pdev); + + return spi_master_resume(p->master); +} + +static SIMPLE_DEV_PM_OPS(sh_msiof_spi_pm_ops, sh_msiof_spi_suspend, + sh_msiof_spi_resume); +#define DEV_PM_OPS &sh_msiof_spi_pm_ops +#else +#define DEV_PM_OPS NULL +#endif /* CONFIG_PM_SLEEP */ + static struct platform_driver sh_msiof_spi_drv = { .probe = sh_msiof_spi_probe, .remove = sh_msiof_spi_remove, .id_table = spi_driver_ids, .driver = { .name = "spi_sh_msiof", + .pm = DEV_PM_OPS, .of_match_table = of_match_ptr(sh_msiof_match), }, }; From 31a5fae4c5a009898da6d177901d5328051641ff Mon Sep 17 00:00:00 2001 From: Hiromitsu Yamasaki Date: Wed, 5 Sep 2018 10:49:37 +0200 Subject: [PATCH 023/499] spi: sh-msiof: Fix handling of write value for SISTR register This patch changes writing to the SISTR register according to the H/W user's manual. The TDREQ bit and RDREQ bits of SISTR are read-only, and must be written their initial values of zero. 
Signed-off-by: Hiromitsu Yamasaki [geert: reword] Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-sh-msiof.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index bfe4e6d4f7bf..101cd6aae2ea 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -397,7 +397,8 @@ static void sh_msiof_spi_set_mode_regs(struct sh_msiof_spi_priv *p, static void sh_msiof_reset_str(struct sh_msiof_spi_priv *p) { - sh_msiof_write(p, STR, sh_msiof_read(p, STR)); + sh_msiof_write(p, STR, + sh_msiof_read(p, STR) & ~(STR_TDREQ | STR_RDREQ)); } static void sh_msiof_spi_write_fifo_8(struct sh_msiof_spi_priv *p, From c1ca59c22c56930b377a665fdd1b43351887830b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Sep 2018 10:49:38 +0200 Subject: [PATCH 024/499] spi: rspi: Fix invalid SPI use during system suspend If the SPI queue is running during system suspend, the system may lock up. Fix this by stopping/restarting the queue during system suspend/resume, by calling spi_master_suspend()/spi_master_resume() from the PM callbacks. In-kernel users will receive an -ESHUTDOWN error while system suspend/resume is in progress. Based on a patch for sh-msiof by Gaku Inami. Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-rspi.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index 95dc4d78618d..f93a4587e3fb 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -1350,12 +1350,36 @@ static const struct platform_device_id spi_driver_ids[] = { MODULE_DEVICE_TABLE(platform, spi_driver_ids); +#ifdef CONFIG_PM_SLEEP +static int rspi_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct rspi_data *rspi = platform_get_drvdata(pdev); + + return spi_master_suspend(rspi->master); +} + +static int rspi_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct rspi_data *rspi = platform_get_drvdata(pdev); + + return spi_master_resume(rspi->master); +} + +static SIMPLE_DEV_PM_OPS(rspi_pm_ops, rspi_suspend, rspi_resume); +#define DEV_PM_OPS &rspi_pm_ops +#else +#define DEV_PM_OPS NULL +#endif /* CONFIG_PM_SLEEP */ + static struct platform_driver rspi_driver = { .probe = rspi_probe, .remove = rspi_remove, .id_table = spi_driver_ids, .driver = { .name = "renesas_spi", + .pm = DEV_PM_OPS, .of_match_table = of_match_ptr(rspi_of_match), }, }; From 8dbbaa47b96f6ea5f09f922b4effff3c505cd8cf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Sep 2018 10:49:39 +0200 Subject: [PATCH 025/499] spi: rspi: Fix interrupted DMA transfers When interrupted, wait_event_interruptible_timeout() returns -ERESTARTSYS, and the SPI transfer in progress will fail, as expected: m25p80 spi0.0: SPI transfer failed: -512 spi_master spi0: failed to transfer one message from queue However, as the underlying DMA transfers may not have completed, all subsequent SPI transfers may start to fail: spi_master spi0: receive timeout qspi_transfer_out_in() returned -110 m25p80 spi0.0: SPI transfer failed: -110 spi_master spi0: failed to transfer one message from queue Fix this by calling dmaengine_terminate_all() not only for timeouts, but also for errors. This can be reproduced on r8a7991/koelsch, using "hd /dev/mtd0" followed by CTRL-C. 
Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/spi/spi-rspi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index f93a4587e3fb..b37de1d991d6 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -598,11 +598,13 @@ static int rspi_dma_transfer(struct rspi_data *rspi, struct sg_table *tx, ret = wait_event_interruptible_timeout(rspi->wait, rspi->dma_callbacked, HZ); - if (ret > 0 && rspi->dma_callbacked) + if (ret > 0 && rspi->dma_callbacked) { ret = 0; - else if (!ret) { - dev_err(&rspi->master->dev, "DMA timeout\n"); - ret = -ETIMEDOUT; + } else { + if (!ret) { + dev_err(&rspi->master->dev, "DMA timeout\n"); + ret = -ETIMEDOUT; + } if (tx) dmaengine_terminate_all(rspi->master->dma_tx); if (rx) From 88d0895d0ea9d4431507d576c963f2ff9918144d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 31 Aug 2018 15:08:44 +0200 Subject: [PATCH 026/499] batman-adv: Avoid probe ELP information leak The probe ELPs for WiFi interfaces are expanded to contain at least BATADV_ELP_MIN_PROBE_SIZE bytes. This is usually a lot more than the number of bytes which the template ELP packet requires. These extra padding bytes were not initialized and thus could contain data which were previously stored at the same location. It is therefore required to set it to some predefined or random values to avoid leaking private information from the system transmitting these kind of packets. Fixes: e4623c913508 ("batman-adv: Avoid probe ELP information leak") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 71c20c1d4002..e103c759b7ab 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -241,7 +241,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * the packet to be exactly of that size to make the link * throughput estimation effective. */ - skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len); + skb_put_zero(skb, probe_len - hard_iface->bat_v.elp_skb->len); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending unicast (probe) ELP packet on interface %s to %pM\n", From b9fd14c20871e6189f635e49b32d7789e430b3c8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 31 Aug 2018 16:46:47 +0200 Subject: [PATCH 027/499] batman-adv: Fix segfault when writing to throughput_override The per hardif sysfs file "batman_adv/throughput_override" prints the resulting change as info text when the users writes to this file. It uses the helper function batadv_info to add it at the same time to the kernel ring buffer and to the batman-adv debug log (when CONFIG_BATMAN_ADV_DEBUG is enabled). The function batadv_info requires as first parameter the batman-adv softif net_device. This parameter is then used to find the private buffer which contains the debug log for this batman-adv interface. But batadv_store_throughput_override used as first argument the slave net_device. This slave device doesn't have the batadv_priv private data which is access by batadv_info. Writing to this file with CONFIG_BATMAN_ADV_DEBUG enabled can either lead to a segfault or to memory corruption. 
Fixes: 0b5ecc6811bd ("batman-adv: add throughput override attribute to hard_ifaces") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index f2eef43bd2ec..3a76e8970c02 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1090,8 +1090,9 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj, if (old_tp_override == tp_override) goto out; - batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n", - "throughput_override", + batadv_info(hard_iface->soft_iface, + "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n", + "throughput_override", net_dev->name, old_tp_override / 10, old_tp_override % 10, tp_override / 10, tp_override % 10); From a25bab9d723a08bd0bdafb1529faf9094c690b70 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 31 Aug 2018 16:56:29 +0200 Subject: [PATCH 028/499] batman-adv: Fix segfault when writing to sysfs elp_interval The per hardif sysfs file "batman_adv/elp_interval" is using the generic functions to store/show uint values. The helper __batadv_store_uint_attr requires the softif net_device as parameter to print the resulting change as info text when the users writes to this file. It uses the helper function batadv_info to add it at the same time to the kernel ring buffer and to the batman-adv debug log (when CONFIG_BATMAN_ADV_DEBUG is enabled). The function batadv_info requires as first parameter the batman-adv softif net_device. This parameter is then used to find the private buffer which contains the debug log for this batman-adv interface. But batadv_store_throughput_override used as first argument the slave net_device. This slave device doesn't have the batadv_priv private data which is access by batadv_info. Writing to this file with CONFIG_BATMAN_ADV_DEBUG enabled can either lead to a segfault or to memory corruption. 
Fixes: 0744ff8fa8fa ("batman-adv: Add hard_iface specific sysfs wrapper macros for UINT") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 3a76e8970c02..09427fc6494a 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -188,7 +188,8 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &bat_priv->_var, net_dev); \ + &bat_priv->_var, net_dev, \ + NULL); \ } #define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ @@ -262,7 +263,9 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ length = __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &hard_iface->_var, net_dev); \ + &hard_iface->_var, \ + hard_iface->soft_iface, \ + net_dev); \ \ batadv_hardif_put(hard_iface); \ return length; \ @@ -356,10 +359,12 @@ __batadv_store_bool_attr(char *buff, size_t count, static int batadv_store_uint_attr(const char *buff, size_t count, struct net_device *net_dev, + struct net_device *slave_dev, const char *attr_name, unsigned int min, unsigned int max, atomic_t *attr) { + char ifname[IFNAMSIZ + 3] = ""; unsigned long uint_val; int ret; @@ -385,8 +390,11 @@ static int batadv_store_uint_attr(const char *buff, size_t count, if (atomic_read(attr) == uint_val) return count; - batadv_info(net_dev, "%s: Changing from: %i to: %lu\n", - attr_name, atomic_read(attr), uint_val); + if (slave_dev) + snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name); + + batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n", + attr_name, ifname, atomic_read(attr), uint_val); atomic_set(attr, uint_val); return count; @@ -397,12 +405,13 @@ static ssize_t __batadv_store_uint_attr(const char *buff, size_t count, void (*post_func)(struct net_device *), const struct attribute *attr, atomic_t *attr_store, - struct net_device *net_dev) + struct net_device *net_dev, + struct net_device *slave_dev) { int ret; - ret = batadv_store_uint_attr(buff, count, net_dev, attr->name, min, max, - attr_store); + ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev, + attr->name, min, max, attr_store); if (post_func && ret) post_func(net_dev); @@ -571,7 +580,7 @@ static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect, attr, &bat_priv->gw.sel_class, - bat_priv->soft_iface); + bat_priv->soft_iface, NULL); } static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, From dff9bc42ab0b2d38c5e90ddd79b238fed5b4c7ad Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:41 +0200 Subject: [PATCH 029/499] batman-adv: Prevent duplicated gateway_node entry The function batadv_gw_node_add is responsible for adding new gw_node to the gateway_list. It is expecting that the caller already checked that there is not already an entry with the same key or not. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. 
Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 8b198ee798c9..140c61a3f1ec 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -348,6 +349,9 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information + * + * Has to be called with the appropriate locks being acquired + * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -355,6 +359,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node; + lockdep_assert_held(&bat_priv->gw.list_lock); + if (gateway->bandwidth_down == 0) return; @@ -369,10 +375,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - spin_lock_bh(&bat_priv->gw.list_lock); kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); - spin_unlock_bh(&bat_priv->gw.list_lock); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -428,11 +432,14 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node, *curr_gw = NULL; + spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); + spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } + spin_unlock_bh(&bat_priv->gw.list_lock); if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) From fa122fec8640eb7186ce5a41b83a4c1744ceef8f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:42 +0200 Subject: [PATCH 030/499] batman-adv: Prevent duplicated nc_node entry The function batadv_nc_get_nc_node is responsible for adding new nc_nodes to the in_coding_list and out_coding_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. 
Fixes: d56b1705e28c ("batman-adv: network coding - detect coding nodes and remove these after timeout") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/network-coding.c | 41 ++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c3578444f3cb..34caf129a9bf 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -854,24 +854,6 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ struct list_head *list; - /* Check if nc_node is already added */ - nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); - - /* Node found */ - if (nc_node) - return nc_node; - - nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); - if (!nc_node) - return NULL; - - /* Initialize nc_node */ - INIT_LIST_HEAD(&nc_node->list); - kref_init(&nc_node->refcount); - ether_addr_copy(nc_node->addr, orig_node->orig); - kref_get(&orig_neigh_node->refcount); - nc_node->orig_node = orig_neigh_node; - /* Select ingoing or outgoing coding node */ if (in_coding) { lock = &orig_neigh_node->in_coding_list_lock; @@ -881,13 +863,34 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, list = &orig_neigh_node->out_coding_list; } + spin_lock_bh(lock); + + /* Check if nc_node is already added */ + nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); + + /* Node found */ + if (nc_node) + goto unlock; + + nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); + if (!nc_node) + goto unlock; + + /* Initialize nc_node */ + INIT_LIST_HEAD(&nc_node->list); + kref_init(&nc_node->refcount); + ether_addr_copy(nc_node->addr, orig_node->orig); + kref_get(&orig_neigh_node->refcount); + nc_node->orig_node = orig_neigh_node; + batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); /* Add nc_node to orig_node */ - spin_lock_bh(lock); kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); + +unlock: spin_unlock_bh(lock); return nc_node; From 94cb82f594ed86be303398d6dfc7640a6f1d45d4 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:43 +0200 Subject: [PATCH 031/499] batman-adv: Prevent duplicated softif_vlan entry The function batadv_softif_vlan_get is responsible for adding new softif_vlan to the softif_vlan_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. 
Fixes: 5d2c05b21337 ("batman-adv: add per VLAN interface attribute framework") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 1485263a348b..626ddca332db 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -574,15 +574,20 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) struct batadv_softif_vlan *vlan; int err; + spin_lock_bh(&bat_priv->softif_vlan_list_lock); + vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); - if (!vlan) + if (!vlan) { + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; + } vlan->bat_priv = bat_priv; vlan->vid = vid; @@ -590,17 +595,23 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) atomic_set(&vlan->ap_isolation, 0); - err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); - if (err) { - kfree(vlan); - return err; - } - - spin_lock_bh(&bat_priv->softif_vlan_list_lock); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the + * sleeping behavior of the sysfs functions and the fs_reclaim lock + */ + err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); + if (err) { + /* ref for the function */ + batadv_softif_vlan_put(vlan); + + /* ref for the list */ + batadv_softif_vlan_put(vlan); + return err; + } + /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ From e7136e48ffdfb9f37b0820f619380485eb407361 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:44 +0200 Subject: [PATCH 032/499] batman-adv: Prevent duplicated global TT entry The function batadv_tt_global_orig_entry_add is responsible for adding new tt_orig_list_entry to the orig_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. 
Fixes: d657e621a0f5 ("batman-adv: add reference counting for type batadv_tt_orig_list_entry") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 12a2b7d21376..d21624c44665 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1613,6 +1613,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, { struct batadv_tt_orig_list_entry *orig_entry; + spin_lock_bh(&tt_global->list_lock); + orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that @@ -1635,11 +1637,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, orig_entry->flags = flags; kref_init(&orig_entry->refcount); - spin_lock_bh(&tt_global->list_lock); kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); - spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); sync_flags: @@ -1647,6 +1647,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, out: if (orig_entry) batadv_tt_orig_list_entry_put(orig_entry); + + spin_unlock_bh(&tt_global->list_lock); } /** From ae3cdc97dc10c7a3b31f297dab429bfb774c9ccb Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:45 +0200 Subject: [PATCH 033/499] batman-adv: Prevent duplicated tvlv handler The function batadv_tvlv_handler_register is responsible for adding new tvlv_handler to the handler_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. 
Fixes: ef26157747d4 ("batman-adv: tvlv - basic infrastructure") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/tvlv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index a637458205d1..40e69c9346d2 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -529,15 +529,20 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, { struct batadv_tvlv_handler *tvlv_handler; + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); - if (!tvlv_handler) + if (!tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; + } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; @@ -547,7 +552,6 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); From 5af96b9c59c72fb2af2d19c5cc2f3cdcee391dff Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Fri, 7 Sep 2018 05:45:54 +0800 Subject: [PATCH 034/499] batman-adv: fix backbone_gw refcount on queue_work() failure The backbone_gw refcounter is to be decreased by the queued work and currently is never decreased if the queue_work() call fails. Fix by checking the queue_work() return value and decrease refcount if necessary. Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ff9659af6b91..5f1aeeded0e3 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1772,6 +1772,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; + bool ret; ethhdr = eth_hdr(skb); @@ -1795,8 +1796,13 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, if (unlikely(!backbone_gw)) return true; - queue_work(batadv_event_workqueue, &backbone_gw->report_work); - /* backbone_gw is unreferenced in the report work function function */ + ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); + + /* backbone_gw is unreferenced in the report work function function + * if queue_work() call was successful + */ + if (!ret) + batadv_backbone_gw_put(backbone_gw); return true; } From 4c4af6900844ab04c9434c972021d7b48610e06a Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Fri, 7 Sep 2018 05:45:55 +0800 Subject: [PATCH 035/499] batman-adv: fix hardif_neigh refcount on queue_work() failure The hardif_neigh refcounter is to be decreased by the queued work and currently is never decreased if the queue_work() call fails. Fix by checking the queue_work() return value and decrease refcount if necessary. 
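The detail underlying both queue_work() fixes is that queue_work() returns false when the work item is already pending, in which case the work function will run (and drop its reference) only once. A simplified sketch of the pattern, with placeholder names rather than the actual batman-adv objects:

	/* Sketch only: when queue_work() reports the work was already
	 * queued, the reference taken for this submission must be dropped
	 * by the caller, otherwise it leaks. */
	kref_get(&obj->refcount);
	if (!queue_work(wq, &obj->work))
		kref_put(&obj->refcount, example_obj_release);
	/* on success, the work function is responsible for the put */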
Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index e103c759b7ab..9f481cfdf77d 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -268,6 +268,7 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; + bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -329,8 +330,11 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) * may sleep and that is not allowed in an rcu protected * context. Therefore schedule a task for that. */ - queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); + ret = queue_work(batadv_event_workqueue, + &hardif_neigh->bat_v.metric_work); + + if (!ret) + batadv_hardif_neigh_put(hardif_neigh); } rcu_read_unlock(); From 119f94a6fefcc76d47075b83d2b73d04c895df78 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Wed, 5 Sep 2018 18:52:22 +0300 Subject: [PATCH 036/499] cfg80211: Address some corner cases in scan result channel updating cfg80211_get_bss_channel() is used to update the RX channel based on the available frame payload information (channel number from DSSS Parameter Set element or HT Operation element). This is needed on 2.4 GHz channels where frames may be received on neighboring channels due to overlapping frequency range. This might of some use on the 5 GHz band in some corner cases, but things are more complex there since there is no n:1 or 1:n mapping between channel numbers and frequencies due to multiple different starting frequencies in different operating classes. This could result in ieee80211_channel_to_frequency() returning incorrect frequency and ieee80211_get_channel() returning incorrect channel information (or indication of no match). In the previous implementation, this could result in some scan results being dropped completely, e.g., for the 4.9 GHz channels. That prevented connection to such BSSs. Fix this by using the driver-provided channel pointer if ieee80211_get_channel() does not find matching channel data for the channel number in the frame payload and if the scan is done with 5 MHz or 10 MHz channel bandwidth. While doing this, also add comments describing what the function is trying to achieve to make it easier to understand what happens here and why. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/scan.c | 58 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index d36c3eb7b931..d0e7472dd9fd 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1058,13 +1058,23 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, return NULL; } +/* + * Update RX channel information based on the available frame payload + * information. This is mainly for the 2.4 GHz band where frames can be received + * from neighboring channels and the Beacon frames use the DSSS Parameter Set + * element to indicate the current (transmitting) channel, but this might also + * be needed on other bands if RX frequency does not match with the actual + * operating channel of a BSS. 
+ */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, - struct ieee80211_channel *channel) + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width) { const u8 *tmp; u32 freq; int channel_number = -1; + struct ieee80211_channel *alt_channel; tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp[1] == 1) { @@ -1078,16 +1088,45 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, } } - if (channel_number < 0) + if (channel_number < 0) { + /* No channel information in frame payload */ return channel; + } freq = ieee80211_channel_to_frequency(channel_number, channel->band); - channel = ieee80211_get_channel(wiphy, freq); - if (!channel) + alt_channel = ieee80211_get_channel(wiphy, freq); + if (!alt_channel) { + if (channel->band == NL80211_BAND_2GHZ) { + /* + * Better not allow unexpected channels when that could + * be going beyond the 1-11 range (e.g., discovering + * BSS on channel 12 when radio is configured for + * channel 11. + */ + return NULL; + } + + /* No match for the payload channel number - ignore it */ + return channel; + } + + if (scan_width == NL80211_BSS_CHAN_WIDTH_10 || + scan_width == NL80211_BSS_CHAN_WIDTH_5) { + /* + * Ignore channel number in 5 and 10 MHz channels where there + * may not be an n:1 or 1:n mapping between frequencies and + * channel numbers. + */ + return channel; + } + + /* + * Use the channel determined through the payload channel number + * instead of the RX channel reported by the driver. + */ + if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; - if (channel->flags & IEEE80211_CHAN_DISABLED) - return NULL; - return channel; + return alt_channel; } /* Returned bss is reference counted and must be cleaned up appropriately. */ @@ -1112,7 +1151,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, (data->signal < 0 || data->signal > 100))) return NULL; - channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan); + channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan, + data->scan_width); if (!channel) return NULL; @@ -1210,7 +1250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, return NULL; channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable, - ielen, data->chan); + ielen, data->chan, data->scan_width); if (!channel) return NULL; From cb59bc14e830028d2244861216df038165d7625d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Sep 2018 13:34:02 +0200 Subject: [PATCH 037/499] mac80211: TDLS: fix skb queue/priority assignment If the TDLS setup happens over a connection to an AP that doesn't have QoS, we nevertheless assign a non-zero TID (skb->priority) and queue mapping, which may confuse us or drivers later. Fix it by just assigning the special skb->priority and then using ieee80211_select_queue() just like other data frames would go through. 
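For context, the "special" priority values are the 256..263 range that cfg80211's 802.1d classifier interprets as a fixed TID (priority minus 256) instead of deriving the TID from DSCP. A rough sketch of the idea, not a verbatim copy of the kernel helpers:

	/* Sketch only: 256 + TID asks the classifier for that exact TID;
	 * ieee80211_select_queue() then maps the TID to a queue, falling
	 * back to a single queue/TID 0 when the connection has no QoS. */
	skb->priority = 256 + 2;	/* request TID 2 for setup frames */
	skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb));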
Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 5cd5e6e5834e..6c647f425e05 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -16,6 +16,7 @@ #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" +#include "wme.h" /* give usermode some time for retries in setting up the TDLS session */ #define TDLS_PEER_SETUP_TIMEOUT (15 * HZ) @@ -1010,14 +1011,13 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, switch (action_code) { case WLAN_TDLS_SETUP_REQUEST: case WLAN_TDLS_SETUP_RESPONSE: - skb_set_queue_mapping(skb, IEEE80211_AC_BK); - skb->priority = 2; + skb->priority = 256 + 2; break; default: - skb_set_queue_mapping(skb, IEEE80211_AC_VI); - skb->priority = 5; + skb->priority = 256 + 5; break; } + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb)); /* * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress. From c42055105785580563535e6d3143cad95c7ac7ee Mon Sep 17 00:00:00 2001 From: Yuan-Chi Pang Date: Thu, 6 Sep 2018 16:57:48 +0800 Subject: [PATCH 038/499] mac80211: fix TX status reporting for ieee80211s TX status reporting to ieee80211s is through ieee80211s_update_metric. There are two problems about ieee80211s_update_metric: 1. The purpose is to estimate the fail probability to a specific link. No need to restrict to data frame. 2. Current implementation does not work if wireless driver does not pass tx_status with skb. Fix this by removing ieee80211_is_data condition, passing ieee80211_tx_status directly to ieee80211s_update_metric, and putting it in both __ieee80211_tx_status and ieee80211_tx_status_ext. Signed-off-by: Yuan-Chi Pang Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 3 ++- net/mac80211/mesh_hwmp.c | 9 +++------ net/mac80211/status.c | 4 +++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index ee56f18cad3f..21526630bf65 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -217,7 +217,8 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb); + struct sta_info *sta, + struct ieee80211_tx_status *st); void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index daf9db3c8f24..6950cd0bf594 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -295,15 +295,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, } void ieee80211s_update_metric(struct ieee80211_local *local, - struct sta_info *sta, struct sk_buff *skb) + struct sta_info *sta, + struct ieee80211_tx_status *st) { - struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *txinfo = st->info; int failed; - if (!ieee80211_is_data(hdr->frame_control)) - return; - failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); /* moving average, scaled to 100. 
diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 001a869c059c..91d7c0cd1882 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -808,7 +808,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) - ieee80211s_update_metric(local, sta, skb); + ieee80211s_update_metric(local, sta, status); if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); @@ -969,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw, } rate_control_tx_status(local, sband, status); + if (ieee80211_vif_is_mesh(&sta->sdata->vif)) + ieee80211s_update_metric(local, sta, status); } if (acked || noack_success) { From 9e1437937807b0122e8da1ca8765be2adca9aee6 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 11 Sep 2018 10:31:15 +0200 Subject: [PATCH 039/499] xfrm: Fix NULL pointer dereference when skb_dst_force clears the dst_entry. Since commit 222d7dbd258d ("net: prevent dst uses after free") skb_dst_force() might clear the dst_entry attached to the skb. The xfrm code don't expect this to happen, so we crash with a NULL pointer dereference in this case. Fix it by checking skb_dst(skb) for NULL after skb_dst_force() and drop the packet in cast the dst_entry was cleared. Fixes: 222d7dbd258d ("net: prevent dst uses after free") Reported-by: Tobias Hommel Reported-by: Kristian Evensen Reported-by: Wolfgang Walter Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 4 ++++ net/xfrm/xfrm_policy.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 89b178a78dc7..36d15a38ce5e 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -101,6 +101,10 @@ static int xfrm_output_one(struct sk_buff *skb, int err) spin_unlock_bh(&x->lock); skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + goto error_nolock; + } if (xfrm_offload(skb)) { x->type_offload->encap(x, skb); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7c5e8978aeaa..626e0f4d1749 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2548,6 +2548,10 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); + if (!skb_dst(skb)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); + return 0; + } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { From 9f34519a82356f6cf0ccb8480ee0ed99b3d0af75 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 31 Aug 2018 11:52:00 -0700 Subject: [PATCH 040/499] cxgb4: fix abort_req_rss6 struct Remove the incorrect WR_HDR field which can cause a misinterpretation of ABORT CPL by ULDs, such as iw_cxgb4. 
Fixes: a3cdaa69e4ae ("cxgb4: Adds CPL support for Shared Receive Queues") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h index b8f75a22fb6c..f152da1ce046 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h @@ -753,7 +753,6 @@ struct cpl_abort_req_rss { }; struct cpl_abort_req_rss6 { - WR_HDR; union opcode_tid ot; __be32 srqidx_status; }; From b90ca5cc32f59bb214847c6855959702f00c6801 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 11 Sep 2018 21:27:44 -0700 Subject: [PATCH 041/499] filesystem-dax: Fix use of zero page Use my_zero_pfn instead of ZERO_PAGE(), and pass the vaddr to it instead of zero so it works on MIPS and s390 who reference the vaddr to select a zero page. Cc: Fixes: 91d25ba8a6b0 ("dax: use common 4k zero page for dax mmap reads") Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams --- fs/dax.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index f32d7125ad0f..b68ce484e1be 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1120,21 +1120,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - vm_fault_t ret = VM_FAULT_NOPAGE; - struct page *zero_page; - pfn_t pfn; + pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); + vm_fault_t ret; - zero_page = ZERO_PAGE(0); - if (unlikely(!zero_page)) { - ret = VM_FAULT_OOM; - goto out; - } - - pfn = page_to_pfn_t(zero_page); dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, false); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); -out: trace_dax_load_hole(inode, vmf, ret); return ret; } From 379e36d3f591cb62e781907edafea17d75e8c1fa Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Fri, 7 Sep 2018 10:18:51 +0200 Subject: [PATCH 042/499] ARM: dts: at91: sama5d2_ptc_ek: fix nand pinctrl The drive strength has to be set to medium otherwise some data corruption may happen. Signed-off-by: Ludovic Desroches Acked-by: Nicolas Ferre Signed-off-by: Alexandre Belloni --- arch/arm/boot/dts/at91-sama5d2_ptc_ek.dts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/at91-sama5d2_ptc_ek.dts b/arch/arm/boot/dts/at91-sama5d2_ptc_ek.dts index b10dccd0958f..3b1baa8605a7 100644 --- a/arch/arm/boot/dts/at91-sama5d2_ptc_ek.dts +++ b/arch/arm/boot/dts/at91-sama5d2_ptc_ek.dts @@ -11,6 +11,7 @@ #include "sama5d2-pinfunc.h" #include #include +#include / { model = "Atmel SAMA5D2 PTC EK"; @@ -299,6 +300,7 @@ re_we_data { , ; bias-pull-up; + atmel,drive-strength = ; }; ale_cle_rdy_cs { From 9d2ac5f4bec00a0697ab77c999ef38753b3cb900 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Fri, 3 Aug 2018 12:40:58 +0200 Subject: [PATCH 043/499] media: i2c: mt9v111: Fix v4l2-ctrl error handling Fix error handling of v4l2_ctrl creation by inspecting the ctrl.error flag instead of testing for each returned value correctness. As reported by Dan Carpenter returning PTR_ERR() on the v4l2_ctrl_new_std() return value is also wrong, as that function return NULL on error. While at there re-order the cleanup path to respect the operation inverse order. 
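The control-framework convention applied here is that v4l2_ctrl_new_std() and friends return NULL on failure and latch the first error into the handler, so a single check of the handler's error field after all controls are created is sufficient. A minimal sketch of that pattern with placeholder limits, not the mt9v111 code itself:

	/* Sketch only: create all controls, then test hdl.error once. */
	v4l2_ctrl_handler_init(&hdl, 4);
	v4l2_ctrl_new_std(&hdl, &ops, V4L2_CID_HBLANK, hb_min, hb_max, 1, hb_def);
	v4l2_ctrl_new_std(&hdl, &ops, V4L2_CID_VBLANK, vb_min, vb_max, 1, vb_def);
	if (hdl.error) {
		ret = hdl.error;
		v4l2_ctrl_handler_free(&hdl);
		return ret;
	}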
Fixes: aab7ed1c "media: i2c: Add driver for Aptina MT9V111" Reported-by: Dan Carpenter Signed-off-by: Jacopo Mondi Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/mt9v111.c | 41 ++++++++++++------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/media/i2c/mt9v111.c b/drivers/media/i2c/mt9v111.c index b5410aeb5fe2..bb41bea950ac 100644 --- a/drivers/media/i2c/mt9v111.c +++ b/drivers/media/i2c/mt9v111.c @@ -1159,41 +1159,21 @@ static int mt9v111_probe(struct i2c_client *client) V4L2_CID_AUTO_WHITE_BALANCE, 0, 1, 1, V4L2_WHITE_BALANCE_AUTO); - if (IS_ERR_OR_NULL(mt9v111->auto_awb)) { - ret = PTR_ERR(mt9v111->auto_awb); - goto error_free_ctrls; - } - mt9v111->auto_exp = v4l2_ctrl_new_std_menu(&mt9v111->ctrls, &mt9v111_ctrl_ops, V4L2_CID_EXPOSURE_AUTO, V4L2_EXPOSURE_MANUAL, 0, V4L2_EXPOSURE_AUTO); - if (IS_ERR_OR_NULL(mt9v111->auto_exp)) { - ret = PTR_ERR(mt9v111->auto_exp); - goto error_free_ctrls; - } - - /* Initialize timings */ mt9v111->hblank = v4l2_ctrl_new_std(&mt9v111->ctrls, &mt9v111_ctrl_ops, V4L2_CID_HBLANK, MT9V111_CORE_R05_MIN_HBLANK, MT9V111_CORE_R05_MAX_HBLANK, 1, MT9V111_CORE_R05_DEF_HBLANK); - if (IS_ERR_OR_NULL(mt9v111->hblank)) { - ret = PTR_ERR(mt9v111->hblank); - goto error_free_ctrls; - } - mt9v111->vblank = v4l2_ctrl_new_std(&mt9v111->ctrls, &mt9v111_ctrl_ops, V4L2_CID_VBLANK, MT9V111_CORE_R06_MIN_VBLANK, MT9V111_CORE_R06_MAX_VBLANK, 1, MT9V111_CORE_R06_DEF_VBLANK); - if (IS_ERR_OR_NULL(mt9v111->vblank)) { - ret = PTR_ERR(mt9v111->vblank); - goto error_free_ctrls; - } /* PIXEL_RATE is fixed: just expose it to user space. */ v4l2_ctrl_new_std(&mt9v111->ctrls, &mt9v111_ctrl_ops, @@ -1201,6 +1181,10 @@ static int mt9v111_probe(struct i2c_client *client) DIV_ROUND_CLOSEST(mt9v111->sysclk, 2), 1, DIV_ROUND_CLOSEST(mt9v111->sysclk, 2)); + if (mt9v111->ctrls.error) { + ret = mt9v111->ctrls.error; + goto error_free_ctrls; + } mt9v111->sd.ctrl_handler = &mt9v111->ctrls; /* Start with default configuration: 640x480 UYVY. */ @@ -1226,26 +1210,27 @@ static int mt9v111_probe(struct i2c_client *client) mt9v111->pad.flags = MEDIA_PAD_FL_SOURCE; ret = media_entity_pads_init(&mt9v111->sd.entity, 1, &mt9v111->pad); if (ret) - goto error_free_ctrls; + goto error_free_entity; #endif ret = mt9v111_chip_probe(mt9v111); if (ret) - goto error_free_ctrls; + goto error_free_entity; ret = v4l2_async_register_subdev(&mt9v111->sd); if (ret) - goto error_free_ctrls; + goto error_free_entity; return 0; -error_free_ctrls: - v4l2_ctrl_handler_free(&mt9v111->ctrls); - +error_free_entity: #if IS_ENABLED(CONFIG_MEDIA_CONTROLLER) media_entity_cleanup(&mt9v111->sd.entity); #endif +error_free_ctrls: + v4l2_ctrl_handler_free(&mt9v111->ctrls); + mutex_destroy(&mt9v111->pwr_mutex); mutex_destroy(&mt9v111->stream_mutex); @@ -1259,12 +1244,12 @@ static int mt9v111_remove(struct i2c_client *client) v4l2_async_unregister_subdev(sd); - v4l2_ctrl_handler_free(&mt9v111->ctrls); - #if IS_ENABLED(CONFIG_MEDIA_CONTROLLER) media_entity_cleanup(&sd->entity); #endif + v4l2_ctrl_handler_free(&mt9v111->ctrls); + mutex_destroy(&mt9v111->pwr_mutex); mutex_destroy(&mt9v111->stream_mutex); From dab99a0f77e9764a00b0203d0155ca134247f461 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 30 Aug 2018 06:18:22 -0400 Subject: [PATCH 044/499] media: staging/media/mt9t031/Kconfig: remove bogus entry The 'config SOC_CAMERA_IMX074' is a copy-and-paste error and should be removed. This Kconfig is for mt9t031 only. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/mt9t031/Kconfig | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/staging/media/mt9t031/Kconfig b/drivers/staging/media/mt9t031/Kconfig index f48e06a03cdb..9a58aaf72edd 100644 --- a/drivers/staging/media/mt9t031/Kconfig +++ b/drivers/staging/media/mt9t031/Kconfig @@ -1,9 +1,3 @@ -config SOC_CAMERA_IMX074 - tristate "imx074 support (DEPRECATED)" - depends on SOC_CAMERA && I2C - help - This driver supports IMX074 cameras from Sony - config SOC_CAMERA_MT9T031 tristate "mt9t031 support (DEPRECATED)" depends on SOC_CAMERA && I2C From 67e3816842fe6414d629c7515b955952ec40c7d7 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 31 Aug 2018 07:16:03 -0700 Subject: [PATCH 045/499] RDMA/uverbs: Atomically flush and mark closed the comp event queue Currently a uverbs completion event queue is flushed of events in ib_uverbs_comp_event_close() with the queue spinlock held and then released. Yet setting ev_queue->is_closed is not set until later in uverbs_hot_unplug_completion_event_file(). In between the time ib_uverbs_comp_event_close() releases the lock and uverbs_hot_unplug_completion_event_file() acquires the lock, a completion event can arrive and be inserted into the event queue by ib_uverbs_comp_handler(). This can cause a "double add" list_add warning or crash depending on the kernel configuration, or a memory leak because the event is never dequeued since the queue is already closed down. So add setting ev_queue->is_closed = 1 to ib_uverbs_comp_event_close(). Cc: stable@vger.kernel.org Fixes: 1e7710f3f656 ("IB/core: Change completion channel to use the reworked objects schema") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6d974e2363df..50152c1b1004 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -440,6 +440,7 @@ static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp) list_del(&entry->obj_list); kfree(entry); } + file->ev_queue.is_closed = 1; spin_unlock_irq(&file->ev_queue.lock); uverbs_close_fd(filp); From dfb06cba8c73c0704710b2e3fbe2c35ac66a01b4 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 5 Sep 2018 13:31:40 -0700 Subject: [PATCH 046/499] uaccess: Fix is_source param for check_copy_size() in copy_to_iter_mcsafe() copy_to_iter_mcsafe() is passing in the is_source parameter as "false" to check_copy_size(). This is different than what copy_to_iter() does. Also, the addr parameter passed to check_copy_size() is the source so therefore we should be passing in "true" instead. 
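The argument convention, spelled out for clarity (an illustrative summary, not a quote of the header):

	/* check_copy_size(addr, bytes, is_source):
	 *   is_source == true  -> addr is read from  (copy_to_iter direction)
	 *   is_source == false -> addr is written to (copy_from_iter direction)
	 * copy_to_iter_mcsafe() reads from addr, so it must pass true. */
	if (unlikely(!check_copy_size(addr, bytes, true)))
		return 0;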
Fixes: 8780356ef630 ("x86/asm/memcpy_mcsafe: Define copy_to_iter_mcsafe()") Cc: Reported-by: Fan Du Signed-off-by: Dave Jiang Reviewed-by: Vishal Verma Reported-by: Wenwei Tao Signed-off-by: Dan Williams --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 409c845d4cd3..422b1c01ee0d 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -172,7 +172,7 @@ size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) static __always_inline __must_check size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i) { - if (unlikely(!check_copy_size(addr, bytes, false))) + if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else return _copy_to_iter_mcsafe(addr, bytes, i); From 3b7d96a0dbb6b630878597a1838fc39f808b761b Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 8 Aug 2018 18:44:59 +0530 Subject: [PATCH 047/499] clocksource/drivers/ti-32k: Add CLOCK_SOURCE_SUSPEND_NONSTOP flag for non-am43 SoCs The 32k clocksource is NONSTOP for non-am43 SoCs. Hence add the flag for all the other SoCs. Reported-by: Tony Lindgren Signed-off-by: Keerthy Acked-by: Tony Lindgren Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-32k.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/clocksource/timer-ti-32k.c b/drivers/clocksource/timer-ti-32k.c index 29e2e1a78a43..6949a9113dbb 100644 --- a/drivers/clocksource/timer-ti-32k.c +++ b/drivers/clocksource/timer-ti-32k.c @@ -97,6 +97,9 @@ static int __init ti_32k_timer_init(struct device_node *np) return -ENXIO; } + if (!of_machine_is_compatible("ti,am43")) + ti_32k_timer.cs.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; + ti_32k_timer.counter = ti_32k_timer.base; /* From 5fe23f262e0548ca7f19fb79f89059a60d087d22 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 12 Sep 2018 16:27:44 -0700 Subject: [PATCH 048/499] ucma: fix a use-after-free in ucma_resolve_ip() There is a race condition between ucma_close() and ucma_resolve_ip(): CPU0 CPU1 ucma_resolve_ip(): ucma_close(): ctx = ucma_get_ctx(file, cmd.id); list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); ... mutex_lock(&mut); if (!ctx->closing) { mutex_unlock(&mut); rdma_destroy_id(ctx->cm_id); ... ucma_free_ctx(ctx); ret = rdma_resolve_addr(); ucma_put_ctx(ctx); Before idr_remove(), ucma_get_ctx() could still find the ctx and after rdma_destroy_id(), rdma_resolve_addr() may still access id_priv pointer. Also, ucma_put_ctx() may use ctx after ucma_free_ctx() too. ucma_close() should call ucma_put_ctx() too which tests the refcnt and waits for the last one releasing it. The similar pattern is already used by ucma_destroy_id(). 
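The lifetime rule being restored is the usual reference-plus-completion teardown. A simplified sketch with generic names (the real ucma code differs in detail):

	/* Sketch only: lookups take a reference; teardown removes the idr
	 * entry so no new lookups succeed, drops its own reference, and
	 * waits until every in-flight user has dropped theirs before the
	 * id is destroyed. */
	static void ctx_put(struct example_ctx *ctx)
	{
		if (atomic_dec_and_test(&ctx->ref))
			complete(&ctx->comp);
	}

	static void ctx_teardown(struct example_ctx *ctx)
	{
		mutex_lock(&mut);
		idr_remove(&ctx_idr, ctx->id);
		mutex_unlock(&mut);

		ctx_put(ctx);				/* reference owned here */
		wait_for_completion(&ctx->comp);	/* wait for in-flight users */
		rdma_destroy_id(ctx->cm_id);
		kfree(ctx);
	}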
Reported-and-tested-by: syzbot+da2591e115d57a9cbb8b@syzkaller.appspotmail.com Reported-by: syzbot+cfe3c1e8ef634ba8964b@syzkaller.appspotmail.com Cc: Jason Gunthorpe Cc: Doug Ledford Cc: Leon Romanovsky Signed-off-by: Cong Wang Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 5f437d1570fb..21863ddde63e 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1759,6 +1759,8 @@ static int ucma_close(struct inode *inode, struct file *filp) mutex_lock(&mut); if (!ctx->closing) { mutex_unlock(&mut); + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); /* rdma_destroy_id ensures that no event handlers are * inflight for that id before releasing it. */ From 807588ac92018bde88a1958f546438e840eb0158 Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Wed, 12 Sep 2018 15:07:05 +0200 Subject: [PATCH 049/499] HID: i2c-hid: disable runtime PM operations on hantick touchpad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This hantick HTIX5288 touchpad can quickly fall in a wrong state if there are too many open/close operations. This will either make it stop reporting any input, or will shift all the input reads by a few bytes, making it impossible to decode. Here, we never release the probed touchpad runtime pm while the driver is loaded, which should disable all runtime pm suspend/resumes. This fast repetition of sleep/wakeup is also more likely to happen when using runtime PM, which is why the quirk is done there, and not for all power downs, which would include suspend or module removal. Signed-off-by: Anisse Astier Cc: stable@vger.kernel.org Acked-by: Benjamin Tissoires Reviewed-by: Hans de Goede Tested-by: Philip Müller Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index f3076659361a..a8610f5bf6f5 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -48,6 +48,7 @@ #define I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV BIT(0) #define I2C_HID_QUIRK_NO_IRQ_AFTER_RESET BIT(1) #define I2C_HID_QUIRK_RESEND_REPORT_DESCR BIT(2) +#define I2C_HID_QUIRK_NO_RUNTIME_PM BIT(3) /* flags */ #define I2C_HID_STARTED 0 @@ -169,7 +170,8 @@ static const struct i2c_hid_quirks { { USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8755, I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV }, { I2C_VENDOR_ID_HANTICK, I2C_PRODUCT_ID_HANTICK_5288, - I2C_HID_QUIRK_NO_IRQ_AFTER_RESET }, + I2C_HID_QUIRK_NO_IRQ_AFTER_RESET | + I2C_HID_QUIRK_NO_RUNTIME_PM }, { USB_VENDOR_ID_SIS_TOUCH, USB_DEVICE_ID_SIS10FB_TOUCH, I2C_HID_QUIRK_RESEND_REPORT_DESCR }, { 0, 0 } @@ -1105,7 +1107,9 @@ static int i2c_hid_probe(struct i2c_client *client, goto err_mem_free; } - pm_runtime_put(&client->dev); + if (!(ihid->quirks & I2C_HID_QUIRK_NO_RUNTIME_PM)) + pm_runtime_put(&client->dev); + return 0; err_mem_free: @@ -1130,7 +1134,8 @@ static int i2c_hid_remove(struct i2c_client *client) struct i2c_hid *ihid = i2c_get_clientdata(client); struct hid_device *hid; - pm_runtime_get_sync(&client->dev); + if (!(ihid->quirks & I2C_HID_QUIRK_NO_RUNTIME_PM)) + pm_runtime_get_sync(&client->dev); pm_runtime_disable(&client->dev); pm_runtime_set_suspended(&client->dev); pm_runtime_put_noidle(&client->dev); From 36cae568404a298a19a6e8a3f18641075d4cab04 Mon Sep 17 00:00:00 2001 From: Kristian 
Evensen Date: Thu, 13 Sep 2018 11:21:49 +0200 Subject: [PATCH 050/499] USB: serial: option: improve Quectel EP06 detection The Quectel EP06 (and EM06/EG06) LTE modem supports updating the USB configuration, without the VID/PID or configuration number changing. When the configuration is updated and interfaces are added/removed, the interface numbers are updated. This causes our current code for matching EP06 not to work as intended, as the assumption about reserved interfaces no longer holds. If for example the diagnostic (first) interface is removed, option will (try to) bind to the QMI interface. This patch improves EP06 detection by replacing the current match with two matches, and those matches check class, subclass and protocol as well as VID and PID. The diag interface exports class, subclass and protocol as 0xff. For the other serial interfaces, class is 0xff and subclass and protocol are both 0x0. The modem can export the following devices and always in this order: diag, nmea, at, ppp. qmi and adb. This means that diag can only ever be interface 0, and interface numbers 1-5 should be marked as reserved. The three other serial devices can have interface numbers 0-3, but I have not marked any interfaces as reserved. The reason is that the serial devices are the only interfaces exported by the device where subclass and protocol is 0x0. QMI exports the same class, subclass and protocol values as the diag interface. However, the two interfaces have different number of endpoints, QMI has three and diag two. I have added a check for number of interfaces if VID/PID matches the EP06, and we ignore the device if number of interfaces equals three (and subclass is set). Signed-off-by: Kristian Evensen Acked-by: Dan Williams [ johan: drop uneeded RSVD(5) for ADB ] Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 0215b70c4efc..382feafbd127 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1081,8 +1081,9 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(4) }, { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_BG96), .driver_info = RSVD(4) }, - { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06), - .driver_info = RSVD(4) | RSVD(5) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0xff, 0xff), + .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6003), @@ -1985,6 +1986,7 @@ static int option_probe(struct usb_serial *serial, { struct usb_interface_descriptor *iface_desc = &serial->interface->cur_altsetting->desc; + struct usb_device_descriptor *dev_desc = &serial->dev->descriptor; unsigned long device_flags = id->driver_info; /* Never bind to the CD-Rom emulation interface */ @@ -1999,6 +2001,18 @@ static int option_probe(struct usb_serial *serial, if (device_flags & RSVD(iface_desc->bInterfaceNumber)) return -ENODEV; + /* + * Don't bind to the QMI device of the Quectel EP06/EG06/EM06. Class, + * subclass and protocol is 0xff for both the diagnostic port and the + * QMI interface, but the diagnostic port only has two endpoints (QMI + * has three). 
+ */ + if (dev_desc->idVendor == cpu_to_le16(QUECTEL_VENDOR_ID) && + dev_desc->idProduct == cpu_to_le16(QUECTEL_PRODUCT_EP06) && + iface_desc->bInterfaceSubClass && iface_desc->bNumEndpoints == 3) { + return -ENODEV; + } + /* Store the device flags so we can use them during attach. */ usb_set_serial_data(serial, (void *)device_flags); From 35aecc02b5b621782111f64cbb032c7f6a90bb32 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 13 Sep 2018 11:21:50 +0200 Subject: [PATCH 051/499] USB: serial: option: add two-endpoints device-id flag Allow matching on interfaces having two endpoints by adding a new device-id flag. This allows for the handling of devices whose interface numbers can change (e.g. Quectel EP06) to be contained in the device-id table. Tested-by: Kristian Evensen Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 382feafbd127..e72ad9f81c73 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -561,6 +561,9 @@ static void option_instat_callback(struct urb *urb); /* Interface is reserved */ #define RSVD(ifnum) ((BIT(ifnum) & 0xff) << 0) +/* Interface must have two endpoints */ +#define NUMEP2 BIT(16) + static const struct usb_device_id option_ids[] = { { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) }, @@ -1082,7 +1085,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_BG96), .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0xff, 0xff), - .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) }, + .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, @@ -1986,7 +1989,6 @@ static int option_probe(struct usb_serial *serial, { struct usb_interface_descriptor *iface_desc = &serial->interface->cur_altsetting->desc; - struct usb_device_descriptor *dev_desc = &serial->dev->descriptor; unsigned long device_flags = id->driver_info; /* Never bind to the CD-Rom emulation interface */ @@ -2002,16 +2004,11 @@ static int option_probe(struct usb_serial *serial, return -ENODEV; /* - * Don't bind to the QMI device of the Quectel EP06/EG06/EM06. Class, - * subclass and protocol is 0xff for both the diagnostic port and the - * QMI interface, but the diagnostic port only has two endpoints (QMI - * has three). + * Allow matching on bNumEndpoints for devices whose interface numbers + * can change (e.g. Quectel EP06). */ - if (dev_desc->idVendor == cpu_to_le16(QUECTEL_VENDOR_ID) && - dev_desc->idProduct == cpu_to_le16(QUECTEL_PRODUCT_EP06) && - iface_desc->bInterfaceSubClass && iface_desc->bNumEndpoints == 3) { + if (device_flags & NUMEP2 && iface_desc->bNumEndpoints != 2) return -ENODEV; - } /* Store the device flags so we can use them during attach. 
*/ usb_set_serial_data(serial, (void *)device_flags); From dabeb13eee81329338b1f8f330dfcc37a86714d7 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 14 Sep 2018 10:47:14 +0200 Subject: [PATCH 052/499] batman-adv: Increase version number to 2018.3 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8da3c9336111..3ccc75ee719c 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.2" +#define BATADV_SOURCE_VERSION "2018.3" #endif /* B.A.T.M.A.N. parameters */ From 1a866306e0fbf3caab168c1389e4497702749441 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 12 Sep 2018 16:34:27 +0200 Subject: [PATCH 053/499] drm/etnaviv: add DMA configuration for etnaviv platform device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The etnaviv device is a virtual device backing the DRM device, which may drive multiple hardware GPU core devices. As most of the dma-mapping handling is done through the virtual device, we need to make sure that a proper DMA setup is in place. The easiest way to get a reasonable configuration is to let the virtual device share the DMA configuration with one of the GPU devices, so call of_dma_configure() with the right parameters manually. This assumes that all etnaviv driven GPU devices in the system share the same DMA configuration. If we ever encounter a SoC where the GPUs are on busses with different offsets or behind different IOMMUs that will require much deeper changes to the driver, as we would need to implement etnaviv specific versions of most of the DRM helper functions. For now we should be fine with this solution. Signed-off-by: Lucas Stach Tested-by: Guido Günther Tested-by: Eugeniy Paltsev --- drivers/gpu/drm/etnaviv/etnaviv_drv.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c index 9b2720b41571..83c1f46670bf 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c @@ -592,8 +592,6 @@ static int etnaviv_pdev_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct component_match *match = NULL; - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); - if (!dev->platform_data) { struct device_node *core_node; @@ -655,13 +653,30 @@ static int __init etnaviv_init(void) for_each_compatible_node(np, NULL, "vivante,gc") { if (!of_device_is_available(np)) continue; - pdev = platform_device_register_simple("etnaviv", -1, - NULL, 0); - if (IS_ERR(pdev)) { - ret = PTR_ERR(pdev); + + pdev = platform_device_alloc("etnaviv", -1); + if (!pdev) { + ret = -ENOMEM; of_node_put(np); goto unregister_platform_driver; } + pdev->dev.coherent_dma_mask = DMA_BIT_MASK(40); + pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask; + + /* + * Apply the same DMA configuration to the virtual etnaviv + * device as the GPU we found. This assumes that all Vivante + * GPUs in the system share the same DMA constraints. 
+ */ + of_dma_configure(&pdev->dev, np, true); + + ret = platform_device_add(pdev); + if (ret) { + platform_device_put(pdev); + of_node_put(np); + goto unregister_platform_driver; + } + etnaviv_drm = pdev; of_node_put(np); break; From f14040bca89258b8a1c71e2112e430462172ce93 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 13 Sep 2018 15:33:47 +1000 Subject: [PATCH 054/499] KVM: PPC: Book3S HV: Fix guest r11 corruption with POWER9 TM workarounds When we come into the softpatch handler (0x1500), we use r11 to store the HSRR0 for later use by the denorm handler. We also use the softpatch handler for the TM workarounds for POWER9. Unfortunately, in kvmppc_interrupt_hv we later store r11 out to the vcpu assuming it's still what we got from userspace. This causes r11 to be corrupted in the VCPU and hence when we restore the guest, we get a corrupted r11. We've seen this when running TM tests inside guests on P9. This fixes the problem by only touching r11 in the denorm case. Fixes: 4bb3c7a020 ("KVM: PPC: Book3S HV: Work around transactional memory bugs in POWER9") Cc: # 4.17+ Test-by: Suraj Jitindar Singh Reviewed-by: Paul Mackerras Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ea04dfb8c092..2d8fc8c9da7a 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1314,9 +1314,7 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION mfspr r10,SPRN_HSRR1 - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ - addi r11,r11,-4 /* HSRR0 is next instruction */ bne+ denorm_assist #endif @@ -1382,6 +1380,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) */ XVCPSGNDP32(32) denorm_done: + mfspr r11,SPRN_HSRR0 + subi r11,r11,4 mtspr SPRN_HSRR0,r11 mtcrf 0x80,r9 ld r9,PACA_EXGEN+EX_R9(r13) From d792d4c4fc866ae224b0b0ca2aabd87d23b4d6cc Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 11 Sep 2018 12:22:25 -0700 Subject: [PATCH 055/499] scsi: ibmvscsis: Fix a stringop-overflow warning There's currently a warning about string overflow with strncat: drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c: In function 'ibmvscsis_probe': drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c:3479:2: error: 'strncat' specified bound 64 equals destination size [-Werror=stringop-overflow=] strncat(vscsi->eye, vdev->name, MAX_EYE); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Switch to a single snprintf instead of a strcpy + strcat to handle this cleanly. Signed-off-by: Laura Abbott Suggested-by: Kees Cook Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index fac377320158..b3a029ad07cd 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -3474,8 +3474,7 @@ static int ibmvscsis_probe(struct vio_dev *vdev, vscsi->dds.window[LOCAL].liobn, vscsi->dds.window[REMOTE].liobn); - strcpy(vscsi->eye, "VSCSI "); - strncat(vscsi->eye, vdev->name, MAX_EYE); + snprintf(vscsi->eye, sizeof(vscsi->eye), "VSCSI %s", vdev->name); vscsi->dds.unit_id = vdev->unit_address; strncpy(vscsi->dds.partition_name, partition_name, From adad633af7b970bfa5dd1b624a4afc83cac9b235 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 11 Sep 2018 12:22:26 -0700 Subject: [PATCH 056/499] scsi: ibmvscsis: Ensure partition name is properly NUL terminated While reviewing another part of the code, Kees noticed that the strncpy of the partition name might not always be NUL terminated. Switch to using strscpy which does this safely. Reported-by: Kees Cook Signed-off-by: Laura Abbott Reviewed-by: Kees Cook Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index b3a029ad07cd..f42a619198c4 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -3477,7 +3477,7 @@ static int ibmvscsis_probe(struct vio_dev *vdev, snprintf(vscsi->eye, sizeof(vscsi->eye), "VSCSI %s", vdev->name); vscsi->dds.unit_id = vdev->unit_address; - strncpy(vscsi->dds.partition_name, partition_name, + strscpy(vscsi->dds.partition_name, partition_name, sizeof(vscsi->dds.partition_name)); vscsi->dds.partition_num = partition_number; From 83e32a5910772e1475d3640a429b7686695f04d1 Mon Sep 17 00:00:00 2001 From: Xuewei Zhang Date: Thu, 6 Sep 2018 13:37:19 -0700 Subject: [PATCH 057/499] scsi: sd: Contribute to randomness when running rotational device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently a scsi device won't contribute to kernel randomness when it uses blk-mq. Since we commonly use scsi on rotational device with blk-mq, it make sense to keep contributing to kernel randomness in these cases. This is especially important for virtual machines. commit b5b6e8c8d3b4 ("scsi: virtio_scsi: fix IO hang caused by automatic irq vector affinity") made all virtio-scsi device to use blk-mq, which does not contribute to randomness today. So for a virtual machine only having virtio-scsi disk (which is common), it will simple stop getting randomness from its disks in today's implementation. With this patch, if the above VM has rotational virtio-scsi device, then it can still benefit from the entropy generated from the disk. Reported-by: Xuewei Zhang Signed-off-by: Xuewei Zhang Reviewed-by: Ming Lei Reviewed-by: Maciej Żenczykowski Reviewed-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b79b366a94f7..5e4f10d28065 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2959,6 +2959,9 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) if (rot == 1) { blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); + } else { + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q); } if (sdkp->device->type == TYPE_ZBC) { From 3341ba9f0f2637a51738ef5de5114e13a1cd3eca Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 3 Sep 2018 12:26:21 +0200 Subject: [PATCH 058/499] mt76x0: fix remove_interface We wrongly use wcid_mask instead of vif_mask. This creates problems when the interface is removed more than 8 times, for example with iw: iw dev interface add type iw dev del This caused 'ifconfig up' to fail with error: SIOCSIFFLAGS: No space left on device Fixes: 95e444098a7b ("mt76x0: main file") Reported-and-tested-by: Sid Hayn Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo --- drivers/net/wireless/mediatek/mt76/mt76x0/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c index cf6ffb1ba4a2..22bc9d368728 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c @@ -77,9 +77,8 @@ static void mt76x0_remove_interface(struct ieee80211_hw *hw, { struct mt76x0_dev *dev = hw->priv; struct mt76_vif *mvif = (struct mt76_vif *) vif->drv_priv; - unsigned int wcid = mvif->group_wcid.idx; - dev->wcid_mask[wcid / BITS_PER_LONG] &= ~BIT(wcid % BITS_PER_LONG); + dev->vif_mask &= ~BIT(mvif->idx); } static int mt76x0_config(struct ieee80211_hw *hw, u32 changed) From 013ad043906b2befd4a9bfb06219ed9fedd92716 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Sep 2018 21:16:20 -0400 Subject: [PATCH 059/499] dm thin metadata: fix __udivdi3 undefined on 32-bit sector_div() is only viable for use with sector_t. dm_block_t is typedef'd to uint64_t -- so use div_u64() instead. 
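For background, a plain 64-bit division in C can be lowered to a libgcc helper (__udivdi3) on 32-bit targets, and the kernel deliberately does not provide that helper; the div_u64()/do_div() family must be used instead. A minimal illustration, not the dm-thin code:

	/* Sketch only: div_u64() works for any u64 on 32-bit and 64-bit
	 * kernels; sector_div() is reserved for sector_t values. */
	u64 one_tenth(u64 total)
	{
		return div_u64(total, 10);
	}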
Fixes: 3ab918281 ("dm thin metadata: try to avoid ever aborting transactions") Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 74f6770c70b1..20b0776e39ef 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -832,10 +832,8 @@ static void __set_metadata_reserve(struct dm_pool_metadata *pmd) if (r) { DMERR("could not get size of metadata device"); pmd->metadata_reserve = max_blocks; - } else { - sector_div(total, 10); - pmd->metadata_reserve = min(max_blocks, total); - } + } else + pmd->metadata_reserve = min(max_blocks, div_u64(total, 10)); } struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, From 2ea15030c3f0554db695f85e4760da8621c183d3 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 11 Sep 2018 15:06:26 +0200 Subject: [PATCH 060/499] mmc: sdhi: sys_dmac: check for all Gen3 types when whitelisting Fixes: 26eb2607fa28 ("mmc: renesas_sdhi: add eMMC HS400 mode support") Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_sys_dmac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_sys_dmac.c b/drivers/mmc/host/renesas_sdhi_sys_dmac.c index 890f192dedbd..5389c4821882 100644 --- a/drivers/mmc/host/renesas_sdhi_sys_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_sys_dmac.c @@ -498,7 +498,8 @@ static const struct soc_device_attribute gen3_soc_whitelist[] = { static int renesas_sdhi_sys_dmac_probe(struct platform_device *pdev) { - if (of_device_get_match_data(&pdev->dev) == &of_rcar_gen3_compatible && + if ((of_device_get_match_data(&pdev->dev) == &of_rcar_gen3_compatible || + of_device_get_match_data(&pdev->dev) == &of_rcar_r8a7795_compatible) && !soc_device_match(gen3_soc_whitelist)) return -ENODEV; From 324493fba77500592bbaa66421729421f139d4b5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 14 Aug 2018 12:02:31 -0400 Subject: [PATCH 061/499] media: platform: fix cros-ec-cec build error Fix build when MFD_CROS_EC is not enabled but COMPILE_TEST=y. Fixes this build error: ERROR: "cros_ec_cmd_xfer_status" [drivers/media/platform/cros-ec-cec/cros-ec-cec.ko] undefined! Signed-off-by: Randy Dunlap Cc: Neil Armstrong Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 94c1fe0e9787..54fe90acb5b2 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -541,6 +541,8 @@ config VIDEO_CROS_EC_CEC depends on MFD_CROS_EC select CEC_CORE select CEC_NOTIFIER + select CHROME_PLATFORMS + select CROS_EC_PROTO ---help--- If you say yes here you will get support for the ChromeOS Embedded Controller's CEC. From 250ae0d46d857ed61f3e0abf584901e46bd2c638 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 2 Sep 2018 12:01:53 +0300 Subject: [PATCH 062/499] net/mlx5: Fix read from coherent memory Use accessor function READ_ONCE to read from coherent memory modified by the device and read by the driver. This becomes most important in preemptive kernels where cond_resched implementation does not have the side effect which guaranteed the updated value. 
Fixes: 269d26f47f6f ("net/mlx5: Reduce command polling interval") Change-Id: Ie6deeb565ffaf76777b07448c7fbcce3510bbb8a Signed-off-by: Eli Cohen Reported-by: Jesper Dangaard Brouer Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 3ce14d42ddc8..a53736c26c0c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -206,7 +206,7 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent) u8 own; do { - own = ent->lay->status_own; + own = READ_ONCE(ent->lay->status_own); if (!(own & CMD_OWNER_HW)) { ent->ret = 0; return; From 6b359d5550a1ae7a1269c9dc1dd73dfdc4d6fe58 Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Mon, 3 Sep 2018 10:38:14 +0300 Subject: [PATCH 063/499] net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ When modifying hairpin SQ, instead of checking if the next state equals to MLX5_SQC_STATE_RDY, we compare it against the MLX5_RQC_STATE_RDY enum value. The code worked since both of MLX5_RQC_STATE_RDY and MLX5_SQC_STATE_RDY have the same value today. This patch fixes this issue. Fixes: 18e568c390c6 ("net/mlx5: Hairpin pair core object setup") Change-Id: I6758aa7b4bd137966ae28206b70648c5bc223b46 Signed-off-by: Alaa Hleihel Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index dae1c5c5d27c..d2f76070ea7c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -509,7 +509,7 @@ static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); - if (next_state == MLX5_RQC_STATE_RDY) { + if (next_state == MLX5_SQC_STATE_RDY) { MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq); MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca); } From 8f92e35aff9692028279d3c03e88547df6d15020 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 15 Sep 2018 00:50:02 -0700 Subject: [PATCH 064/499] net/mlx5e: TLS, Read capabilities only when it is safe Read TLS caps from the core driver only when TLS is supported, i.e mlx5_accel_is_tls_device returns true. 
Fixes: 790af90c00d2 ("net/mlx5e: TLS, build TLS netdev from capabilities") Change-Id: I5f21ff4d684901af487e366a7e0cf032b54ee9cf Reported-by: Michal Kubecek Signed-off-by: Saeed Mahameed Reviewed-by: Boris Pismenny Reviewed-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c index eddd7702680b..e88340e196f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c @@ -183,12 +183,13 @@ static const struct tlsdev_ops mlx5e_tls_ops = { void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) { - u32 caps = mlx5_accel_tls_device_caps(priv->mdev); struct net_device *netdev = priv->netdev; + u32 caps; if (!mlx5_accel_is_tls_device(priv->mdev)) return; + caps = mlx5_accel_tls_device_caps(priv->mdev); if (caps & MLX5_ACCEL_TLS_TX) { netdev->features |= NETIF_F_HW_TLS_TX; netdev->hw_features |= NETIF_F_HW_TLS_TX; From b592211c33f745af67a3271ce77c10fc1e6d6241 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 17 Sep 2018 11:38:47 -0400 Subject: [PATCH 065/499] dm mpath: fix attached_handler_name leak and dangling hw_handler_name pointer Commit e8f74a0f0011 ("dm mpath: eliminate need to use scsi_device_from_queue") introduced 2 regressions: 1) memory leak occurs if attached_handler_name is not assigned to m->hw_handler_name 2) m->hw_handler_name can become a dangling pointer if the RETAIN_ATTACHED_HW_HANDLER flag is set and scsi_dh_attach() returns -EBUSY. Fix both of these by clearing 'attached_handler_name' pointer passed to setup_scsi_dh() after it is assigned to m->hw_handler_name. And if setup_scsi_dh() doesn't consume 'attached_handler_name' parse_path() will kfree() it. Fixes: e8f74a0f0011 ("dm mpath: eliminate need to use scsi_device_from_queue") Cc: stable@vger.kernel.org # 4.16+ Reported-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d94ba6f72ff5..419362c2d8ac 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -806,19 +806,19 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, } static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, - const char *attached_handler_name, char **error) + const char **attached_handler_name, char **error) { struct request_queue *q = bdev_get_queue(bdev); int r; if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { retain: - if (attached_handler_name) { + if (*attached_handler_name) { /* * Clear any hw_handler_params associated with a * handler that isn't already attached. */ - if (m->hw_handler_name && strcmp(attached_handler_name, m->hw_handler_name)) { + if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) { kfree(m->hw_handler_params); m->hw_handler_params = NULL; } @@ -830,7 +830,8 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, * handler instead of the original table passed in. 
*/ kfree(m->hw_handler_name); - m->hw_handler_name = attached_handler_name; + m->hw_handler_name = *attached_handler_name; + *attached_handler_name = NULL; } } @@ -867,7 +868,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps struct pgpath *p; struct multipath *m = ti->private; struct request_queue *q; - const char *attached_handler_name; + const char *attached_handler_name = NULL; /* we need at least a path arg */ if (as->argc < 1) { @@ -890,7 +891,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); if (attached_handler_name || m->hw_handler_name) { INIT_DELAYED_WORK(&p->activate_path, activate_path_work); - r = setup_scsi_dh(p->path.dev->bdev, m, attached_handler_name, &ti->error); + r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); if (r) { dm_put_device(ti, p->path.dev); goto bad; @@ -905,6 +906,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps return p; bad: + kfree(attached_handler_name); free_pgpath(p); return ERR_PTR(r); } From 0328ba90407c82874f9f39f9a5a6e5ad6cb76f44 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 17 Sep 2018 10:02:34 +0200 Subject: [PATCH 066/499] dm raid: remove bogus const from decipher_sync_action() return type With gcc-4.1.2: drivers/md/dm-raid.c:3357: warning: type qualifiers ignored on function return type Remove the "const" keyword to fix this. Fixes: 36a240a706d43383 ("dm raid: fix RAID leg rebuild errors") Signed-off-by: Geert Uytterhoeven Acked-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5ba067fa0c72..c44925e4e481 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3353,7 +3353,7 @@ static const char *sync_str(enum sync_state state) }; /* Return enum sync_state for @mddev derived from @recovery flags */ -static const enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery) +static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long recovery) { if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return st_frozen; From ffc4c92227db5699493e43eb140b4cb5904c30ff Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 18 Sep 2018 00:36:36 -0400 Subject: [PATCH 067/499] sysfs: Do not return POSIX ACL xattrs via listxattr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 786534b92f3c introduced a regression that caused listxattr to return the POSIX ACL attribute names even though sysfs doesn't support POSIX ACLs. This happens because simple_xattr_list checks for NULL i_acl / i_default_acl, but inode_init_always initializes those fields to ACL_NOT_CACHED ((void *)-1). For example: $ getfattr -m- -d /sys /sys: system.posix_acl_access: Operation not supported /sys: system.posix_acl_default: Operation not supported Fix this in simple_xattr_list by checking if the filesystem supports POSIX ACLs. 
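To make the failure mode concrete, a compressed illustration (not code from the patch, and assuming CONFIG_FS_POSIX_ACL so the i_acl field exists): a freshly initialized inode carries the ACL_NOT_CACHED sentinel rather than NULL, so a plain pointer test passes even on filesystems with no ACL support.

#include <linux/fs.h>

/* Illustration only: mimics the state inode_init_always() leaves behind,
 * where i_acl == ACL_NOT_CACHED, i.e. ((void *)-1), not NULL. */
static bool old_check_would_list_acl_names(struct inode *inode)
{
	/* True for every fresh inode, so the system.posix_acl_* names were
	 * listed even on filesystems that never supported POSIX ACLs. */
	return inode->i_acl != NULL;
}
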
Fixes: 786534b92f3c ("tmpfs: listxattr should include POSIX ACL xattrs") Reported-by: Marc Aurèle La France Tested-by: Marc Aurèle La France Signed-off-by: Andreas Gruenbacher Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Al Viro --- fs/xattr.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index daa732550088..0d6a6a4af861 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -948,17 +948,19 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, int err = 0; #ifdef CONFIG_FS_POSIX_ACL - if (inode->i_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_ACCESS); - if (err) - return err; - } - if (inode->i_default_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_DEFAULT); - if (err) - return err; + if (IS_POSIXACL(inode)) { + if (inode->i_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + if (inode->i_default_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } } #endif From 51c3c62b58b357e8d35e4cc32f7b4ec907426fe3 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 14 Sep 2018 11:14:11 +1000 Subject: [PATCH 068/499] powerpc: Avoid code patching freed init sections This stops us from doing code patching in init sections after they've been freed. In this chain: kvm_guest_init() -> kvm_use_magic_page() -> fault_in_pages_readable() -> __get_user() -> __get_user_nocheck() -> barrier_nospec(); We have a code patching location at barrier_nospec() and kvm_guest_init() is an init function. This whole chain gets inlined, so when we free the init section (hence kvm_guest_init()), this code goes away and hence should no longer be patched. We seen this as userspace memory corruption when using a memory checker while doing partition migration testing on powervm (this starts the code patching post migration via /sys/kernel/mobility/migration). In theory, it could also happen when using /sys/kernel/debug/powerpc/barrier_nospec. 
Cc: stable@vger.kernel.org # 4.13+ Signed-off-by: Michael Neuling Reviewed-by: Nicholas Piggin Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/setup.h | 1 + arch/powerpc/lib/code-patching.c | 6 ++++++ arch/powerpc/mm/mem.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 1a951b00465d..1fffbba8d6a5 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -9,6 +9,7 @@ extern void ppc_printk_progress(char *s, unsigned short hex); extern unsigned int rtas_data; extern unsigned long long memory_limit; +extern bool init_mem_is_free; extern unsigned long klimit; extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 850f3b8f4da5..6ae2777c220d 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -28,6 +28,12 @@ static int __patch_instruction(unsigned int *exec_addr, unsigned int instr, { int err; + /* Make sure we aren't patching a freed init section */ + if (init_mem_is_free && init_section_contains(exec_addr, 4)) { + pr_debug("Skipping init section patching addr: 0x%px\n", exec_addr); + return 0; + } + __put_user_size(instr, patch_addr, 4, err); if (err) return err; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5c8530d0c611..04ccb274a620 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -63,6 +63,7 @@ #endif unsigned long long memory_limit; +bool init_mem_is_free; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; @@ -396,6 +397,7 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); + init_mem_is_free = true; free_initmem_default(POISON_FREE_INITMEM); } From e0bf2d4982fe7d9ddaf550dd023803ea286f47fc Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 26 Aug 2018 19:49:32 +0200 Subject: [PATCH 069/499] serial: mvebu-uart: Fix reporting of effective CSIZE to userspace Apparently, this driver (or the hardware) does not support character length settings. It's apparently running in 8-bit mode, but it makes userspace believe it's in 5-bit mode. That makes tcsetattr with CS8 incorrectly fail, breaking e.g. getty from busybox, thus the login shell on ttyMVx. Fix by hard-wiring CS8 into c_cflag. Signed-off-by: Jan Kiszka Fixes: 30530791a7a0 ("serial: mvebu-uart: initial support for Armada-3700 serial port") Cc: stable # 4.6+ Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mvebu-uart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c index d04b5eeea3c6..170e446a2f62 100644 --- a/drivers/tty/serial/mvebu-uart.c +++ b/drivers/tty/serial/mvebu-uart.c @@ -511,6 +511,7 @@ static void mvebu_uart_set_termios(struct uart_port *port, termios->c_iflag |= old->c_iflag & ~(INPCK | IGNPAR); termios->c_cflag &= CREAD | CBAUD; termios->c_cflag |= old->c_cflag & ~(CREAD | CBAUD); + termios->c_cflag |= CS8; } spin_unlock_irqrestore(&port->lock, flags); From 3216c622a24b0ebb9c159a8d1daf7f17a106b3f5 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Tue, 28 Aug 2018 12:44:24 +0200 Subject: [PATCH 070/499] tty: serial: lpuart: avoid leaking struct tty_struct The function tty_port_tty_get() gets a reference to the tty. Since the code is not using tty_port_tty_set(), the reference is kept even after closing the tty. Avoid using tty_port_tty_get() by directly access the tty instance. 
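For reference, the balanced pattern looks roughly like this (sketch only, the helper name is invented and this is not the lpuart code); the leak described above comes from taking the reference without ever dropping it:

#include <linux/tty.h>

static void use_tty_briefly(struct tty_port *tport)
{
	struct tty_struct *tty = tty_port_tty_get(tport);	/* takes a kref */

	if (!tty)
		return;
	tty_get_baud_rate(tty);		/* example access while the reference is held */
	tty_kref_put(tty);		/* without this put, the tty is never released */
}
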
Since lpuart_start_rx_dma() is called from the .startup() and .set_termios() callback, it is safe to assume the tty instance is valid. Cc: stable@vger.kernel.org # v4.9+ Fixes: 5887ad43ee02 ("tty: serial: fsl_lpuart: Use cyclic DMA for Rx") Signed-off-by: Stefan Agner Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/fsl_lpuart.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 51e47a63d61a..3f8d1274fc85 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -979,7 +979,8 @@ static inline int lpuart_start_rx_dma(struct lpuart_port *sport) struct circ_buf *ring = &sport->rx_ring; int ret, nent; int bits, baud; - struct tty_struct *tty = tty_port_tty_get(&sport->port.state->port); + struct tty_port *port = &sport->port.state->port; + struct tty_struct *tty = port->tty; struct ktermios *termios = &tty->termios; baud = tty_get_baud_rate(tty); From be28c1e3ca29887e207f0cbcd294cefe5074bab6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Sep 2018 10:32:50 +0000 Subject: [PATCH 071/499] serial: cpm_uart: return immediately from console poll kgdb expects poll function to return immediately and returning NO_POLL_CHAR when no character is available. Fixes: f5316b4aea024 ("kgdb,8250,pl011: Return immediately from console poll") Cc: Jason Wessel Cc: Signed-off-by: Christophe Leroy Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/cpm_uart/cpm_uart_core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c b/drivers/tty/serial/cpm_uart/cpm_uart_core.c index 24a5f05e769b..e5389591bb4f 100644 --- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c +++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c @@ -1054,8 +1054,8 @@ static int poll_wait_key(char *obuf, struct uart_cpm_port *pinfo) /* Get the address of the host memory buffer. */ bdp = pinfo->rx_cur; - while (bdp->cbd_sc & BD_SC_EMPTY) - ; + if (bdp->cbd_sc & BD_SC_EMPTY) + return NO_POLL_CHAR; /* If the buffer address is in the CPM DPRAM, don't * convert it. @@ -1090,7 +1090,11 @@ static int cpm_get_poll_char(struct uart_port *port) poll_chars = 0; } if (poll_chars <= 0) { - poll_chars = poll_wait_key(poll_buf, pinfo); + int ret = poll_wait_key(poll_buf, pinfo); + + if (ret == NO_POLL_CHAR) + return ret; + poll_chars = ret; pollp = poll_buf; } poll_chars--; From fe32416790093b31364c08395727de17ec96ace1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 18 Sep 2018 00:52:52 +0100 Subject: [PATCH 072/499] tty: Drop tty->count on tty_reopen() failure In case of tty_ldisc_reinit() failure, tty->count should be decremented back, otherwise we will never release_tty(). 
Tetsuo reported that it fixes noisy warnings on tty release like: pts pts4033: tty_release: tty->count(10529) != (#fd's(7) + #kopen's(0)) Fixes: commit 892d1fa7eaae ("tty: Destroy ldisc instance on hangup") Cc: stable@vger.kernel.org # v4.6+ Cc: Greg Kroah-Hartman Cc: Jiri Slaby Reviewed-by: Jiri Slaby Tested-by: Jiri Slaby Tested-by: Mark Rutland Tested-by: Tetsuo Handa Signed-off-by: Dmitry Safonov Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 32bc3e3fe4d3..5e5da9acaf0a 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1255,6 +1255,7 @@ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct * static int tty_reopen(struct tty_struct *tty) { struct tty_driver *driver = tty->driver; + int retval; if (driver->type == TTY_DRIVER_TYPE_PTY && driver->subtype == PTY_TYPE_MASTER) @@ -1268,10 +1269,14 @@ static int tty_reopen(struct tty_struct *tty) tty->count++; - if (!tty->ldisc) - return tty_ldisc_reinit(tty, tty->termios.c_line); + if (tty->ldisc) + return 0; - return 0; + retval = tty_ldisc_reinit(tty, tty->termios.c_line); + if (retval) + tty->count--; + + return retval; } /** From e97267cb4d1ee01ca0929638ec0fcbb0904f903d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 16 Aug 2018 15:30:38 -0500 Subject: [PATCH 073/499] tty: vt_ioctl: fix potential Spectre v1 vsa.console is indirectly controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: drivers/tty/vt/vt_ioctl.c:711 vt_ioctl() warn: potential spectre issue 'vc_cons' [r] Fix this by sanitizing vsa.console before using it to index vc_cons Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Reviewed-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt_ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index a78ad10a119b..73cdc0d633dd 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -32,6 +32,8 @@ #include #include +#include + #include #include #include @@ -700,6 +702,8 @@ int vt_ioctl(struct tty_struct *tty, if (vsa.console == 0 || vsa.console > MAX_NR_CONSOLES) ret = -ENXIO; else { + vsa.console = array_index_nospec(vsa.console, + MAX_NR_CONSOLES + 1); vsa.console--; console_lock(); ret = vc_allocate(vsa.console); From 8801922cd94c918e4dc3a108ecaa500c4d40583f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Sep 2018 16:10:47 +0300 Subject: [PATCH 074/499] intel_th: Fix device removal logic Commit a753bfcfdb1f ("intel_th: Make the switch allocate its subdevices") brings in new subdevice addition/removal logic that's broken for "host mode": the SWITCH device has no children to begin with, which is not handled in the code. This results in a null dereference bug later down the path. This patch fixes the subdevice removal code to handle host mode correctly. 
Signed-off-by: Alexander Shishkin Fixes: a753bfcfdb1f ("intel_th: Make the switch allocate its subdevices") CC: stable@vger.kernel.org # v4.14+ Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c index da962aa2cef5..4e70ecee2103 100644 --- a/drivers/hwtracing/intel_th/core.c +++ b/drivers/hwtracing/intel_th/core.c @@ -139,7 +139,8 @@ static int intel_th_remove(struct device *dev) th->thdev[i] = NULL; } - th->num_thdevs = lowest; + if (lowest >= 0) + th->num_thdevs = lowest; } if (thdrv->attr_group) From ebe4582281d6e90972f057318a6edea14810ea48 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Sep 2018 16:10:48 +0300 Subject: [PATCH 075/499] intel_th: Fix resource handling for ACPI glue layer The core of the driver expects the resource array from the glue layer to be indexed by even numbers, as is the case for 64-bit PCI resources. This doesn't hold true for others, ACPI in this instance, which leads to an out-of-bounds access and an ioremap() on whatever address that access fetches. This patch fixes the problem by reading resource array differently based on whether the 64-bit flag is set, which would indicate PCI glue layer. Signed-off-by: Alexander Shishkin Fixes: ebc57e399b8e ("intel_th: Add ACPI glue layer") CC: stable@vger.kernel.org # v4.17+ Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c index 4e70ecee2103..fc6b7f8b62fb 100644 --- a/drivers/hwtracing/intel_th/core.c +++ b/drivers/hwtracing/intel_th/core.c @@ -488,7 +488,7 @@ static const struct intel_th_subdevice { .flags = IORESOURCE_MEM, }, { - .start = TH_MMIO_SW, + .start = 1, /* use resource[1] */ .end = 0, .flags = IORESOURCE_MEM, }, @@ -581,6 +581,7 @@ intel_th_subdevice_alloc(struct intel_th *th, struct intel_th_device *thdev; struct resource res[3]; unsigned int req = 0; + bool is64bit = false; int r, err; thdev = intel_th_device_alloc(th, subdev->type, subdev->name, @@ -590,12 +591,18 @@ intel_th_subdevice_alloc(struct intel_th *th, thdev->drvdata = th->drvdata; + for (r = 0; r < th->num_resources; r++) + if (th->resource[r].flags & IORESOURCE_MEM_64) { + is64bit = true; + break; + } + memcpy(res, subdev->res, sizeof(struct resource) * subdev->nres); for (r = 0; r < subdev->nres; r++) { struct resource *devres = th->resource; - int bar = TH_MMIO_CONFIG; + int bar = 0; /* cut subdevices' MMIO from resource[0] */ /* * Take .end == 0 to mean 'take the whole bar', @@ -604,6 +611,8 @@ intel_th_subdevice_alloc(struct intel_th *th, */ if (!res[r].end && res[r].flags == IORESOURCE_MEM) { bar = res[r].start; + if (is64bit) + bar *= 2; res[r].start = 0; res[r].end = resource_size(&devres[bar]) - 1; } From 59d08d00d43c644ee2011d7ff1807bdd69f31fe0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Sep 2018 16:10:49 +0300 Subject: [PATCH 076/499] intel_th: pci: Add Ice Lake PCH support This adds Intel(R) Trace Hub PCI ID for Ice Lake PCH. 
Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index c2e55e5d97f6..1cf6290d6435 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -160,6 +160,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x18e1), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Ice Lake PCH */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x34a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { 0 }, }; From 235fe0851bcf1d6ffd6cef4c7eb0d701d0946053 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 12 Sep 2018 11:55:26 -0500 Subject: [PATCH 077/499] MAINTAINERS: Update PPC contacts for PCI core error handling The original PCI error recovery functionality was for the powerpc-specific IBM EEH feature. PCIe subsequently added some similar features, including AER and DPC, that can be used on any architecture. We want the generic PCI core error handling support to work with all of these features. Driver error recovery callbacks should be independent of which feature the platform provides. Add the generic PCI core error recovery files to the powerpc EEH MAINTAINERS entry so the powerpc folks will be copied on changes to the generic PCI error handling strategy. Add Sam and Oliver as maintainers for this area. Signed-off-by: Bjorn Helgaas Acked-by: Russell Currey Acked-by: Sam Bobroff --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4ece30f15777..f05d2e9ffc84 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11203,8 +11203,14 @@ F: tools/pci/ PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC M: Russell Currey +M: Sam Bobroff +M: Oliver O'Halloran L: linuxppc-dev@lists.ozlabs.org S: Supported +F: Documentation/PCI/pci-error-recovery.txt +F: drivers/pci/pcie/aer.c +F: drivers/pci/pcie/dpc.c +F: drivers/pci/pcie/err.c F: Documentation/powerpc/eeh-pci-error-recovery.txt F: arch/powerpc/kernel/eeh*.c F: arch/powerpc/platforms/*/eeh*.c From b8a946d8dc245ec3adae65c8eab6f3e73a950af3 Mon Sep 17 00:00:00 2001 From: Song Qiang Date: Mon, 17 Sep 2018 11:08:01 -0700 Subject: [PATCH 078/499] Input: gpio-keys - fix a documentation index issue gpio_keys.c now exists in the drivers/input/keyboard/ rather than drivers/input/. Signed-off-by: Song Qiang Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/input/gpio-keys.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/input/gpio-keys.txt b/Documentation/devicetree/bindings/input/gpio-keys.txt index 996ce84352cb..7cccc49b6bea 100644 --- a/Documentation/devicetree/bindings/input/gpio-keys.txt +++ b/Documentation/devicetree/bindings/input/gpio-keys.txt @@ -1,4 +1,4 @@ -Device-Tree bindings for input/gpio_keys.c keyboard driver +Device-Tree bindings for input/keyboard/gpio_keys.c keyboard driver Required properties: - compatible = "gpio-keys"; From 49f62249a9577b0f8c20c7c843d23289d143daf1 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Mon, 17 Sep 2018 11:08:52 -0700 Subject: [PATCH 079/499] Input: egalax_ts - add system wakeup support This patch adds wakeup function support for egalax touch screen, if "wakeup-source" is added to device tree's egalax touch screen node, the wakeup function will be enabled, and egalax touch screen will be able to wakeup system from suspend. 
Signed-off-by: Anson Huang Reviewed-by: Fabio Estevam Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/egalax_ts.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/touchscreen/egalax_ts.c b/drivers/input/touchscreen/egalax_ts.c index 80e69bb8283e..83ac8c128192 100644 --- a/drivers/input/touchscreen/egalax_ts.c +++ b/drivers/input/touchscreen/egalax_ts.c @@ -241,6 +241,9 @@ static int __maybe_unused egalax_ts_suspend(struct device *dev) struct i2c_client *client = to_i2c_client(dev); int ret; + if (device_may_wakeup(dev)) + return enable_irq_wake(client->irq); + ret = i2c_master_send(client, suspend_cmd, MAX_I2C_DATA_LEN); return ret > 0 ? 0 : ret; } @@ -249,6 +252,9 @@ static int __maybe_unused egalax_ts_resume(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); + if (device_may_wakeup(dev)) + return disable_irq_wake(client->irq); + return egalax_wake_up_device(client); } From 9e62df51be993035c577371ffee5477697a56aad Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 17 Sep 2018 12:43:34 -0700 Subject: [PATCH 080/499] Input: atakbd - fix Atari keymap Fix errors in Atari keymap (mostly in keypad, help and undo keys). Patch provided on debian-68k ML by Andreas Schwab , keymap array size and unhandled scancode limit adjusted to 0x73 by me. Tested-by: Michael Schmitz Signed-off-by: Michael Schmitz Signed-off-by: Andreas Schwab Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/atakbd.c | 64 ++++++++++++++------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/drivers/input/keyboard/atakbd.c b/drivers/input/keyboard/atakbd.c index 6f62da2909ec..e989574a2e20 100644 --- a/drivers/input/keyboard/atakbd.c +++ b/drivers/input/keyboard/atakbd.c @@ -75,8 +75,7 @@ MODULE_LICENSE("GPL"); */ -static unsigned char atakbd_keycode[0x72] = { /* American layout */ - [0] = KEY_GRAVE, +static unsigned char atakbd_keycode[0x73] = { /* American layout */ [1] = KEY_ESC, [2] = KEY_1, [3] = KEY_2, @@ -117,9 +116,9 @@ static unsigned char atakbd_keycode[0x72] = { /* American layout */ [38] = KEY_L, [39] = KEY_SEMICOLON, [40] = KEY_APOSTROPHE, - [41] = KEY_BACKSLASH, /* FIXME, '#' */ + [41] = KEY_GRAVE, [42] = KEY_LEFTSHIFT, - [43] = KEY_GRAVE, /* FIXME: '~' */ + [43] = KEY_BACKSLASH, [44] = KEY_Z, [45] = KEY_X, [46] = KEY_C, @@ -145,45 +144,34 @@ static unsigned char atakbd_keycode[0x72] = { /* American layout */ [66] = KEY_F8, [67] = KEY_F9, [68] = KEY_F10, - [69] = KEY_ESC, - [70] = KEY_DELETE, - [71] = KEY_KP7, - [72] = KEY_KP8, - [73] = KEY_KP9, + [71] = KEY_HOME, + [72] = KEY_UP, [74] = KEY_KPMINUS, - [75] = KEY_KP4, - [76] = KEY_KP5, - [77] = KEY_KP6, + [75] = KEY_LEFT, + [77] = KEY_RIGHT, [78] = KEY_KPPLUS, - [79] = KEY_KP1, - [80] = KEY_KP2, - [81] = KEY_KP3, - [82] = KEY_KP0, - [83] = KEY_KPDOT, - [90] = KEY_KPLEFTPAREN, - [91] = KEY_KPRIGHTPAREN, - [92] = KEY_KPASTERISK, /* FIXME */ - [93] = KEY_KPASTERISK, - [94] = KEY_KPPLUS, - [95] = KEY_HELP, + [80] = KEY_DOWN, + [82] = KEY_INSERT, + [83] = KEY_DELETE, [96] = KEY_102ND, - [97] = KEY_KPASTERISK, /* FIXME */ - [98] = KEY_KPSLASH, + [97] = KEY_UNDO, + [98] = KEY_HELP, [99] = KEY_KPLEFTPAREN, [100] = KEY_KPRIGHTPAREN, [101] = KEY_KPSLASH, [102] = KEY_KPASTERISK, - [103] = KEY_UP, - [104] = KEY_KPASTERISK, /* FIXME */ - [105] = KEY_LEFT, - [106] = KEY_RIGHT, - [107] = KEY_KPASTERISK, /* FIXME */ - [108] = KEY_DOWN, - [109] = KEY_KPASTERISK, /* FIXME */ - [110] = KEY_KPASTERISK, /* FIXME */ - [111] = KEY_KPASTERISK, /* FIXME */ - [112] = KEY_KPASTERISK, /* FIXME */ - 
[113] = KEY_KPASTERISK /* FIXME */ + [103] = KEY_KP7, + [104] = KEY_KP8, + [105] = KEY_KP9, + [106] = KEY_KP4, + [107] = KEY_KP5, + [108] = KEY_KP6, + [109] = KEY_KP1, + [110] = KEY_KP2, + [111] = KEY_KP3, + [112] = KEY_KP0, + [113] = KEY_KPDOT, + [114] = KEY_KPENTER, }; static struct input_dev *atakbd_dev; @@ -191,7 +179,7 @@ static struct input_dev *atakbd_dev; static void atakbd_interrupt(unsigned char scancode, char down) { - if (scancode < 0x72) { /* scancodes < 0xf2 are keys */ + if (scancode < 0x73) { /* scancodes < 0xf3 are keys */ // report raw events here? @@ -205,7 +193,7 @@ static void atakbd_interrupt(unsigned char scancode, char down) input_report_key(atakbd_dev, scancode, down); input_sync(atakbd_dev); } - } else /* scancodes >= 0xf2 are mouse data, most likely */ + } else /* scancodes >= 0xf3 are mouse data, most likely */ printk(KERN_INFO "atakbd: unhandled scancode %x\n", scancode); return; From 52d2c7bf7c90217fbe875d2d76f310979c48eb83 Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Mon, 17 Sep 2018 15:27:49 -0700 Subject: [PATCH 081/499] Input: atakbd - fix Atari CapsLock behaviour The CapsLock key on Atari keyboards is not a toggle, it does send the normal make and break scancodes. Drop the CapsLock toggle handling code, which did cause the CapsLock key to merely act as a Shift key. Tested-by: Michael Schmitz Signed-off-by: Michael Schmitz Signed-off-by: Andreas Schwab Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/atakbd.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/input/keyboard/atakbd.c b/drivers/input/keyboard/atakbd.c index e989574a2e20..6caee807cafa 100644 --- a/drivers/input/keyboard/atakbd.c +++ b/drivers/input/keyboard/atakbd.c @@ -185,14 +185,8 @@ static void atakbd_interrupt(unsigned char scancode, char down) scancode = atakbd_keycode[scancode]; - if (scancode == KEY_CAPSLOCK) { /* CapsLock is a toggle switch key on Amiga */ - input_report_key(atakbd_dev, scancode, 1); - input_report_key(atakbd_dev, scancode, 0); - input_sync(atakbd_dev); - } else { - input_report_key(atakbd_dev, scancode, down); - input_sync(atakbd_dev); - } + input_report_key(atakbd_dev, scancode, down); + input_sync(atakbd_dev); } else /* scancodes >= 0xf3 are mouse data, most likely */ printk(KERN_INFO "atakbd: unhandled scancode %x\n", scancode); From 91a97507323e1ad4bfc10f4a5922e67cdaf8b3cd Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Tue, 18 Sep 2018 09:32:22 -0700 Subject: [PATCH 082/499] Input: elantech - enable middle button of touchpad on ThinkPad P72 Adding 2 new touchpad IDs to support middle button support. Cc: stable@vger.kernel.org Signed-off-by: Aaron Ma Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elantech.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 44f57cf6675b..2d95e8d93cc7 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1178,6 +1178,8 @@ static const struct dmi_system_id elantech_dmi_has_middle_button[] = { static const char * const middle_button_pnp_ids[] = { "LEN2131", /* ThinkPad P52 w/ NFC */ "LEN2132", /* ThinkPad P52 */ + "LEN2133", /* ThinkPad P72 w/ NFC */ + "LEN2134", /* ThinkPad P72 */ NULL }; From 4fef1250eafddc5182cd5c3c354a6971bcf7520d Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Tue, 18 Sep 2018 09:53:32 -0700 Subject: [PATCH 083/499] Input: uinput - allow for max == min during input_absinfo validation These values are inclusive, so a range of 1 requires min == max. 
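For instance (a hypothetical device, not taken from the patch), an absolute axis that only ever reports a single value is described with minimum equal to maximum, and the old "max <= min" test rejected it:

#include <linux/uinput.h>
#include <sys/ioctl.h>

/* Hypothetical axis: a device whose pressure axis reports one fixed value. */
static int setup_fixed_pressure_axis(int fd)
{
	struct uinput_abs_setup abs = {
		.code = ABS_PRESSURE,
		.absinfo = {
			.minimum = 50,
			.maximum = 50,	/* inclusive range of exactly one value */
		},
	};

	/* rejected with -EINVAL by the old "max <= min" check */
	return ioctl(fd, UI_ABS_SETUP, &abs);
}
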
Signed-off-by: Peter Hutterer Reviewed-by: Martin Kepplinger Signed-off-by: Dmitry Torokhov --- drivers/input/misc/uinput.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 96a887f33698..eb14ddf69346 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -410,7 +410,7 @@ static int uinput_validate_absinfo(struct input_dev *dev, unsigned int code, min = abs->minimum; max = abs->maximum; - if ((min != 0 || max != 0) && max <= min) { + if ((min != 0 || max != 0) && max < min) { printk(KERN_DEBUG "%s: invalid abs[%02x] min:%d max:%d\n", UINPUT_NAME, code, min, max); From 19a4fbffc94e41abaa2a623a25ce2641d69eccf0 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Thu, 13 Sep 2018 15:37:04 +0200 Subject: [PATCH 084/499] gpiolib: Free the last requested descriptor The current code only frees N-1 gpios if an error occurs during gpiod_set_transitory, gpiod_direction_output or gpiod_direction_input. Leading to gpios that cannot be used by userspace nor other drivers. Cc: Timur Tabi Cc: stable@vger.kernel.org Fixes: ab3dbcf78f60f46d ("gpioib: do not free unrequested descriptors) Reported-by: Jan Lorenzen Reported-by: Jim Paris Signed-off-by: Ricardo Ribalda Delgado Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index e8f8a1999393..a57300c1d649 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -571,7 +571,7 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip) if (ret) goto out_free_descs; lh->descs[i] = desc; - count = i; + count = i + 1; if (lflags & GPIOHANDLE_REQUEST_ACTIVE_LOW) set_bit(FLAG_ACTIVE_LOW, &desc->flags); From 8fce3331702316d4bcfeb0771c09ac75d2192bbc Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 17 Sep 2018 09:22:56 +0100 Subject: [PATCH 085/499] net: stmmac: Rework coalesce timer and fix multi-queue races This follows David Miller advice and tries to fix coalesce timer in multi-queue scenarios. We are now using per-queue coalesce values and per-queue TX timer. Coalesce timer default values was changed to 1ms and the coalesce frames to 25. Tested in B2B setup between XGMAC2 and GMAC5. Signed-off-by: Jose Abreu Fixes: ce736788e8a ("net: stmmac: adding multiple buffers for TX") Cc: Florian Fainelli Cc: Neil Armstrong Cc: Jerome Brunet Cc: Martin Blumenstingl Cc: David S. Miller Cc: Joao Pinto Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 4 +- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 14 +- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 233 ++++++++++-------- include/linux/stmmac.h | 1 + 4 files changed, 146 insertions(+), 106 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 1854f270ad66..b1b305f8f414 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -258,10 +258,10 @@ struct stmmac_safety_stats { #define MAX_DMA_RIWT 0xff #define MIN_DMA_RIWT 0x20 /* Tx coalesce parameters */ -#define STMMAC_COAL_TX_TIMER 40000 +#define STMMAC_COAL_TX_TIMER 1000 #define STMMAC_MAX_COAL_TX_TICK 100000 #define STMMAC_TX_MAX_FRAMES 256 -#define STMMAC_TX_FRAMES 64 +#define STMMAC_TX_FRAMES 25 /* Packets types */ enum packets_types { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index c0a855b7ab3b..63e1064b27a2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -48,6 +48,8 @@ struct stmmac_tx_info { /* Frequently used values are kept adjacent for cache effect */ struct stmmac_tx_queue { + u32 tx_count_frames; + struct timer_list txtimer; u32 queue_index; struct stmmac_priv *priv_data; struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp; @@ -73,7 +75,14 @@ struct stmmac_rx_queue { u32 rx_zeroc_thresh; dma_addr_t dma_rx_phy; u32 rx_tail_addr; +}; + +struct stmmac_channel { struct napi_struct napi ____cacheline_aligned_in_smp; + struct stmmac_priv *priv_data; + u32 index; + int has_rx; + int has_tx; }; struct stmmac_tc_entry { @@ -109,14 +118,12 @@ struct stmmac_pps_cfg { struct stmmac_priv { /* Frequently used values are kept adjacent for cache effect */ - u32 tx_count_frames; u32 tx_coal_frames; u32 tx_coal_timer; int tx_coalesce; int hwts_tx_en; bool tx_path_in_lpi_mode; - struct timer_list txtimer; bool tso; unsigned int dma_buf_sz; @@ -137,6 +144,9 @@ struct stmmac_priv { /* TX Queue */ struct stmmac_tx_queue tx_queue[MTL_MAX_TX_QUEUES]; + /* Generic channel for NAPI */ + struct stmmac_channel channel[STMMAC_CH_MAX]; + bool oldlink; int speed; int oldduplex; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 9f458bb16f2a..ab9cc0143ff2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -148,12 +148,14 @@ static void stmmac_verify_args(void) static void stmmac_disable_all_queues(struct stmmac_priv *priv) { u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u32 tx_queues_cnt = priv->plat->tx_queues_to_use; + u32 maxq = max(rx_queues_cnt, tx_queues_cnt); u32 queue; - for (queue = 0; queue < rx_queues_cnt; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; - napi_disable(&rx_q->napi); + napi_disable(&ch->napi); } } @@ -164,12 +166,14 @@ static void stmmac_disable_all_queues(struct stmmac_priv *priv) static void stmmac_enable_all_queues(struct stmmac_priv *priv) { u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u32 tx_queues_cnt = priv->plat->tx_queues_to_use; + u32 maxq = max(rx_queues_cnt, tx_queues_cnt); u32 queue; - for (queue = 0; queue < rx_queues_cnt; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + for (queue = 0; queue < maxq; queue++) { 
+ struct stmmac_channel *ch = &priv->channel[queue]; - napi_enable(&rx_q->napi); + napi_enable(&ch->napi); } } @@ -1843,18 +1847,18 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) * @queue: TX queue index * Description: it reclaims the transmit resources after transmission completes. */ -static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue) +static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) { struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; unsigned int bytes_compl = 0, pkts_compl = 0; - unsigned int entry; + unsigned int entry, count = 0; - netif_tx_lock(priv->dev); + __netif_tx_lock_bh(netdev_get_tx_queue(priv->dev, queue)); priv->xstats.tx_clean++; entry = tx_q->dirty_tx; - while (entry != tx_q->cur_tx) { + while ((entry != tx_q->cur_tx) && (count < budget)) { struct sk_buff *skb = tx_q->tx_skbuff[entry]; struct dma_desc *p; int status; @@ -1870,6 +1874,8 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue) if (unlikely(status & tx_dma_own)) break; + count++; + /* Make sure descriptor fields are read after reading * the own bit. */ @@ -1937,7 +1943,10 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue) stmmac_enable_eee_mode(priv); mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer)); } - netif_tx_unlock(priv->dev); + + __netif_tx_unlock_bh(netdev_get_tx_queue(priv->dev, queue)); + + return count; } /** @@ -2020,6 +2029,33 @@ static bool stmmac_safety_feat_interrupt(struct stmmac_priv *priv) return false; } +static int stmmac_napi_check(struct stmmac_priv *priv, u32 chan) +{ + int status = stmmac_dma_interrupt_status(priv, priv->ioaddr, + &priv->xstats, chan); + struct stmmac_channel *ch = &priv->channel[chan]; + bool needs_work = false; + + if ((status & handle_rx) && ch->has_rx) { + needs_work = true; + } else { + status &= ~handle_rx; + } + + if ((status & handle_tx) && ch->has_tx) { + needs_work = true; + } else { + status &= ~handle_tx; + } + + if (needs_work && napi_schedule_prep(&ch->napi)) { + stmmac_disable_dma_irq(priv, priv->ioaddr, chan); + __napi_schedule(&ch->napi); + } + + return status; +} + /** * stmmac_dma_interrupt - DMA ISR * @priv: driver private structure @@ -2034,57 +2070,14 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv) u32 channels_to_check = tx_channel_count > rx_channel_count ? tx_channel_count : rx_channel_count; u32 chan; - bool poll_scheduled = false; int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) channels_to_check = ARRAY_SIZE(status); - /* Each DMA channel can be used for rx and tx simultaneously, yet - * napi_struct is embedded in struct stmmac_rx_queue rather than in a - * stmmac_channel struct. - * Because of this, stmmac_poll currently checks (and possibly wakes) - * all tx queues rather than just a single tx queue. - */ for (chan = 0; chan < channels_to_check; chan++) - status[chan] = stmmac_dma_interrupt_status(priv, priv->ioaddr, - &priv->xstats, chan); - - for (chan = 0; chan < rx_channel_count; chan++) { - if (likely(status[chan] & handle_rx)) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan]; - - if (likely(napi_schedule_prep(&rx_q->napi))) { - stmmac_disable_dma_irq(priv, priv->ioaddr, chan); - __napi_schedule(&rx_q->napi); - poll_scheduled = true; - } - } - } - - /* If we scheduled poll, we already know that tx queues will be checked. 
- * If we didn't schedule poll, see if any DMA channel (used by tx) has a - * completed transmission, if so, call stmmac_poll (once). - */ - if (!poll_scheduled) { - for (chan = 0; chan < tx_channel_count; chan++) { - if (status[chan] & handle_tx) { - /* It doesn't matter what rx queue we choose - * here. We use 0 since it always exists. - */ - struct stmmac_rx_queue *rx_q = - &priv->rx_queue[0]; - - if (likely(napi_schedule_prep(&rx_q->napi))) { - stmmac_disable_dma_irq(priv, - priv->ioaddr, chan); - __napi_schedule(&rx_q->napi); - } - break; - } - } - } + status[chan] = stmmac_napi_check(priv, chan); for (chan = 0; chan < tx_channel_count; chan++) { if (unlikely(status[chan] & tx_hard_error_bump_tc)) { @@ -2233,6 +2226,13 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) return ret; } +static void stmmac_tx_timer_arm(struct stmmac_priv *priv, u32 queue) +{ + struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; + + mod_timer(&tx_q->txtimer, STMMAC_COAL_TIMER(priv->tx_coal_timer)); +} + /** * stmmac_tx_timer - mitigation sw timer for tx. * @data: data pointer @@ -2241,13 +2241,14 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) */ static void stmmac_tx_timer(struct timer_list *t) { - struct stmmac_priv *priv = from_timer(priv, t, txtimer); - u32 tx_queues_count = priv->plat->tx_queues_to_use; - u32 queue; + struct stmmac_tx_queue *tx_q = from_timer(tx_q, t, txtimer); + struct stmmac_priv *priv = tx_q->priv_data; + struct stmmac_channel *ch; - /* let's scan all the tx queues */ - for (queue = 0; queue < tx_queues_count; queue++) - stmmac_tx_clean(priv, queue); + ch = &priv->channel[tx_q->queue_index]; + + if (likely(napi_schedule_prep(&ch->napi))) + __napi_schedule(&ch->napi); } /** @@ -2260,11 +2261,17 @@ static void stmmac_tx_timer(struct timer_list *t) */ static void stmmac_init_tx_coalesce(struct stmmac_priv *priv) { + u32 tx_channel_count = priv->plat->tx_queues_to_use; + u32 chan; + priv->tx_coal_frames = STMMAC_TX_FRAMES; priv->tx_coal_timer = STMMAC_COAL_TX_TIMER; - timer_setup(&priv->txtimer, stmmac_tx_timer, 0); - priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer); - add_timer(&priv->txtimer); + + for (chan = 0; chan < tx_channel_count; chan++) { + struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; + + timer_setup(&tx_q->txtimer, stmmac_tx_timer, 0); + } } static void stmmac_set_rings_length(struct stmmac_priv *priv) @@ -2592,6 +2599,7 @@ static void stmmac_hw_teardown(struct net_device *dev) static int stmmac_open(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); + u32 chan; int ret; stmmac_check_ether_addr(priv); @@ -2688,7 +2696,9 @@ static int stmmac_open(struct net_device *dev) if (dev->phydev) phy_stop(dev->phydev); - del_timer_sync(&priv->txtimer); + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) + del_timer_sync(&priv->tx_queue[chan].txtimer); + stmmac_hw_teardown(dev); init_error: free_dma_desc_resources(priv); @@ -2708,6 +2718,7 @@ static int stmmac_open(struct net_device *dev) static int stmmac_release(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); + u32 chan; if (priv->eee_enabled) del_timer_sync(&priv->eee_ctrl_timer); @@ -2722,7 +2733,8 @@ static int stmmac_release(struct net_device *dev) stmmac_disable_all_queues(priv); - del_timer_sync(&priv->txtimer); + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) + del_timer_sync(&priv->tx_queue[chan].txtimer); /* Free the IRQ lines */ free_irq(dev->irq, dev); @@ -2936,14 +2948,13 @@ static netdev_tx_t 
stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) priv->xstats.tx_tso_nfrags += nfrags; /* Manage tx mitigation */ - priv->tx_count_frames += nfrags + 1; - if (likely(priv->tx_coal_frames > priv->tx_count_frames)) { - mod_timer(&priv->txtimer, - STMMAC_COAL_TIMER(priv->tx_coal_timer)); - } else { - priv->tx_count_frames = 0; + tx_q->tx_count_frames += nfrags + 1; + if (priv->tx_coal_frames <= tx_q->tx_count_frames) { stmmac_set_tx_ic(priv, desc); priv->xstats.tx_set_ic_bit++; + tx_q->tx_count_frames = 0; + } else { + stmmac_tx_timer_arm(priv, queue); } skb_tx_timestamp(skb); @@ -3146,14 +3157,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) * This approach takes care about the fragments: desc is the first * element in case of no SG. */ - priv->tx_count_frames += nfrags + 1; - if (likely(priv->tx_coal_frames > priv->tx_count_frames)) { - mod_timer(&priv->txtimer, - STMMAC_COAL_TIMER(priv->tx_coal_timer)); - } else { - priv->tx_count_frames = 0; + tx_q->tx_count_frames += nfrags + 1; + if (priv->tx_coal_frames <= tx_q->tx_count_frames) { stmmac_set_tx_ic(priv, desc); priv->xstats.tx_set_ic_bit++; + tx_q->tx_count_frames = 0; + } else { + stmmac_tx_timer_arm(priv, queue); } skb_tx_timestamp(skb); @@ -3199,6 +3209,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); stmmac_enable_dma_transmission(priv, priv->ioaddr); + stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); return NETDEV_TX_OK; @@ -3319,6 +3330,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue) static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) { struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + struct stmmac_channel *ch = &priv->channel[queue]; unsigned int entry = rx_q->cur_rx; int coe = priv->hw->rx_csum; unsigned int next_entry; @@ -3491,7 +3503,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) else skb->ip_summed = CHECKSUM_UNNECESSARY; - napi_gro_receive(&rx_q->napi, skb); + napi_gro_receive(&ch->napi, skb); priv->dev->stats.rx_packets++; priv->dev->stats.rx_bytes += frame_len; @@ -3514,27 +3526,33 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) * Description : * To look at the incoming frames and clear the tx resources. 
*/ -static int stmmac_poll(struct napi_struct *napi, int budget) +static int stmmac_napi_poll(struct napi_struct *napi, int budget) { - struct stmmac_rx_queue *rx_q = - container_of(napi, struct stmmac_rx_queue, napi); - struct stmmac_priv *priv = rx_q->priv_data; - u32 tx_count = priv->plat->tx_queues_to_use; - u32 chan = rx_q->queue_index; - int work_done = 0; - u32 queue; + struct stmmac_channel *ch = + container_of(napi, struct stmmac_channel, napi); + struct stmmac_priv *priv = ch->priv_data; + int work_done = 0, work_rem = budget; + u32 chan = ch->index; priv->xstats.napi_poll++; - /* check all the queues */ - for (queue = 0; queue < tx_count; queue++) - stmmac_tx_clean(priv, queue); + if (ch->has_tx) { + int done = stmmac_tx_clean(priv, work_rem, chan); - work_done = stmmac_rx(priv, budget, rx_q->queue_index); - if (work_done < budget) { - napi_complete_done(napi, work_done); - stmmac_enable_dma_irq(priv, priv->ioaddr, chan); + work_done += done; + work_rem -= done; } + + if (ch->has_rx) { + int done = stmmac_rx(priv, work_rem, chan); + + work_done += done; + work_rem -= done; + } + + if (work_done < budget && napi_complete_done(napi, work_done)) + stmmac_enable_dma_irq(priv, priv->ioaddr, chan); + return work_done; } @@ -4198,8 +4216,8 @@ int stmmac_dvr_probe(struct device *device, { struct net_device *ndev = NULL; struct stmmac_priv *priv; + u32 queue, maxq; int ret = 0; - u32 queue; ndev = alloc_etherdev_mqs(sizeof(struct stmmac_priv), MTL_MAX_TX_QUEUES, @@ -4322,11 +4340,22 @@ int stmmac_dvr_probe(struct device *device, "Enable RX Mitigation via HW Watchdog Timer\n"); } - for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + /* Setup channels NAPI */ + maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); - netif_napi_add(ndev, &rx_q->napi, stmmac_poll, - (8 * priv->plat->rx_queues_to_use)); + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; + + ch->priv_data = priv; + ch->index = queue; + + if (queue < priv->plat->rx_queues_to_use) + ch->has_rx = true; + if (queue < priv->plat->tx_queues_to_use) + ch->has_tx = true; + + netif_napi_add(ndev, &ch->napi, stmmac_napi_poll, + NAPI_POLL_WEIGHT); } mutex_init(&priv->lock); @@ -4372,10 +4401,10 @@ int stmmac_dvr_probe(struct device *device, priv->hw->pcs != STMMAC_PCS_RTBI) stmmac_mdio_unregister(ndev); error_mdio_register: - for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; - netif_napi_del(&rx_q->napi); + netif_napi_del(&ch->napi); } error_hw_init: destroy_workqueue(priv->wq); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c43e9a01b892..7ddfc65586b0 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -30,6 +30,7 @@ #define MTL_MAX_RX_QUEUES 8 #define MTL_MAX_TX_QUEUES 8 +#define STMMAC_CH_MAX 8 #define STMMAC_RX_COE_NONE 0 #define STMMAC_RX_COE_TYPE1 1 From 0431100b3d82c509729ece1ab22ada2484e209c1 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 17 Sep 2018 09:22:57 +0100 Subject: [PATCH 086/499] net: stmmac: Fixup the tail addr setting in xmit path Currently we are always setting the tail address of descriptor list to the end of the pre-allocated list. According to databook this is not correct. 
Tail address should point to the last available descriptor + 1, which means we have to update the tail address everytime we call the xmit function. This should make no impact in older versions of MAC but in newer versions there are some DMA features which allows the IP to fetch descriptors in advance and in a non sequential order so its critical that we set the tail address correctly. Signed-off-by: Jose Abreu Fixes: f748be531d70 ("stmmac: support new GMAC4") Cc: David S. Miller Cc: Joao Pinto Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ab9cc0143ff2..75896d6ba6e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2213,8 +2213,7 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) stmmac_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, tx_q->dma_tx_phy, chan); - tx_q->tx_tail_addr = tx_q->dma_tx_phy + - (DMA_TX_SIZE * sizeof(struct dma_desc)); + tx_q->tx_tail_addr = tx_q->dma_tx_phy; stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, chan); } @@ -3003,6 +3002,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc)); stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); return NETDEV_TX_OK; @@ -3210,6 +3210,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) stmmac_enable_dma_transmission(priv, priv->ioaddr); + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc)); stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); return NETDEV_TX_OK; From 0a286afee5a1e8dca86d824209dbd3200294f86f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 17 Sep 2018 15:30:06 +0200 Subject: [PATCH 087/499] selftests: pmtu: properly redirect stderr to /dev/null The cleanup function uses "$CMD 2 > /dev/null", which doesn't actually send stderr to /dev/null, so when the netns doesn't exist, the error message is shown. Use "2> /dev/null" instead, so that those messages disappear, as was intended. Fixes: d1f1b9cbf34c ("selftests: net: Introduce first PMTU test") Signed-off-by: Sabrina Dubroca Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 32a194e3e07a..0ab9423d009f 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -178,8 +178,8 @@ setup() { cleanup() { [ ${cleanup_done} -eq 1 ] && return - ip netns del ${NS_A} 2 > /dev/null - ip netns del ${NS_B} 2 > /dev/null + ip netns del ${NS_A} 2> /dev/null + ip netns del ${NS_B} 2> /dev/null cleanup_done=1 } From 674d9de02aa7d521ebdf66c3958758bdd9c64e11 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 17 Sep 2018 15:51:40 +0200 Subject: [PATCH 088/499] NFC: Fix possible memory corruption when handling SHDLC I-Frame commands When handling SHDLC I-Frame commands "pipe" field used for indexing into an array should be checked before usage. 
If left unchecked it might access memory outside of the array of size NFC_HCI_MAX_PIPES(127). Malformed NFC HCI frames could be injected by a malicious NFC device communicating with the device being attacked (remote attack vector), or even by an attacker with physical access to the I2C bus such that they could influence the data transfers on that bus (local attack vector). skb->data is controlled by the attacker and has only been sanitized in the most trivial ways (CRC check), therefore we can consider the create_info struct and all of its members to tainted. 'create_info->pipe' with max value of 255 (uint8) is used to take an offset of the hdev->pipes array of 127 elements which can lead to OOB write. Cc: Samuel Ortiz Cc: Allen Pais Cc: "David S. Miller" Suggested-by: Kevin Deus Signed-off-by: Suren Baghdasaryan Acked-by: Kees Cook Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/nfc/hci/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index ac8030c4bcf8..19cb2e473ea6 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; From e285d5bfb7e9785d289663baef252dd315e171f8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 17 Sep 2018 15:51:41 +0200 Subject: [PATCH 089/499] NFC: Fix the number of pipes According to ETSI TS 102 622 specification chapter 4.4 pipe identifier is 7 bits long which allows for 128 unique pipe IDs. Because NFC_HCI_MAX_PIPES is used as the number of pipes supported and not as the max pipe ID, its value should be 128 instead of 127. nfc_hci_recv_from_llc extracts pipe ID from packet header using NFC_HCI_FRAGMENT(0x7F) mask which allows for pipe ID value of 127. Same happens when NCI_HCP_MSG_GET_PIPE() is being used. With pipes array having only 127 elements and pipe ID of 127 the OOB memory access will result. Cc: Samuel Ortiz Cc: Allen Pais Cc: "David S. Miller" Suggested-by: Dan Carpenter Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- include/net/nfc/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h index 316694dafa5b..008f466d1da7 100644 --- a/include/net/nfc/hci.h +++ b/include/net/nfc/hci.h @@ -87,7 +87,7 @@ struct nfc_hci_pipe { * According to specification 102 622 chapter 4.4 Pipes, * the pipe identifier is 7 bits long. 
*/ -#define NFC_HCI_MAX_PIPES 127 +#define NFC_HCI_MAX_PIPES 128 struct nfc_hci_init_data { u8 gate_count; struct nfc_hci_gate gates[NFC_HCI_MAX_CUSTOM_GATES]; From 08e39982ef64f800fd1f9b9b92968d14d5fafa82 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Mon, 17 Sep 2018 17:22:40 +0200 Subject: [PATCH 090/499] net: emac: fix fixed-link setup for the RTL8363SB switch On the Netgear WNDAP620, the emac ethernet isn't receiving nor xmitting any frames from/to the RTL8363SB (identifies itself as a RTL8367RB). This is caused by the emac hardware not knowing the forced link parameters for speed, duplex, pause, etc. This begs the question, how this was working on the original driver code, when it was necessary to set the phy_address and phy_map to 0xffffffff. But I guess without access to the old PPC405/440/460 hardware, it's not possible to know. Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 372664686309..129f4e9f38da 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2677,12 +2677,17 @@ static int emac_init_phy(struct emac_instance *dev) if (of_phy_is_fixed_link(np)) { int res = emac_dt_mdio_probe(dev); - if (!res) { - res = of_phy_register_fixed_link(np); - if (res) - mdiobus_unregister(dev->mii_bus); + if (res) + return res; + + res = of_phy_register_fixed_link(np); + dev->phy_dev = of_phy_find_device(np); + if (res || !dev->phy_dev) { + mdiobus_unregister(dev->mii_bus); + return res ? res : -EINVAL; } - return res; + emac_adjust_link(dev->ndev); + put_device(&dev->phy_dev->mdio.dev); } return 0; } From 65fac4fe9080714df80d430888834ce87c6716ba Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 18 Sep 2018 15:15:44 +0800 Subject: [PATCH 091/499] net: bnxt: Fix a uninitialized variable warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following compile warning: drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c:49:5: warning: ‘nvm_param.dir_type’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (nvm_param.dir_type == BNXT_NVM_PORT_CFG) Signed-off-by: zhong jiang Acked-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index f3b9fbcc705b..790c684f08ab 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -46,6 +46,9 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, } } + if (i == ARRAY_SIZE(nvm_params)) + return -EOPNOTSUPP; + if (nvm_param.dir_type == BNXT_NVM_PORT_CFG) idx = bp->pf.port_id; else if (nvm_param.dir_type == BNXT_NVM_FUNC_CFG) From 2fe397a3959de8a472f165e6d152f64cb77fa2cc Mon Sep 17 00:00:00 2001 From: Kazuya Mizuguchi Date: Tue, 18 Sep 2018 12:22:26 +0200 Subject: [PATCH 092/499] ravb: do not write 1 to reserved bits EtherAVB hardware requires 0 to be written to status register bits in order to clear them, however, care must be taken not to: 1. Clear other bits, by writing zero to them 2. Write one to reserved bits This patch corrects the ravb driver with respect to the second point above. 
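The resulting pattern at each write site is shown below with the intent spelled out; the *_RESERVED masks are the ones the patch introduces, and the line itself is one of the hunks that follow:

    /* Status bits are acknowledged by writing 0; writing 1 leaves a bit
     * untouched. So: write 0 to the bit being cleared, write 0 to all
     * reserved bits, and write 1 everywhere else. */
    ravb_write(ndev, ~(TIS_TFUF | TIS_RESERVED), TIS);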
This is done by defining reserved bit masks for the affected registers and, after auditing the code, ensure all sites that may write a one to a reserved bit use are suitably masked. Signed-off-by: Kazuya Mizuguchi Signed-off-by: Simon Horman Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb.h | 5 +++++ drivers/net/ethernet/renesas/ravb_main.c | 11 ++++++----- drivers/net/ethernet/renesas/ravb_ptp.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h index 1470fc12282b..9b6bf557a2f5 100644 --- a/drivers/net/ethernet/renesas/ravb.h +++ b/drivers/net/ethernet/renesas/ravb.h @@ -428,6 +428,7 @@ enum EIS_BIT { EIS_CULF1 = 0x00000080, EIS_TFFF = 0x00000100, EIS_QFS = 0x00010000, + EIS_RESERVED = (GENMASK(31, 17) | GENMASK(15, 11)), }; /* RIC0 */ @@ -472,6 +473,7 @@ enum RIS0_BIT { RIS0_FRF15 = 0x00008000, RIS0_FRF16 = 0x00010000, RIS0_FRF17 = 0x00020000, + RIS0_RESERVED = GENMASK(31, 18), }; /* RIC1 */ @@ -528,6 +530,7 @@ enum RIS2_BIT { RIS2_QFF16 = 0x00010000, RIS2_QFF17 = 0x00020000, RIS2_RFFF = 0x80000000, + RIS2_RESERVED = GENMASK(30, 18), }; /* TIC */ @@ -544,6 +547,7 @@ enum TIS_BIT { TIS_FTF1 = 0x00000002, /* Undocumented? */ TIS_TFUF = 0x00000100, TIS_TFWF = 0x00000200, + TIS_RESERVED = (GENMASK(31, 20) | GENMASK(15, 12) | GENMASK(7, 4)) }; /* ISS */ @@ -617,6 +621,7 @@ enum GIC_BIT { enum GIS_BIT { GIS_PTCF = 0x00000001, /* Undocumented? */ GIS_PTMF = 0x00000004, + GIS_RESERVED = GENMASK(15, 10), }; /* GIE (R-Car Gen3 only) */ diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index aff5516b781e..d6f753925352 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -739,10 +739,11 @@ static void ravb_error_interrupt(struct net_device *ndev) u32 eis, ris2; eis = ravb_read(ndev, EIS); - ravb_write(ndev, ~EIS_QFS, EIS); + ravb_write(ndev, ~(EIS_QFS | EIS_RESERVED), EIS); if (eis & EIS_QFS) { ris2 = ravb_read(ndev, RIS2); - ravb_write(ndev, ~(RIS2_QFF0 | RIS2_RFFF), RIS2); + ravb_write(ndev, ~(RIS2_QFF0 | RIS2_RFFF | RIS2_RESERVED), + RIS2); /* Receive Descriptor Empty int */ if (ris2 & RIS2_QFF0) @@ -795,7 +796,7 @@ static bool ravb_timestamp_interrupt(struct net_device *ndev) u32 tis = ravb_read(ndev, TIS); if (tis & TIS_TFUF) { - ravb_write(ndev, ~TIS_TFUF, TIS); + ravb_write(ndev, ~(TIS_TFUF | TIS_RESERVED), TIS); ravb_get_tx_tstamp(ndev); return true; } @@ -930,7 +931,7 @@ static int ravb_poll(struct napi_struct *napi, int budget) /* Processing RX Descriptor Ring */ if (ris0 & mask) { /* Clear RX interrupt */ - ravb_write(ndev, ~mask, RIS0); + ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); if (ravb_rx(ndev, "a, q)) goto out; } @@ -938,7 +939,7 @@ static int ravb_poll(struct napi_struct *napi, int budget) if (tis & mask) { spin_lock_irqsave(&priv->lock, flags); /* Clear TX interrupt */ - ravb_write(ndev, ~mask, TIS); + ravb_write(ndev, ~(mask | TIS_RESERVED), TIS); ravb_tx_free(ndev, q, true); netif_wake_subqueue(ndev, q); mmiowb(); diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index 0721b5c35d91..dce2a40a31e3 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -315,7 +315,7 @@ void ravb_ptp_interrupt(struct net_device *ndev) } } - ravb_write(ndev, ~gis, GIS); + ravb_write(ndev, ~(gis | GIS_RESERVED), GIS); } void ravb_ptp_init(struct net_device *ndev, 
struct platform_device *pdev) From 648a5a7aed346c3b8fe7c32a835edfb0dfbf4451 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 18 Sep 2018 15:46:34 +0200 Subject: [PATCH 093/499] net/smc: fix non-blocking connect problem In state SMC_INIT smc_poll() delegates polling to the internal CLC socket. This means, once the connect worker has finished its kernel_connect() step, the poll wake-up may occur. This is not intended. The wake-up should occur from the wake up call in smc_connect_work() after __smc_connect() has finished. Thus in state SMC_INIT this patch now calls sock_poll_wait() on the main SMC socket. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2d8a1e15e4f9..9c3976bcde46 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -742,7 +742,10 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - smc->sk.sk_state_change(&smc->sk); + if (smc->sk.sk_err) + smc->sk.sk_state_change(&smc->sk); + else + smc->sk.sk_write_space(&smc->sk); kfree(smc->connect_info); smc->connect_info = NULL; release_sock(&smc->sk); @@ -1529,7 +1532,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, return EPOLLNVAL; smc = smc_sk(sock->sk); - if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { + if (smc->use_fallback) { /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; From 1ca52fcfaca43665d525645348801a6f4a4b9e9a Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 18 Sep 2018 15:46:35 +0200 Subject: [PATCH 094/499] net/smc: remove duplicate mutex_unlock For a failing smc_listen_rdma_finish() smc_listen_decline() is called. If fallback is possible, the new socket is already enqueued to be accepted in smc_listen_decline(). Avoid enqueuing a second time afterwards in this case, otherwise the smc_create_lgr_pending lock is released twice: [ 373.463976] WARNING: bad unlock balance detected! [ 373.463978] 4.18.0-rc7+ #123 Tainted: G O [ 373.463979] ------------------------------------- [ 373.463980] kworker/1:1/30 is trying to release lock (smc_create_lgr_pending) at: [ 373.463990] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.463991] but there are no more locks to release! 
[ 373.463991] other info that might help us debug this: [ 373.463993] 2 locks held by kworker/1:1/30: [ 373.463994] #0: 00000000772cbaed ((wq_completion)"events"){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464000] #1: 000000003ad0894a ((work_completion)(&new_smc->smc_listen_work)){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464003] stack backtrace: [ 373.464005] CPU: 1 PID: 30 Comm: kworker/1:1 Kdump: loaded Tainted: G O 4.18.0-rc7uschi+ #123 [ 373.464007] Hardware name: IBM 2827 H43 738 (LPAR) [ 373.464010] Workqueue: events smc_listen_work [smc] [ 373.464011] Call Trace: [ 373.464015] ([<0000000000114100>] show_stack+0x60/0xd8) [ 373.464019] [<0000000000a8c9bc>] dump_stack+0x9c/0xd8 [ 373.464021] [<00000000001dcaf8>] print_unlock_imbalance_bug+0xf8/0x108 [ 373.464022] [<00000000001e045c>] lock_release+0x114/0x4f8 [ 373.464025] [<0000000000aa87fa>] __mutex_unlock_slowpath+0x4a/0x300 [ 373.464027] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.464029] [<0000000000197a68>] process_one_work+0x2a8/0x6b0 [ 373.464030] [<0000000000197ec2>] worker_thread+0x52/0x410 [ 373.464033] [<000000000019fd0e>] kthread+0x15e/0x178 [ 373.464035] [<0000000000aaf58a>] kernel_thread_starter+0x6/0xc [ 373.464052] [<0000000000aaf584>] kernel_thread_starter+0x0/0xc [ 373.464054] INFO: lockdep is turned off. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9c3976bcde46..5c6d30eb4a71 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1153,9 +1153,9 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) } /* listen worker: finish RDMA setup */ -static void smc_listen_rdma_finish(struct smc_sock *new_smc, - struct smc_clc_msg_accept_confirm *cclc, - int local_contact) +static int smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + int local_contact) { struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; int reason_code = 0; @@ -1178,11 +1178,12 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc, if (reason_code) goto decline; } - return; + return 0; decline: mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); + return reason_code; } /* setup for RDMA connection of server */ @@ -1279,8 +1280,10 @@ static void smc_listen_work(struct work_struct *work) } /* finish worker */ - if (!ism_supported) - smc_listen_rdma_finish(new_smc, &cclc, local_contact); + if (!ism_supported) { + if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) + return; + } smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); smc_listen_out_connected(new_smc); From dd65d87a6abd537ae9bb3bdcee8905704b936884 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 18 Sep 2018 15:46:36 +0200 Subject: [PATCH 095/499] net/smc: enable fallback for connection abort in state INIT If a linkgroup is terminated abnormally already due to failing LLC CONFIRM LINK or LLC ADD LINK, fallback to TCP is still possible. In this case do not switch to state SMC_PEERABORTWAIT and do not set sk_err. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_close.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ac961dfb1ea1..ea2b87f29469 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -100,15 +100,14 @@ static void smc_close_active_abort(struct smc_sock *smc) struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { + sk->sk_err = ECONNABORTED; + if (smc->clcsock && smc->clcsock->sk) { + smc->clcsock->sk->sk_err = ECONNABORTED; + smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + } } switch (sk->sk_state) { - case SMC_INIT: - sk->sk_state = SMC_PEERABORTWAIT; - break; case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; release_sock(sk); @@ -143,6 +142,7 @@ static void smc_close_active_abort(struct smc_sock *smc) case SMC_PEERFINCLOSEWAIT: sock_put(sk); /* passive closing */ break; + case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: break; From 71d117f527425e2d6a5029e8365d82a8d2d6916a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 18 Sep 2018 15:46:37 +0200 Subject: [PATCH 096/499] net/smc: no urgent data check for listen sockets Don't check a listen socket for pending urgent data in smc_poll(). Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5c6d30eb4a71..015231789ed2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1566,9 +1566,9 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; + if (smc->conn.urg_state == SMC_URG_VALID) + mask |= EPOLLPRI; } - if (smc->conn.urg_state == SMC_URG_VALID) - mask |= EPOLLPRI; } return mask; From 381897798a94065ffcad0772eecdc6b04a7ff23d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 18 Sep 2018 15:46:38 +0200 Subject: [PATCH 097/499] net/smc: fix sizeof to int comparison Comparing an int to a size, which is unsigned, causes the int to become unsigned, giving the wrong result. kernel_sendmsg can return a negative error code. Signed-off-by: YueHaibing Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/smc_clc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 83aba9ade060..52241d679cc9 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -446,14 +446,12 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, vec[i++].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); - if (len < sizeof(pclc)) { - if (len >= 0) { - reason_code = -ENETUNREACH; - smc->sk.sk_err = -reason_code; - } else { - smc->sk.sk_err = smc->clcsock->sk->sk_err; - reason_code = -smc->sk.sk_err; - } + if (len < 0) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } else if (len < (int)sizeof(pclc)) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; } return reason_code; From 774268f3e51b53ed432a1ec516574fd5ba469398 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 18 Sep 2018 16:58:47 +0200 Subject: [PATCH 098/499] net: mvpp2: fix a txq_done race condition When no Tx IRQ is available, the txq_done() routine (called from tx_done()) shouldn't be called from the polling function, as in such case it is already called in the Tx path thanks to an hrtimer. This mostly occurred when using PPv2.1, as the engine then do not have Tx IRQs. Fixes: edc660fa09e2 ("net: mvpp2: replace TX coalescing interrupts with hrtimer") Reported-by: Stefan Chulski Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 702fec82d806..38cc01beea79 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3055,10 +3055,12 @@ static int mvpp2_poll(struct napi_struct *napi, int budget) cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK); } - cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK; - if (cause_tx) { - cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET; - mvpp2_tx_done(port, cause_tx, qv->sw_thread_id); + if (port->has_tx_irqs) { + cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK; + if (cause_tx) { + cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET; + mvpp2_tx_done(port, cause_tx, qv->sw_thread_id); + } } /* Process RX packets */ From 126d6848ef13958e1cb959e96c21d19bc498ade9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 18 Sep 2018 16:48:53 +0100 Subject: [PATCH 099/499] sfp: fix oops with ethtool -m If a network interface is created prior to the SFP socket being available, ethtool can request module information. This unfortunately leads to an oops: Unable to handle kernel NULL pointer dereference at virtual address 00000008 pgd = (ptrval) [00000008] *pgd=7c400831, *pte=00000000, *ppte=00000000 Internal error: Oops: 17 [#1] SMP ARM Modules linked in: CPU: 0 PID: 1480 Comm: ethtool Not tainted 4.19.0-rc3 #138 Hardware name: Broadcom Northstar Plus SoC PC is at sfp_get_module_info+0x8/0x10 LR is at dev_ethtool+0x218c/0x2afc Fix this by not filling in the network device's SFP bus pointer until SFP is fully bound, thereby avoiding the core calling into the SFP bus code. Fixes: ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages") Reported-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- drivers/net/phy/sfp-bus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 740655261e5b..83060fb349f4 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -349,6 +349,7 @@ static int sfp_register_bus(struct sfp_bus *bus) } if (bus->started) bus->socket_ops->start(bus->sfp); + bus->netdev->sfp_bus = bus; bus->registered = true; return 0; } @@ -357,6 +358,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus) { const struct sfp_upstream_ops *ops = bus->upstream_ops; + bus->netdev->sfp_bus = NULL; if (bus->registered) { if (bus->started) bus->socket_ops->stop(bus->sfp); @@ -438,7 +440,6 @@ static void sfp_upstream_clear(struct sfp_bus *bus) { bus->upstream_ops = NULL; bus->upstream = NULL; - bus->netdev->sfp_bus = NULL; bus->netdev = NULL; } @@ -467,7 +468,6 @@ struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode, bus->upstream_ops = ops; bus->upstream = upstream; bus->netdev = ndev; - ndev->sfp_bus = bus; if (bus->sfp) { ret = sfp_register_bus(bus); From 8675860592321a43bbb0b5aedc244a9b2321edc3 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 18 Sep 2018 13:44:59 -0700 Subject: [PATCH 100/499] Revert "ipv6: fix double refcount of fib6_metrics" This reverts commit e70a3aad44cc8b24986687ffc98c4a4f6ecf25ea. This change causes use-after-free on dst->_metrics. The crash trace looks like this: [ 97.763269] BUG: KASAN: use-after-free in ip6_mtu+0x116/0x140 [ 97.769038] Read of size 4 at addr ffff881781d2cf84 by task svw_NetThreadEv/8801 [ 97.777954] CPU: 76 PID: 8801 Comm: svw_NetThreadEv Not tainted 4.15.0-smp-DEV #11 [ 97.777956] Hardware name: Default string Default string/Indus_QC_02, BIOS 5.46.4 03/29/2018 [ 97.777957] Call Trace: [ 97.777971] [] dump_stack+0x4d/0x72 [ 97.777985] [] print_address_description+0x6f/0x260 [ 97.777997] [] kasan_report+0x257/0x370 [ 97.778001] [] ? ip6_mtu+0x116/0x140 [ 97.778004] [] __asan_report_load4_noabort+0x19/0x20 [ 97.778008] [] ip6_mtu+0x116/0x140 [ 97.778013] [] tcp_current_mss+0x12e/0x280 [ 97.778016] [] ? tcp_mtu_to_mss+0x2d0/0x2d0 [ 97.778022] [] ? depot_save_stack+0x138/0x4a0 [ 97.778037] [] ? __mmdrop+0x145/0x1f0 [ 97.778040] [] ? save_stack+0xb1/0xd0 [ 97.778046] [] tcp_send_mss+0x22/0x220 [ 97.778059] [] tcp_sendmsg_locked+0x4f9/0x39f0 [ 97.778062] [] ? kasan_check_write+0x14/0x20 [ 97.778066] [] ? tcp_sendpage+0x60/0x60 [ 97.778070] [] ? rw_copy_check_uvector+0x69/0x280 [ 97.778075] [] ? import_iovec+0x9f/0x430 [ 97.778078] [] ? kasan_slab_free+0x87/0xc0 [ 97.778082] [] ? memzero_page+0x140/0x140 [ 97.778085] [] ? kasan_check_write+0x14/0x20 [ 97.778088] [] tcp_sendmsg+0x2c/0x50 [ 97.778092] [] ? tcp_sendmsg+0x2c/0x50 [ 97.778098] [] inet_sendmsg+0x103/0x480 [ 97.778102] [] ? inet_gso_segment+0x15b0/0x15b0 [ 97.778105] [] sock_sendmsg+0xba/0xf0 [ 97.778108] [] ___sys_sendmsg+0x6ca/0x8e0 [ 97.778113] [] ? hrtimer_try_to_cancel+0x71/0x3b0 [ 97.778116] [] ? copy_msghdr_from_user+0x3d0/0x3d0 [ 97.778119] [] ? memset+0x31/0x40 [ 97.778123] [] ? schedule_hrtimeout_range_clock+0x165/0x380 [ 97.778127] [] ? hrtimer_nanosleep_restart+0x250/0x250 [ 97.778130] [] ? __hrtimer_init+0x180/0x180 [ 97.778133] [] ? ktime_get_ts64+0x172/0x200 [ 97.778137] [] ? __fget_light+0x8c/0x2f0 [ 97.778141] [] __sys_sendmsg+0xe6/0x190 [ 97.778144] [] ? __sys_sendmsg+0xe6/0x190 [ 97.778147] [] ? SyS_shutdown+0x20/0x20 [ 97.778152] [] ? wake_up_q+0xe0/0xe0 [ 97.778155] [] ? 
__sys_sendmsg+0x190/0x190 [ 97.778158] [] SyS_sendmsg+0x13/0x20 [ 97.778162] [] do_syscall_64+0x2ac/0x430 [ 97.778166] [] ? do_page_fault+0x35/0x3d0 [ 97.778171] [] ? page_fault+0x2f/0x50 [ 97.778174] [] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 97.778177] RIP: 0033:0x7f83fa36000d [ 97.778178] RSP: 002b:00007f83ef9229e0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e [ 97.778180] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f83fa36000d [ 97.778182] RDX: 0000000000004000 RSI: 00007f83ef922f00 RDI: 0000000000000036 [ 97.778183] RBP: 00007f83ef923040 R08: 00007f83ef9231f8 R09: 00007f83ef923168 [ 97.778184] R10: 0000000000000000 R11: 0000000000000293 R12: 00007f83f69c5b40 [ 97.778185] R13: 000000000000001c R14: 0000000000000001 R15: 0000000000004000 [ 97.779684] Allocated by task 5919: [ 97.783185] save_stack+0x46/0xd0 [ 97.783187] kasan_kmalloc+0xad/0xe0 [ 97.783189] kmem_cache_alloc_trace+0xdf/0x580 [ 97.783190] ip6_convert_metrics.isra.79+0x7e/0x190 [ 97.783192] ip6_route_info_create+0x60a/0x2480 [ 97.783193] ip6_route_add+0x1d/0x80 [ 97.783195] inet6_rtm_newroute+0xdd/0xf0 [ 97.783198] rtnetlink_rcv_msg+0x641/0xb10 [ 97.783200] netlink_rcv_skb+0x27b/0x3e0 [ 97.783202] rtnetlink_rcv+0x15/0x20 [ 97.783203] netlink_unicast+0x4be/0x720 [ 97.783204] netlink_sendmsg+0x7bc/0xbf0 [ 97.783205] sock_sendmsg+0xba/0xf0 [ 97.783207] ___sys_sendmsg+0x6ca/0x8e0 [ 97.783208] __sys_sendmsg+0xe6/0x190 [ 97.783209] SyS_sendmsg+0x13/0x20 [ 97.783211] do_syscall_64+0x2ac/0x430 [ 97.783213] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 97.784709] Freed by task 0: [ 97.785056] knetbase: Error: /proc/sys/net/core/txcs_enable does not exist [ 97.794497] save_stack+0x46/0xd0 [ 97.794499] kasan_slab_free+0x71/0xc0 [ 97.794500] kfree+0x7c/0xf0 [ 97.794501] fib6_info_destroy_rcu+0x24f/0x310 [ 97.794504] rcu_process_callbacks+0x38b/0x1730 [ 97.794506] __do_softirq+0x1c8/0x5d0 Reported-by: John Sperbeck Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 480a79f47c52..b5d3e6b294ab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -976,6 +976,10 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } } /* Caller must already hold reference to @ort */ From ce7ea4af0838ffd4667ecad4eb5eec7a25342f1e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 18 Sep 2018 13:45:00 -0700 Subject: [PATCH 101/499] ipv6: fix memory leak on dst->_metrics When dst->_metrics and f6i->fib6_metrics share the same memory, both take reference count on the dst_metrics structure. However, when dst is destroyed, ip6_dst_destroy() only invokes dst_destroy_metrics_generic() which does not take care of READONLY metrics and does not release refcnt. This causes memory leak. Similar to ipv4 logic, the fix is to properly release refcnt and free the memory space pointed by dst->_metrics if refcnt becomes 0. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: Sabrina Dubroca Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b5d3e6b294ab..826b14de7dbb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -364,11 +364,14 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rt6_info *rt = (struct rt6_info *)dst; struct fib6_info *from; struct inet6_dev *idev; - dst_destroy_metrics_generic(dst); + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) + kfree(p); + rt6_uncached_list_del(rt); idev = rt->rt6i_idev; From 73aeb2cbcdc9be391b3d32a55319a59ce425426f Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Tue, 11 Sep 2018 04:27:41 +0100 Subject: [PATCH 102/499] ARM: 8787/1: wire up io_pgetevents syscall Wire up the new io_pgetevents syscall for ARM. Signed-off-by: Stefan Agner Acked-by: Christoph Hellwig Signed-off-by: Russell King --- arch/arm/tools/syscall.tbl | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index fbc74b5fa3ed..8edf93b4490f 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -413,3 +413,4 @@ 396 common pkey_free sys_pkey_free 397 common statx sys_statx 398 common rseq sys_rseq +399 common io_pgetevents sys_io_pgetevents From 3a58ac65e2d7969bcdf1b6acb70fa4d12a88e53e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 13 Sep 2018 16:48:08 +0100 Subject: [PATCH 103/499] ARM: 8799/1: mm: fix pci_ioremap_io() offset check IO_SPACE_LIMIT is the ending address of the PCI IO space, i.e something like 0xfffff (and not 0x100000). Therefore, when offset = 0xf0000 is passed as argument, this function fails even though the offset + SZ_64K fits below the IO_SPACE_LIMIT. This makes the last chunk of 64 KB of the I/O space not usable as it cannot be mapped. This patch fixes that by substracing 1 to offset + SZ_64K, so that we compare the addrss of the last byte of the I/O space against IO_SPACE_LIMIT instead of the address of the first byte of what is after the I/O space. Fixes: c2794437091a4 ("ARM: Add fixed PCI i/o mapping") Signed-off-by: Thomas Petazzoni Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index fc91205ff46c..5bf9443cfbaa 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -473,7 +473,7 @@ void pci_ioremap_set_mem_type(int mem_type) int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr) { - BUG_ON(offset + SZ_64K > IO_SPACE_LIMIT); + BUG_ON(offset + SZ_64K - 1 > IO_SPACE_LIMIT); return ioremap_page_range(PCI_IO_VIRT_BASE + offset, PCI_IO_VIRT_BASE + offset + SZ_64K, From 9e796c9db93b4840d1b00e550eea26db7cb741e2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 14 Sep 2018 08:51:14 -0600 Subject: [PATCH 104/499] ext2, dax: set ext2_dax_aops for dax files Sync syscall to DAX file needs to flush processor cache, but it currently does not flush to existing DAX files. This is because 'ext2_da_aops' is set to address_space_operations of existing DAX files, instead of 'ext2_dax_aops', since S_DAX flag is set after ext2_set_aops() in the open path. Similar to ext4, change ext2_iget() to initialize i_flags before ext2_set_aops(). 
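A condensed view of the intended ordering in ext2_iget(); only the relevant calls are shown, and ext2_set_aops() is actually invoked later in the same function, listed here just to make the ordering explicit:

    ei->i_flags = le32_to_cpu(raw_inode->i_flags);
    ext2_set_inode_flags(inode);   /* may set S_DAX; now runs before aops setup */
    ...
    ext2_set_aops(inode);          /* sees S_DAX and picks ext2_dax_aops */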
Fixes: fb094c90748f ("ext2, dax: introduce ext2_dax_aops") Signed-off-by: Toshi Kani Suggested-by: Jan Kara Cc: Jan Kara Cc: Dan Williams Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Signed-off-by: Jan Kara --- fs/ext2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7f7ee18fe179..e4bb9386c045 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1448,6 +1448,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); + ext2_set_inode_flags(inode); ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); ei->i_frag_no = raw_inode->i_frag; ei->i_frag_size = raw_inode->i_fsize; @@ -1517,7 +1518,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } brelse (bh); - ext2_set_inode_flags(inode); unlock_new_inode(inode); return inode; From ee92efe41cf358f4b99e73509f2bfd4733609f26 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 17 Sep 2018 18:10:05 -0700 Subject: [PATCH 105/499] IB/srp: Avoid that sg_reset -d ${srp_device} triggers an infinite loop Use different loop variables for the inner and outer loop. This avoids that an infinite loop occurs if there are more RDMA channels than target->req_ring_size. Fixes: d92c0da71a35 ("IB/srp: Add multichannel support") Cc: Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 444d16520506..0b34e909505f 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2951,7 +2951,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_rdma_ch *ch; - int i; + int i, j; u8 status; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); @@ -2965,8 +2965,8 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; - for (i = 0; i < target->req_ring_size; ++i) { - struct srp_request *req = &ch->req_ring[i]; + for (j = 0; j < target->req_ring_size; ++j) { + struct srp_request *req = &ch->req_ring[j]; srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); } From 080220b687147fd9376878534aba7194f17f6ef5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 18 Sep 2018 10:13:59 -0700 Subject: [PATCH 106/499] tools: bpf: fix license for a compat header file libc_compat.h is used by libbpf so make sure it's licensed under LGPL or BSD license. The license change should be OK, I'm the only author of the file. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/include/tools/libc_compat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/tools/libc_compat.h b/tools/include/tools/libc_compat.h index 664ced8cb1b0..e907ba6f15e5 100644 --- a/tools/include/tools/libc_compat.h +++ b/tools/include/tools/libc_compat.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: (LGPL-2.0+ OR BSD-2-Clause) /* Copyright (C) 2018 Netronome Systems, Inc. 
*/ #ifndef __TOOLS_LIBC_COMPAT_H From 13b91ed77a65cf3bf23353880245d0cbd9138294 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sun, 16 Sep 2018 12:34:09 +0800 Subject: [PATCH 107/499] ARM: dts: sun8i: drop A64 HDMI PHY fallback compatible from R40 DT The R40 HDMI PHY seems to be different to the A64 one, the A64 one has no input mux, but the R40 one has. Drop the A64 fallback compatible from the HDMI PHY node in R40 DT. Signed-off-by: Icenowy Zheng Signed-off-by: Maxime Ripard [wens@csie.org: Fix subject prefix order] Signed-off-by: Chen-Yu Tsai --- arch/arm/boot/dts/sun8i-r40.dtsi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/sun8i-r40.dtsi b/arch/arm/boot/dts/sun8i-r40.dtsi index ffd9f00f74a4..5f547c161baf 100644 --- a/arch/arm/boot/dts/sun8i-r40.dtsi +++ b/arch/arm/boot/dts/sun8i-r40.dtsi @@ -800,8 +800,7 @@ hdmi_out: port@1 { }; hdmi_phy: hdmi-phy@1ef0000 { - compatible = "allwinner,sun8i-r40-hdmi-phy", - "allwinner,sun50i-a64-hdmi-phy"; + compatible = "allwinner,sun8i-r40-hdmi-phy"; reg = <0x01ef0000 0x10000>; clocks = <&ccu CLK_BUS_HDMI1>, <&ccu CLK_HDMI_SLOW>, <&ccu 7>, <&ccu 16>; From 76c0ddd8c3a683f6e2c6e60e11dc1a1558caf4bc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 19 Sep 2018 15:02:07 +0200 Subject: [PATCH 108/499] ip6_tunnel: be careful when accessing the inner header the ip6 tunnel xmit ndo assumes that the processed skb always contains an ip[v6] header, but syzbot has found a way to send frames that fall short of this assumption, leading to the following splat: BUG: KMSAN: uninit-value in ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1307 [inline] BUG: KMSAN: uninit-value in ip6_tnl_start_xmit+0x7d2/0x1ef0 net/ipv6/ip6_tunnel.c:1390 CPU: 0 PID: 4504 Comm: syz-executor558 Not tainted 4.16.0+ #87 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683 ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1307 [inline] ip6_tnl_start_xmit+0x7d2/0x1ef0 net/ipv6/ip6_tunnel.c:1390 __netdev_start_xmit include/linux/netdevice.h:4066 [inline] netdev_start_xmit include/linux/netdevice.h:4075 [inline] xmit_one net/core/dev.c:3026 [inline] dev_hard_start_xmit+0x5f1/0xc70 net/core/dev.c:3042 __dev_queue_xmit+0x27ee/0x3520 net/core/dev.c:3557 dev_queue_xmit+0x4b/0x60 net/core/dev.c:3590 packet_snd net/packet/af_packet.c:2944 [inline] packet_sendmsg+0x7c70/0x8a30 net/packet/af_packet.c:2969 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmmsg+0x42d/0x800 net/socket.c:2136 SYSC_sendmmsg+0xc4/0x110 net/socket.c:2167 SyS_sendmmsg+0x63/0x90 net/socket.c:2162 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x441819 RSP: 002b:00007ffe58ee8268 EFLAGS: 00000213 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441819 RDX: 0000000000000002 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00000000006cd018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000213 R12: 0000000000402510 R13: 00000000004025a0 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 
kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321 slab_post_alloc_hook mm/slab.h:445 [inline] slab_alloc_node mm/slub.c:2737 [inline] __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:984 [inline] alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234 sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085 packet_alloc_skb net/packet/af_packet.c:2803 [inline] packet_snd net/packet/af_packet.c:2894 [inline] packet_sendmsg+0x6454/0x8a30 net/packet/af_packet.c:2969 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmmsg+0x42d/0x800 net/socket.c:2136 SYSC_sendmmsg+0xc4/0x110 net/socket.c:2167 SyS_sendmmsg+0x63/0x90 net/socket.c:2162 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 This change addresses the issue adding the needed check before accessing the inner header. The ipv4 side of the issue is apparently there since the ipv4 over ipv6 initial support, and the ipv6 side predates git history. Fixes: c4d3efafcc93 ("[IPV6] IP6TUNNEL: Add support to IPv4 over IPv6 tunnel.") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+3fde91d4d394747d6db4@syzkaller.appspotmail.com Tested-by: Alexander Potapenko Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 419960b0ba16..a0b6932c3afd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1234,7 +1234,7 @@ static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; int encap_limit = -1; struct flowi6 fl6; __u8 dsfield; @@ -1242,6 +1242,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + /* ensure we can access the full inner ip header */ + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return -1; + + iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); tproto = READ_ONCE(t->parms.proto); @@ -1306,7 +1311,7 @@ static inline int ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ipv6hdr *ipv6h; int encap_limit = -1; __u16 offset; struct flowi6 fl6; @@ -1315,6 +1320,10 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + return -1; + + ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || ip6_tnl_addr_conflict(t, ipv6h)) From cf5cca6e4cc4dea604d8c83e7676f10459d6809c Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 19 Sep 2018 15:29:06 +0200 Subject: [PATCH 109/499] net: mvneta: fix the Rx desc buffer DMA unmapping With CONFIG_DMA_API_DEBUG enabled we now get a warning when using the mvneta driver: mvneta d0030000.ethernet: DMA-API: device driver frees DMA memory with wrong function [device address=0x000000001165b000] [size=4096 bytes] [mapped as page] [unmapped as single] This is because when using the s/w buffer management, the Rx descriptor buffer is mapped with dma_map_page but unmapped with dma_unmap_single. This patch fixes this by using the right unmapping function. 
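The underlying DMA-API rule, in a generic sketch where dev and page are placeholders rather than the driver's own variables: a buffer mapped with dma_map_page() must be unmapped with dma_unmap_page(), never dma_unmap_single().

    dma_addr_t phys = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
    if (dma_mapping_error(dev, phys))
            return -ENOMEM;
    ...
    dma_unmap_page(dev, phys, PAGE_SIZE, DMA_FROM_DEVICE); /* matches dma_map_page() */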
Fixes: 562e2f467e71 ("net: mvneta: Improve the buffer allocation method for SWBM") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index bc80a678abc3..2db9708f2e24 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2008,8 +2008,8 @@ static int mvneta_rx_swbm(struct napi_struct *napi, skb_add_rx_frag(rxq->skb, frag_num, page, frag_offset, frag_size, PAGE_SIZE); - dma_unmap_single(dev->dev.parent, phys_addr, - PAGE_SIZE, DMA_FROM_DEVICE); + dma_unmap_page(dev->dev.parent, phys_addr, + PAGE_SIZE, DMA_FROM_DEVICE); rxq->left_size -= frag_size; } } else { From 7233b8cab39014620ac9534da11f0f3e506d8fd8 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 11 Sep 2018 15:38:05 +1000 Subject: [PATCH 110/499] powerpc/powernv/ioda2: Reduce upper limit for DMA window size (again) mpe: This was fixed originally in commit d3d4ffaae439 ("powerpc/powernv/ioda2: Reduce upper limit for DMA window size"), but contrary to what the merge commit says was inadvertently lost by me in commit ce57c6610cc2 ("Merge branch 'topic/ppc-kvm' into next") which brought in changes that moved the code to a new file. So reapply it to the new file. Original commit message follows: We use PHB in mode1 which uses bit 59 to select a correct DMA window. However there is mode2 which uses bits 59:55 and allows up to 32 DMA windows per a PE. Even though documentation does not clearly specify that, it seems that the actual hardware does not support bits 59:55 even in mode1, in other words we can create a window as big as 1<<58 but DMA simply won't work. This reduces the upper limit from 59 to 55 bits to let the userspace know about the hardware limits. Fixes: ce57c6610cc2 ("Merge branch 'topic/ppc-kvm' into next") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 6c5db1acbe8d..fe9691040f54 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -276,7 +276,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, level_shift = entries_shift + 3; level_shift = max_t(unsigned int, level_shift, PAGE_SHIFT); - if ((level_shift - 3) * levels + page_shift >= 60) + if ((level_shift - 3) * levels + page_shift >= 55) return -EINVAL; /* Allocate TCE table */ From 50fdf60181b01b7383b85d4b9acbb842263d96a2 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 19 Sep 2018 21:59:10 -0700 Subject: [PATCH 111/499] qed: Fix populating the invalid stag value in multi function mode. In multi-function mode, driver receives the stag value (outer vlan) for a PF from management FW (MFW). If the stag value is negotiated prior to the driver load, then the stag is not notified to the driver and hence driver will have the invalid stag value. The fix is to request the MFW for STAG value during the driver load time. Fixes: cac6f691 ("qed: Add support for Unified Fabric Port") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Tomer Tayar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 15 ++++++++++++++- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 4 ++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 016ca8a7ec8a..97f073fd3725 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1706,7 +1706,7 @@ static int qed_vf_start(struct qed_hwfn *p_hwfn, int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) { struct qed_load_req_params load_req_params; - u32 load_code, param, drv_mb_param; + u32 load_code, resp, param, drv_mb_param; bool b_default_mtu = true; struct qed_hwfn *p_hwfn; int rc = 0, mfw_rc, i; @@ -1852,6 +1852,19 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) if (IS_PF(cdev)) { p_hwfn = QED_LEADING_HWFN(cdev); + + /* Get pre-negotiated values for stag, bandwidth etc. */ + DP_VERBOSE(p_hwfn, + QED_MSG_SPQ, + "Sending GET_OEM_UPDATES command to trigger stag/bandwidth attention handling\n"); + drv_mb_param = 1 << DRV_MB_PARAM_DUMMY_OEM_UPDATES_OFFSET; + rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, + DRV_MSG_CODE_GET_OEM_UPDATES, + drv_mb_param, &resp, ¶m); + if (rc) + DP_NOTICE(p_hwfn, + "Failed to send GET_OEM_UPDATES attention request\n"); + drv_mb_param = STORM_FW_VERSION; rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, DRV_MSG_CODE_OV_UPDATE_STORM_FW_VER, diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 8faceb691657..9b3ef00e5782 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -12414,6 +12414,7 @@ struct public_drv_mb { #define DRV_MSG_SET_RESOURCE_VALUE_MSG 0x35000000 #define DRV_MSG_CODE_OV_UPDATE_WOL 0x38000000 #define DRV_MSG_CODE_OV_UPDATE_ESWITCH_MODE 0x39000000 +#define DRV_MSG_CODE_GET_OEM_UPDATES 0x41000000 #define DRV_MSG_CODE_BW_UPDATE_ACK 0x32000000 #define DRV_MSG_CODE_NIG_DRAIN 0x30000000 @@ -12541,6 +12542,9 @@ struct public_drv_mb { #define DRV_MB_PARAM_ESWITCH_MODE_VEB 0x1 #define DRV_MB_PARAM_ESWITCH_MODE_VEPA 0x2 +#define DRV_MB_PARAM_DUMMY_OEM_UPDATES_MASK 0x1 +#define DRV_MB_PARAM_DUMMY_OEM_UPDATES_OFFSET 0 + #define DRV_MB_PARAM_SET_LED_MODE_OPER 0x0 #define DRV_MB_PARAM_SET_LED_MODE_ON 0x1 #define DRV_MB_PARAM_SET_LED_MODE_OFF 0x2 From 0216da9413afa546627a1b0d319dfd17fef34050 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 19 Sep 2018 21:59:11 -0700 Subject: [PATCH 112/499] qed: Do not add VLAN 0 tag to untagged frames in multi-function mode. In certain multi-function switch dependent modes, firmware adds vlan tag 0 to the untagged frames. This leads to double tagging for the traffic if the dcbx is enabled, which is not the desired behavior. To avoid this, driver needs to set "dcb_dont_add_vlan0" flag. Fixes: cac6f691 ("qed: Add support for Unified Fabric Port") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Tomer Tayar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 9 ++++++++- drivers/net/ethernet/qlogic/qed/qed_dcbx.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index 6bb76e6d3c14..d53f33c6b1fc 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -190,6 +190,7 @@ qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data) static void qed_dcbx_set_params(struct qed_dcbx_results *p_data, + struct qed_hwfn *p_hwfn, struct qed_hw_info *p_info, bool enable, u8 prio, @@ -206,6 +207,11 @@ qed_dcbx_set_params(struct qed_dcbx_results *p_data, else p_data->arr[type].update = DONT_UPDATE_DCB_DSCP; + /* Do not add vlan tag 0 when DCB is enabled and port in UFP/OV mode */ + if ((test_bit(QED_MF_8021Q_TAGGING, &p_hwfn->cdev->mf_bits) || + test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits))) + p_data->arr[type].dont_add_vlan0 = true; + /* QM reconf data */ if (p_info->personality == personality) qed_hw_info_set_offload_tc(p_info, tc); @@ -231,7 +237,7 @@ qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, personality = qed_dcbx_app_update[i].personality; - qed_dcbx_set_params(p_data, p_info, enable, + qed_dcbx_set_params(p_data, p_hwfn, p_info, enable, prio, tc, type, personality); } } @@ -954,6 +960,7 @@ static void qed_dcbx_update_protocol_data(struct protocol_dcb_data *p_data, p_data->dcb_enable_flag = p_src->arr[type].enable; p_data->dcb_priority = p_src->arr[type].priority; p_data->dcb_tc = p_src->arr[type].tc; + p_data->dcb_dont_add_vlan0 = p_src->arr[type].dont_add_vlan0; } /* Set pf update ramrod command params */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h index a4d688c04e18..01f253ea4b22 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h @@ -55,6 +55,7 @@ struct qed_dcbx_app_data { u8 update; /* Update indication */ u8 priority; /* Priority */ u8 tc; /* Traffic Class */ + bool dont_add_vlan0; /* Do not insert a vlan tag with id 0 */ }; #define QED_DCBX_VERSION_DISABLED 0 From 7e3e375ceede8fba5218362d14db7de25fa83f12 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 19 Sep 2018 21:59:12 -0700 Subject: [PATCH 113/499] qed: Add missing device config for RoCE EDPM in UFP mode. This patch adds support to configure the DORQ to use vlan-id/priority for roce EDPM. Fixes: cac6f691 ("qed: Add support for Unified Fabric Port") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Tomer Tayar Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 40 ++++++++++--------- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 24 +++++++++-- .../net/ethernet/qlogic/qed/qed_reg_addr.h | 6 +++ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index d53f33c6b1fc..f5459de6d60a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -190,11 +190,8 @@ qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data) static void qed_dcbx_set_params(struct qed_dcbx_results *p_data, - struct qed_hwfn *p_hwfn, - struct qed_hw_info *p_info, - bool enable, - u8 prio, - u8 tc, + struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + bool enable, u8 prio, u8 tc, enum dcbx_protocol_type type, enum qed_pci_personality personality) { @@ -213,18 +210,24 @@ qed_dcbx_set_params(struct qed_dcbx_results *p_data, p_data->arr[type].dont_add_vlan0 = true; /* QM reconf data */ - if (p_info->personality == personality) - qed_hw_info_set_offload_tc(p_info, tc); + if (p_hwfn->hw_info.personality == personality) + qed_hw_info_set_offload_tc(&p_hwfn->hw_info, tc); + + /* Configure dcbx vlan priority in doorbell block for roce EDPM */ + if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) && + type == DCBX_PROTOCOL_ROCE) { + qed_wr(p_hwfn, p_ptt, DORQ_REG_TAG1_OVRD_MODE, 1); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_PCP_BB_K2, prio << 1); + } } /* Update app protocol data and hw_info fields with the TLV info */ static void qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, - struct qed_hwfn *p_hwfn, - bool enable, - u8 prio, u8 tc, enum dcbx_protocol_type type) + struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + bool enable, u8 prio, u8 tc, + enum dcbx_protocol_type type) { - struct qed_hw_info *p_info = &p_hwfn->hw_info; enum qed_pci_personality personality; enum dcbx_protocol_type id; int i; @@ -237,7 +240,7 @@ qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, personality = qed_dcbx_app_update[i].personality; - qed_dcbx_set_params(p_data, p_hwfn, p_info, enable, + qed_dcbx_set_params(p_data, p_hwfn, p_ptt, enable, prio, tc, type, personality); } } @@ -271,7 +274,7 @@ qed_dcbx_get_app_protocol_type(struct qed_hwfn *p_hwfn, * reconfiguring QM. Get protocol specific data for PF update ramrod command. */ static int -qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, +qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_dcbx_results *p_data, struct dcbx_app_priority_entry *p_tbl, u32 pri_tc_tbl, int count, u8 dcbx_version) @@ -315,7 +318,7 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, enable = true; } - qed_dcbx_update_app_info(p_data, p_hwfn, enable, + qed_dcbx_update_app_info(p_data, p_hwfn, p_ptt, enable, priority, tc, type); } } @@ -337,7 +340,7 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, continue; enable = (type == DCBX_PROTOCOL_ETH) ? false : !!dcbx_version; - qed_dcbx_update_app_info(p_data, p_hwfn, enable, + qed_dcbx_update_app_info(p_data, p_hwfn, p_ptt, enable, priority, tc, type); } @@ -347,7 +350,8 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, /* Parse app TLV's to update TC information in hw_info structure for * reconfiguring QM. Get protocol specific data for PF update ramrod command. 
*/ -static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn) +static int +qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { struct dcbx_app_priority_feature *p_app; struct dcbx_app_priority_entry *p_tbl; @@ -371,7 +375,7 @@ static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn) p_info = &p_hwfn->hw_info; num_entries = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_NUM_ENTRIES); - rc = qed_dcbx_process_tlv(p_hwfn, &data, p_tbl, pri_tc_tbl, + rc = qed_dcbx_process_tlv(p_hwfn, p_ptt, &data, p_tbl, pri_tc_tbl, num_entries, dcbx_version); if (rc) return rc; @@ -897,7 +901,7 @@ qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn, return rc; if (type == QED_DCBX_OPERATIONAL_MIB) { - rc = qed_dcbx_process_mib_info(p_hwfn); + rc = qed_dcbx_process_mib_info(p_hwfn, p_ptt); if (!rc) { /* reconfigure tcs of QM queues according * to negotiation results diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 5d37ec7e9b0b..58c7eb9d8e1b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -1581,13 +1581,29 @@ static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) p_hwfn->mcp_info->func_info.ovlan = (u16)shmem_info.ovlan_stag & FUNC_MF_CFG_OV_STAG_MASK; p_hwfn->hw_info.ovlan = p_hwfn->mcp_info->func_info.ovlan; - if ((p_hwfn->hw_info.hw_mode & BIT(MODE_MF_SD)) && - (p_hwfn->hw_info.ovlan != QED_MCP_VLAN_UNSET)) { - qed_wr(p_hwfn, p_ptt, - NIG_REG_LLH_FUNC_TAG_VALUE, p_hwfn->hw_info.ovlan); + if (test_bit(QED_MF_OVLAN_CLSS, &p_hwfn->cdev->mf_bits)) { + if (p_hwfn->hw_info.ovlan != QED_MCP_VLAN_UNSET) { + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_VALUE, + p_hwfn->hw_info.ovlan); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_EN, 1); + + /* Configure DB to add external vlan to EDPM packets */ + qed_wr(p_hwfn, p_ptt, DORQ_REG_TAG1_OVRD_MODE, 1); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_EXT_VID_BB_K2, + p_hwfn->hw_info.ovlan); + } else { + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_EN, 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_VALUE, 0); + qed_wr(p_hwfn, p_ptt, DORQ_REG_TAG1_OVRD_MODE, 0); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_EXT_VID_BB_K2, 0); + } + qed_sp_pf_update_stag(p_hwfn); } + DP_VERBOSE(p_hwfn, QED_MSG_SP, "ovlan = %d hw_mode = 0x%x\n", + p_hwfn->mcp_info->func_info.ovlan, p_hwfn->hw_info.hw_mode); + /* Acknowledge the MFW */ qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_S_TAG_UPDATE_ACK, 0, &resp, ¶m); diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index f736f70956fd..2440970882c4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -216,6 +216,12 @@ 0x00c000UL #define DORQ_REG_IFEN \ 0x100040UL +#define DORQ_REG_TAG1_OVRD_MODE \ + 0x1008b4UL +#define DORQ_REG_PF_PCP_BB_K2 \ + 0x1008c4UL +#define DORQ_REG_PF_EXT_VID_BB_K2 \ + 0x1008c8UL #define DORQ_REG_DB_DROP_REASON \ 0x100a2cUL #define DORQ_REG_DB_DROP_DETAILS \ From 32bf94fb5c2ec4ec842152d0e5937cd4bb6738fa Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Wed, 19 Sep 2018 13:54:56 -0600 Subject: [PATCH 114/499] xfrm: validate template mode XFRM mode parameters passed as part of the user templates in the IP_XFRM_POLICY are never properly validated. 
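The fix itself is a single range check in validate_tmpl(), shown in full in the hunk below; in isolation it is just:

    if (ut[i].mode >= XFRM_MODE_MAX)
            return -EINVAL;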
Passing values other than valid XFRM modes can cause stack-out-of-bounds reads to occur later in the XFRM processing: [ 140.535608] ================================================================ [ 140.543058] BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x17e4/0x1cc4 [ 140.550306] Read of size 4 at addr ffffffc0238a7a58 by task repro/5148 [ 140.557369] [ 140.558927] Call trace: [ 140.558936] dump_backtrace+0x0/0x388 [ 140.558940] show_stack+0x24/0x30 [ 140.558946] __dump_stack+0x24/0x2c [ 140.558949] dump_stack+0x8c/0xd0 [ 140.558956] print_address_description+0x74/0x234 [ 140.558960] kasan_report+0x240/0x264 [ 140.558963] __asan_report_load4_noabort+0x2c/0x38 [ 140.558967] xfrm_state_find+0x17e4/0x1cc4 [ 140.558971] xfrm_resolve_and_create_bundle+0x40c/0x1fb8 [ 140.558975] xfrm_lookup+0x238/0x1444 [ 140.558977] xfrm_lookup_route+0x48/0x11c [ 140.558984] ip_route_output_flow+0x88/0xc4 [ 140.558991] raw_sendmsg+0xa74/0x266c [ 140.558996] inet_sendmsg+0x258/0x3b0 [ 140.559002] sock_sendmsg+0xbc/0xec [ 140.559005] SyS_sendto+0x3a8/0x5a8 [ 140.559008] el0_svc_naked+0x34/0x38 [ 140.559009] [ 140.592245] page dumped because: kasan: bad access detected [ 140.597981] page_owner info is not active (free page?) [ 140.603267] [ 140.653503] ================================================================ Signed-off-by: Sean Tranchetti Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5151b3ebf068..d0672c400c2f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1455,6 +1455,9 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) (ut[i].family != prev_family)) return -EINVAL; + if (ut[i].mode >= XFRM_MODE_MAX) + return -EINVAL; + prev_family = ut[i].family; switch (ut[i].family) { From b3a5402cbcebaf5a9db4d6a3268070e4a099355d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 18 Sep 2018 18:21:11 +0200 Subject: [PATCH 115/499] ALSA: hda: Fix the audio-component completion timeout The timeout of audio component binding was incorrectly specified in msec, not in jiffies, which results in way too shorter timeout than expected. Along with fixing it, add the information print about the binding failure to show the unexpected situation more clearly. Fixes: a57942bfdd61 ("ALSA: hda: Make audio component support more generic") Signed-off-by: Takashi Iwai --- sound/hda/hdac_i915.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index b5282cbbe489..617ff1aa818f 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -145,9 +145,11 @@ int snd_hdac_i915_init(struct hdac_bus *bus) if (!acomp->ops) { request_module("i915"); /* 10s timeout */ - wait_for_completion_timeout(&bind_complete, 10 * 1000); + wait_for_completion_timeout(&bind_complete, + msecs_to_jiffies(10 * 1000)); } if (!acomp->ops) { + dev_info(bus->dev, "couldn't bind with audio component\n"); snd_hdac_acomp_exit(bus); return -ENODEV; } From 783f3b4e9ec50491c21746e7e05ec6c39c21f563 Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Mon, 17 Sep 2018 11:40:22 -0500 Subject: [PATCH 116/499] usb: musb: dsps: do not disable CPPI41 irq in driver teardown TI AM335x CPPI 4.1 module uses a single register bit for CPPI interrupts in both musb controllers. So disabling the CPPI irq in one musb driver breaks the other musb module. 
Since musb is already disabled before tearing down dma controller in musb_remove(), it is safe to not disable CPPI irq in musb_dma_controller_destroy(). Fixes: 255348289f71 ("usb: musb: dsps: Manage CPPI 4.1 DMA interrupt in DSPS") Cc: stable@vger.kernel.org Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_dsps.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index df827ff57b0d..23a0df79ef21 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -658,16 +658,6 @@ dsps_dma_controller_create(struct musb *musb, void __iomem *base) return controller; } -static void dsps_dma_controller_destroy(struct dma_controller *c) -{ - struct musb *musb = c->musb; - struct dsps_glue *glue = dev_get_drvdata(musb->controller->parent); - void __iomem *usbss_base = glue->usbss_base; - - musb_writel(usbss_base, USBSS_IRQ_CLEARR, USBSS_IRQ_PD_COMP); - cppi41_dma_controller_destroy(c); -} - #ifdef CONFIG_PM_SLEEP static void dsps_dma_controller_suspend(struct dsps_glue *glue) { @@ -697,7 +687,7 @@ static struct musb_platform_ops dsps_ops = { #ifdef CONFIG_USB_TI_CPPI41_DMA .dma_init = dsps_dma_controller_create, - .dma_exit = dsps_dma_controller_destroy, + .dma_exit = cppi41_dma_controller_destroy, #endif .enable = dsps_musb_enable, .disable = dsps_musb_disable, From e871db8d78df1c411032cbb3acfdf8930509360e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 11 Sep 2018 10:00:44 +0200 Subject: [PATCH 117/499] Revert "usb: cdc-wdm: Fix a sleep-in-atomic-context bug in service_outstanding_interrupt()" This reverts commit 6e22e3af7bb3a7b9dc53cb4687659f6e63fca427. The bug the patch describes to, has been already fixed in commit 2df6948428542 ("USB: cdc-wdm: don't enable interrupts in USB-giveback") so need to this, revert it. Fixes: 6e22e3af7bb3 ("usb: cdc-wdm: Fix a sleep-in-atomic-context bug in service_outstanding_interrupt()") Cc: stable Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 656d247819c9..bec581fb7c63 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -460,7 +460,7 @@ static int service_outstanding_interrupt(struct wdm_device *desc) set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); - rv = usb_submit_urb(desc->response, GFP_ATOMIC); + rv = usb_submit_urb(desc->response, GFP_KERNEL); spin_lock_irq(&desc->iuspin); if (rv) { dev_err(&desc->intf->dev, From 7a68d9fb851012829c29e770621905529bd9490b Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 5 Sep 2018 12:07:02 +0200 Subject: [PATCH 118/499] USB: usbdevfs: sanitize flags more Requesting a ZERO_PACKET or not is sensible only for output. In the input direction the device decides. Likewise accepting short packets makes sense only for input. This allows operation with panic_on_warn without opening up a local DOS. 
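
As an aside for readers of this series: the direction rule above can be illustrated with a small stand-alone C sketch. The FLAG_* names and the sanitize_flags() helper below are invented for the example and are not the usbdevfs interface; the point is only that flags which make sense for one transfer direction are filtered out before they reach lower layers.

/* Sketch only: direction-dependent URB flag filtering. */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_SHORT_NOT_OK 0x01  /* meaningful only for IN (device to host) */
#define FLAG_ZERO_PACKET  0x02  /* meaningful only for OUT (host to device) */

static unsigned int sanitize_flags(unsigned int flags, bool is_in)
{
	bool allow_short = is_in;   /* short reads are decided by the device */
	bool allow_zero  = !is_in;  /* zero-length terminators apply to output */

	if (!allow_short && (flags & FLAG_SHORT_NOT_OK)) {
		fprintf(stderr, "ignoring SHORT_NOT_OK on an OUT transfer\n");
		flags &= ~FLAG_SHORT_NOT_OK;
	}
	if (!allow_zero && (flags & FLAG_ZERO_PACKET)) {
		fprintf(stderr, "ignoring ZERO_PACKET on an IN transfer\n");
		flags &= ~FLAG_ZERO_PACKET;
	}
	return flags;
}

int main(void)
{
	/* An IN transfer requesting ZERO_PACKET: only SHORT_NOT_OK survives. */
	printf("effective flags: %#x\n",
	       sanitize_flags(FLAG_SHORT_NOT_OK | FLAG_ZERO_PACKET, true));
	return 0;
}
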
Signed-off-by: Oliver Neukum Reported-by: syzbot+843efa30c8821bd69f53@syzkaller.appspotmail.com Fixes: 0cb54a3e47cb ("USB: debugging code shouldn't alter control flow") Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 6ce77b33da61..263dd2f309fb 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1434,10 +1434,13 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb struct async *as = NULL; struct usb_ctrlrequest *dr = NULL; unsigned int u, totlen, isofrmlen; - int i, ret, is_in, num_sgs = 0, ifnum = -1; + int i, ret, num_sgs = 0, ifnum = -1; int number_of_packets = 0; unsigned int stream_id = 0; void *buf; + bool is_in; + bool allow_short = false; + bool allow_zero = false; unsigned long mask = USBDEVFS_URB_SHORT_NOT_OK | USBDEVFS_URB_BULK_CONTINUATION | USBDEVFS_URB_NO_FSBR | @@ -1471,6 +1474,8 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb u = 0; switch (uurb->type) { case USBDEVFS_URB_TYPE_CONTROL: + if (is_in) + allow_short = true; if (!usb_endpoint_xfer_control(&ep->desc)) return -EINVAL; /* min 8 byte setup packet */ @@ -1511,6 +1516,10 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb break; case USBDEVFS_URB_TYPE_BULK: + if (!is_in) + allow_zero = true; + else + allow_short = true; switch (usb_endpoint_type(&ep->desc)) { case USB_ENDPOINT_XFER_CONTROL: case USB_ENDPOINT_XFER_ISOC: @@ -1531,6 +1540,10 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb if (!usb_endpoint_xfer_int(&ep->desc)) return -EINVAL; interrupt_urb: + if (!is_in) + allow_zero = true; + else + allow_short = true; break; case USBDEVFS_URB_TYPE_ISO: @@ -1676,9 +1689,9 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb u = (is_in ? URB_DIR_IN : URB_DIR_OUT); if (uurb->flags & USBDEVFS_URB_ISO_ASAP) u |= URB_ISO_ASAP; - if (uurb->flags & USBDEVFS_URB_SHORT_NOT_OK && is_in) + if (allow_short && uurb->flags & USBDEVFS_URB_SHORT_NOT_OK) u |= URB_SHORT_NOT_OK; - if (uurb->flags & USBDEVFS_URB_ZERO_PACKET) + if (allow_zero && uurb->flags & USBDEVFS_URB_ZERO_PACKET) u |= URB_ZERO_PACKET; if (uurb->flags & USBDEVFS_URB_NO_INTERRUPT) u |= URB_NO_INTERRUPT; From 81e0403b26d94360abd1f6a57311337973bc82cd Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 5 Sep 2018 12:07:03 +0200 Subject: [PATCH 119/499] USB: usbdevfs: restore warning for nonsensical flags If we filter flags before they reach the core we need to generate our own warnings. 
Signed-off-by: Oliver Neukum Fixes: 0cb54a3e47cb ("USB: debugging code shouldn't alter control flow") Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 263dd2f309fb..244417d0dfd1 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1697,6 +1697,11 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb u |= URB_NO_INTERRUPT; as->urb->transfer_flags = u; + if (!allow_short && uurb->flags & USBDEVFS_URB_SHORT_NOT_OK) + dev_warn(&ps->dev->dev, "Requested nonsensical USBDEVFS_URB_SHORT_NOT_OK.\n"); + if (!allow_zero && uurb->flags & USBDEVFS_URB_ZERO_PACKET) + dev_warn(&ps->dev->dev, "Requested nonsensical USBDEVFS_URB_ZERO_PACKET.\n"); + as->urb->transfer_buffer_length = uurb->buffer_length; as->urb->setup_packet = (unsigned char *)dr; dr = NULL; From c183813fcee44a249339b7c46e1ad271ca1870aa Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 10 Sep 2018 13:58:51 -0400 Subject: [PATCH 120/499] USB: remove LPM management from usb_driver_claim_interface() usb_driver_claim_interface() disables and re-enables Link Power Management, but it shouldn't do either one, for the reasons listed below. This patch removes the two LPM-related function calls from the routine. The reason for disabling LPM in the analogous function usb_probe_interface() is so that drivers won't have to deal with unwanted LPM transitions in their probe routine. But usb_driver_claim_interface() doesn't call the driver's probe routine (or any other callbacks), so that reason doesn't apply here. Furthermore, no driver other than usbfs will ever call usb_driver_claim_interface() unless it is already bound to another interface in the same device, which means disabling LPM here would be redundant. usbfs doesn't interact with LPM at all. Lastly, the error return from usb_unlocked_disable_lpm() isn't handled properly; the code doesn't clean up its earlier actions before returning. Signed-off-by: Alan Stern Fixes: 8306095fd2c1 ("USB: Disable USB 3.0 LPM in critical sections.") CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index e76e95f62f76..7652dcb57998 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -512,7 +512,6 @@ int usb_driver_claim_interface(struct usb_driver *driver, struct device *dev; struct usb_device *udev; int retval = 0; - int lpm_disable_error = -ENODEV; if (!iface) return -ENODEV; @@ -533,16 +532,6 @@ int usb_driver_claim_interface(struct usb_driver *driver, iface->condition = USB_INTERFACE_BOUND; - /* See the comment about disabling LPM in usb_probe_interface(). */ - if (driver->disable_hub_initiated_lpm) { - lpm_disable_error = usb_unlocked_disable_lpm(udev); - if (lpm_disable_error) { - dev_err(&iface->dev, "%s Failed to disable LPM for driver %s\n", - __func__, driver->name); - return -ENOMEM; - } - } - /* Claimed interfaces are initially inactive (suspended) and * runtime-PM-enabled, but only if the driver has autosuspend * support. Otherwise they are marked active, to prevent the @@ -561,10 +550,6 @@ int usb_driver_claim_interface(struct usb_driver *driver, if (device_is_registered(dev)) retval = device_bind_driver(dev); - /* Attempt to re-enable USB3 LPM, if the disable was successful. 
*/ - if (!lpm_disable_error) - usb_unlocked_enable_lpm(udev); - return retval; } EXPORT_SYMBOL_GPL(usb_driver_claim_interface); From bd729f9d67aa9a303d8925bb8c4f06af25f407d1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 10 Sep 2018 13:59:59 -0400 Subject: [PATCH 121/499] USB: fix error handling in usb_driver_claim_interface() The syzbot fuzzing project found a use-after-free bug in the USB core. The bug was caused by usbfs not unbinding from an interface when the USB device file was closed, which led another process to attempt the unbind later on, after the private data structure had been deallocated. The reason usbfs did not unbind the interface at the appropriate time was because it thought the interface had never been claimed in the first place. This was caused by the fact that usb_driver_claim_interface() does not clean up properly when device_bind_driver() returns an error. Although the error code gets passed back to the caller, the iface->dev.driver pointer remains set and iface->condition remains equal to USB_INTERFACE_BOUND. This patch adds proper error handling to usb_driver_claim_interface(). Signed-off-by: Alan Stern Reported-by: syzbot+f84aa7209ccec829536f@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index 7652dcb57998..a1f225f077cd 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -550,6 +550,21 @@ int usb_driver_claim_interface(struct usb_driver *driver, if (device_is_registered(dev)) retval = device_bind_driver(dev); + if (retval) { + dev->driver = NULL; + usb_set_intfdata(iface, NULL); + iface->needs_remote_wakeup = 0; + iface->condition = USB_INTERFACE_UNBOUND; + + /* + * Unbound interfaces are always runtime-PM-disabled + * and runtime-PM-suspended + */ + if (driver->supports_autosuspend) + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + } + return retval; } EXPORT_SYMBOL_GPL(usb_driver_claim_interface); From c9a4cb204e9eb7fa7dfbe3f7d3a674fa530aa193 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 10 Sep 2018 14:00:53 -0400 Subject: [PATCH 122/499] USB: handle NULL config in usb_find_alt_setting() usb_find_alt_setting() takes a pointer to a struct usb_host_config as an argument; it searches for an interface with specified interface and alternate setting numbers in that config. However, it crashes if the usb_host_config pointer argument is NULL. Since this is a general-purpose routine, available for use in many places, we want to to be more robust. This patch makes it return NULL whenever the config argument is NULL. 
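
A minimal stand-alone sketch of the same defensive pattern (invented types, not the USB core API): a lookup helper that tolerates a NULL container lets every caller's existing not-found check also cover the no-configuration case.

/* Sketch only: NULL-tolerant lookup helper. */
#include <stddef.h>
#include <stdio.h>

struct alt_setting { int iface_num; int alt_num; };
struct config { int n; struct alt_setting alts[4]; };

static struct alt_setting *find_alt_setting(struct config *cfg,
					    int iface_num, int alt_num)
{
	if (!cfg)		/* tolerate an unconfigured device */
		return NULL;
	for (int i = 0; i < cfg->n; i++)
		if (cfg->alts[i].iface_num == iface_num &&
		    cfg->alts[i].alt_num == alt_num)
			return &cfg->alts[i];
	return NULL;
}

int main(void)
{
	/* No active configuration: the helper reports "not found" instead
	 * of crashing, and the caller's existing NULL check covers it. */
	if (!find_alt_setting(NULL, 0, 0))
		puts("alt setting not found");
	return 0;
}
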
Signed-off-by: Alan Stern Reported-by: syzbot+19c3aaef85a89d451eac@syzkaller.appspotmail.com CC: Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 623be3174fb3..79d8bd7a612e 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -228,6 +228,8 @@ struct usb_host_interface *usb_find_alt_setting( struct usb_interface_cache *intf_cache = NULL; int i; + if (!config) + return NULL; for (i = 0; i < config->desc.bNumInterfaces; i++) { if (config->intf_cache[i]->altsetting[0].desc.bInterfaceNumber == iface_num) { From 85682a7e3b9c664995ad477520f917039afdc330 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 10 Sep 2018 06:09:04 +0000 Subject: [PATCH 123/499] powerpc: fix csum_ipv6_magic() on little endian platforms On little endian platforms, csum_ipv6_magic() keeps len and proto in CPU byte order. This generates a bad results leading to ICMPv6 packets from other hosts being dropped by powerpc64le platforms. In order to fix this, len and proto should be converted to network byte order ie bigendian byte order. However checksumming 0x12345678 and 0x56341278 provide the exact same result so it is enough to rotate the sum of len and proto by 1 byte. PPC32 only support bigendian so the fix is needed for PPC64 only Fixes: e9c4943a107b ("powerpc: Implement csum_ipv6_magic in assembly") Reported-by: Jianlin Shi Reported-by: Xin Long Cc: # 4.18+ Signed-off-by: Christophe Leroy Tested-by: Xin Long Signed-off-by: Michael Ellerman --- arch/powerpc/lib/checksum_64.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S index 886ed94b9c13..d05c8af4ac51 100644 --- a/arch/powerpc/lib/checksum_64.S +++ b/arch/powerpc/lib/checksum_64.S @@ -443,6 +443,9 @@ _GLOBAL(csum_ipv6_magic) addc r0, r8, r9 ld r10, 0(r4) ld r11, 8(r4) +#ifdef CONFIG_CPU_LITTLE_ENDIAN + rotldi r5, r5, 8 +#endif adde r0, r0, r10 add r5, r5, r7 adde r0, r0, r11 From 5c54fcac9a9de559b444ac63ec3cd82f1d157a0b Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 19 Sep 2018 10:58:05 +0300 Subject: [PATCH 124/499] usb: roles: Take care of driver module reference counting This fixes potential "BUG: unable to handle kernel paging request at ..." from happening. 
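
The pattern here is general: code that hands out a reference to a driver-provided object must also pin the driver, or the driver can be unloaded while the object is still in use. A rough user-space analogue, with invented types standing in for the module and the role-switch handle:

/* Sketch only: pairing a consumer handle with a provider refcount. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct provider { int refcnt; };          /* stands in for a driver module */
struct handle   { struct provider *owner; };

static struct handle *handle_get(struct provider *p, struct handle *h)
{
	p->refcnt++;          /* like try_module_get(): pin the provider */
	h->owner = p;
	return h;
}

static void handle_put(struct handle *h)
{
	if (!h)
		return;
	h->owner->refcnt--;   /* like module_put(): allow unload again */
}

int main(void)
{
	struct provider drv = { 0 };
	struct handle sw;

	handle_get(&drv, &sw);
	assert(drv.refcnt == 1);   /* provider cannot go away while held */
	handle_put(&sw);
	assert(drv.refcnt == 0);
	puts("balanced get/put");
	return 0;
}
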
Fixes: fde0aa6c175a ("usb: common: Small class for USB role switches") Cc: Acked-by: Hans de Goede Tested-by: Hans de Goede Signed-off-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/roles.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/usb/common/roles.c b/drivers/usb/common/roles.c index 15cc76e22123..99116af07f1d 100644 --- a/drivers/usb/common/roles.c +++ b/drivers/usb/common/roles.c @@ -109,8 +109,15 @@ static void *usb_role_switch_match(struct device_connection *con, int ep, */ struct usb_role_switch *usb_role_switch_get(struct device *dev) { - return device_connection_find_match(dev, "usb-role-switch", NULL, - usb_role_switch_match); + struct usb_role_switch *sw; + + sw = device_connection_find_match(dev, "usb-role-switch", NULL, + usb_role_switch_match); + + if (!IS_ERR_OR_NULL(sw)) + WARN_ON(!try_module_get(sw->dev.parent->driver->owner)); + + return sw; } EXPORT_SYMBOL_GPL(usb_role_switch_get); @@ -122,8 +129,10 @@ EXPORT_SYMBOL_GPL(usb_role_switch_get); */ void usb_role_switch_put(struct usb_role_switch *sw) { - if (!IS_ERR_OR_NULL(sw)) + if (!IS_ERR_OR_NULL(sw)) { put_device(&sw->dev); + module_put(sw->dev.parent->driver->owner); + } } EXPORT_SYMBOL_GPL(usb_role_switch_put); From 16c4cb19fa85c648a803752eb63cac0ef69231c2 Mon Sep 17 00:00:00 2001 From: Harry Pan Date: Fri, 14 Sep 2018 16:58:16 +0800 Subject: [PATCH 125/499] usb: core: safely deal with the dynamic quirk lists Applying dynamic usbcore quirks in early booting when the slab is not yet ready would cause kernel panic of null pointer dereference because the quirk_count has been counted as 1 while the quirk_list was failed to allocate. i.e., [ 1.044970] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1.044995] IP: [] usb_detect_quirks+0x88/0xd1 [ 1.045016] PGD 0 [ 1.045026] Oops: 0000 [#1] PREEMPT SMP [ 1.046986] gsmi: Log Shutdown Reason 0x03 [ 1.046995] Modules linked in: [ 1.047008] CPU: 0 PID: 81 Comm: kworker/0:3 Not tainted 4.4.154 #28 [ 1.047016] Hardware name: Google Coral/Coral, BIOS Google_Coral.10068.27.0 12/04/2017 [ 1.047028] Workqueue: usb_hub_wq hub_event [ 1.047037] task: ffff88017a321c80 task.stack: ffff88017a384000 [ 1.047044] RIP: 0010:[] [] usb_detect_quirks+0x88/0xd1 To tackle this odd, let's balance the quirk_count to 0 when the kcalloc call fails, and defer the quirk setting into a lower level callback which ensures that the kernel memory management has been initialized. 
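
The invariant behind the fix is that a count and the array it describes must never disagree; on allocation failure the count has to be rolled back before returning. A small illustrative sketch (the names are made up):

/* Sketch only: keep 'count' consistent with the allocated list. */
#include <stdio.h>
#include <stdlib.h>

struct entry { int vid, pid; };

static struct entry *list;
static size_t count;

static int set_entries(size_t n)
{
	count = n;
	list = calloc(count, sizeof(*list));
	if (!list) {
		count = 0;	/* never advertise entries we do not have */
		return -1;
	}
	return 0;
}

int main(void)
{
	if (set_entries(4) == 0)
		printf("%zu entries ready\n", count);
	free(list);
	return 0;
}
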
Fixes: 027bd6cafd9a ("usb: core: Add "quirks" parameter for usbcore") Signed-off-by: Harry Pan Acked-by: Kai-Heng Feng Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index e77dfe5ed5ec..178d6c6063c0 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -58,6 +58,7 @@ static int quirks_param_set(const char *val, const struct kernel_param *kp) quirk_list = kcalloc(quirk_count, sizeof(struct quirk_entry), GFP_KERNEL); if (!quirk_list) { + quirk_count = 0; mutex_unlock(&quirk_mutex); return -ENOMEM; } @@ -154,7 +155,7 @@ static struct kparam_string quirks_param_string = { .string = quirks_param, }; -module_param_cb(quirks, &quirks_param_ops, &quirks_param_string, 0644); +device_param_cb(quirks, &quirks_param_ops, &quirks_param_string, 0644); MODULE_PARM_DESC(quirks, "Add/modify USB quirks by specifying quirks=vendorID:productID:quirks"); /* Lists of quirky USB devices, split in device quirks and interface quirks. From 3e3b81965cbfa01fda6d77750feedc3c46fc28d0 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 19 Sep 2018 10:58:04 +0300 Subject: [PATCH 126/499] usb: typec: mux: Take care of driver module reference counting Functions typec_mux_get() and typec_switch_get() already make sure that the mux device reference count is incremented, but the same must be done to the driver module as well to prevent the drivers from being unloaded in the middle of operation. This fixes a potential "BUG: unable to handle kernel paging request at ..." from happening. Fixes: 93dd2112c7b2 ("usb: typec: mux: Get the mux identifier from function parameter") Acked-by: Hans de Goede Tested-by: Hans de Goede Signed-off-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/usb/typec/mux.c b/drivers/usb/typec/mux.c index ddaac63ecf12..d990aa510fab 100644 --- a/drivers/usb/typec/mux.c +++ b/drivers/usb/typec/mux.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -49,8 +50,10 @@ struct typec_switch *typec_switch_get(struct device *dev) mutex_lock(&switch_lock); sw = device_connection_find_match(dev, "typec-switch", NULL, typec_switch_match); - if (!IS_ERR_OR_NULL(sw)) + if (!IS_ERR_OR_NULL(sw)) { + WARN_ON(!try_module_get(sw->dev->driver->owner)); get_device(sw->dev); + } mutex_unlock(&switch_lock); return sw; @@ -65,8 +68,10 @@ EXPORT_SYMBOL_GPL(typec_switch_get); */ void typec_switch_put(struct typec_switch *sw) { - if (!IS_ERR_OR_NULL(sw)) + if (!IS_ERR_OR_NULL(sw)) { + module_put(sw->dev->driver->owner); put_device(sw->dev); + } } EXPORT_SYMBOL_GPL(typec_switch_put); @@ -136,8 +141,10 @@ struct typec_mux *typec_mux_get(struct device *dev, const char *name) mutex_lock(&mux_lock); mux = device_connection_find_match(dev, name, NULL, typec_mux_match); - if (!IS_ERR_OR_NULL(mux)) + if (!IS_ERR_OR_NULL(mux)) { + WARN_ON(!try_module_get(mux->dev->driver->owner)); get_device(mux->dev); + } mutex_unlock(&mux_lock); return mux; @@ -152,8 +159,10 @@ EXPORT_SYMBOL_GPL(typec_mux_get); */ void typec_mux_put(struct typec_mux *mux) { - if (!IS_ERR_OR_NULL(mux)) + if (!IS_ERR_OR_NULL(mux)) { + module_put(mux->dev->driver->owner); put_device(mux->dev); + } } EXPORT_SYMBOL_GPL(typec_mux_put); From 2823c8716c687d6c7e261a3a02b3cab43809fe9c Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 27 Aug 2018 10:34:07 -0500 Subject: 
[PATCH 127/499] b43: fix DMA error related regression with proprietary firmware In commit 66cffd6daab7 ("b43: fix transmit failure when VT is switched"), a condition is noted where the network controller needs to be reset. Note that this situation happens when running the open-source firmware (http://netweb.ing.unibs.it/~openfwwf/), plus a number of other special conditions. for a different card model, it is reported that this change breaks operation running the proprietary firmware (https://marc.info/?l=linux-wireless&m=153504546924558&w=2). Rather than reverting the previous patch, the code is tweaked to avoid the reset unless the open-source firmware is being used. Fixes: 66cffd6daab7 ("b43: fix transmit failure when VT is switched") Cc: Stable # 4.18+ Cc: Taketo Kabe Reported-and-tested-by: D. Prabhu Signed-off-by: Larry Finger Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/b43/dma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/b43/dma.c b/drivers/net/wireless/broadcom/b43/dma.c index 6b0e1ec346cb..d46d57b989ae 100644 --- a/drivers/net/wireless/broadcom/b43/dma.c +++ b/drivers/net/wireless/broadcom/b43/dma.c @@ -1518,13 +1518,15 @@ void b43_dma_handle_txstatus(struct b43_wldev *dev, } } else { /* More than a single header/data pair were missed. - * Report this error, and reset the controller to + * Report this error. If running with open-source + * firmware, then reset the controller to * revive operation. */ b43dbg(dev->wl, "Out of order TX status report on DMA ring %d. Expected %d, but got %d\n", ring->index, firstused, slot); - b43_controller_restart(dev, "Out of order TX"); + if (dev->fw.opensource) + b43_controller_restart(dev, "Out of order TX"); return; } } From c716a25b9b70084e1144f77423f5aedd772ea478 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 20 Sep 2018 01:38:58 -0300 Subject: [PATCH 128/499] powerpc/pkeys: Fix reading of ibm, processor-storage-keys property scan_pkey_feature() uses of_property_read_u32_array() to read the ibm,processor-storage-keys property and calls be32_to_cpu() on the value it gets. The problem is that of_property_read_u32_array() already returns the value converted to the CPU byte order. The value of pkeys_total ends up more or less sane because there's a min() call in pkey_initialize() which reduces pkeys_total to 32. So in practice the kernel ignores the fact that the hypervisor reserved one key for itself (the device tree advertises 31 keys in my test VM). This is wrong, but the effect in practice is that when a process tries to allocate the 32nd key, it gets an -EINVAL error instead of -ENOSPC which would indicate that there aren't any keys available Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pkeys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index 333b1f80c435..b271b283c785 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -45,7 +45,7 @@ static void scan_pkey_feature(void) * Since any pkey can be used for data or execute, we will just treat * all keys as equal and track them as one entity. 
*/ - pkeys_total = be32_to_cpu(vals[0]); + pkeys_total = vals[0]; pkeys_devtree_defined = true; } From 7e620984b62532783912312e334f3c48cdacbd5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 20 Sep 2018 14:11:17 +0200 Subject: [PATCH 129/499] serial: imx: restore handshaking irq for imx1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Back in 2015 when irda was dropped from the driver imx1 was broken. This change reintroduces the support for the third interrupt of the UART. Fixes: afe9cbb1a6ad ("serial: imx: drop support for IRDA") Cc: stable Signed-off-by: Uwe Kleine-König Reviewed-by: Leonard Crestez Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 239c0fa2e981..0f67197a3783 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2351,6 +2351,14 @@ static int imx_uart_probe(struct platform_device *pdev) ret); return ret; } + + ret = devm_request_irq(&pdev->dev, rtsirq, imx_uart_rtsint, 0, + dev_name(&pdev->dev), sport); + if (ret) { + dev_err(&pdev->dev, "failed to request rts irq: %d\n", + ret); + return ret; + } } else { ret = devm_request_irq(&pdev->dev, rxirq, imx_uart_int, 0, dev_name(&pdev->dev), sport); From f9d5b1d50840320fb6c66fafd1a532a995fc3758 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 20 Sep 2018 09:31:45 +0300 Subject: [PATCH 130/499] mlxsw: spectrum: Bump required firmware version MC-aware mode was introduced to mlxsw in commit 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports") and fixed up later in commit 3a3539cd3632 ("mlxsw: spectrum_buffers: Set up a dedicated pool for BUM traffic"). As the final piece of puzzle, a firmware issue whereby a wrong priority was assigned to BUM traffic was corrected in FW version 13.1703.4. Therefore require this FW version in the driver. Fixes: 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports") Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 930700413b1d..b492152c8881 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -44,8 +44,8 @@ #define MLXSW_SP_FWREV_MINOR_TO_BRANCH(minor) ((minor) / 100) #define MLXSW_SP1_FWREV_MAJOR 13 -#define MLXSW_SP1_FWREV_MINOR 1702 -#define MLXSW_SP1_FWREV_SUBMINOR 6 +#define MLXSW_SP1_FWREV_MINOR 1703 +#define MLXSW_SP1_FWREV_SUBMINOR 4 #define MLXSW_SP1_FWREV_CAN_RESET_MINOR 1702 static const struct mlxsw_fw_rev mlxsw_sp1_fw_rev = { From 56ce3c5a50f4d8cc95361b1ec7f152006c6320d8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 20 Sep 2018 09:27:30 +0200 Subject: [PATCH 131/499] smc: generic netlink family should be __ro_after_init The generic netlink family is only initialized during module init, so it should be __ro_after_init like all other generic netlink families. Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 01c6ce042a1c..7cb3e4f07c10 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -461,7 +461,7 @@ static const struct genl_ops smc_pnet_ops[] = { }; /* SMC_PNETID family definition */ -static struct genl_family smc_pnet_nl_family = { +static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, From fb6de923ca3358a91525552b4907d4cb38730bdd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 19 Sep 2018 15:30:51 -0600 Subject: [PATCH 132/499] regulator: fix crash caused by null driver data dev_set_drvdata() needs to be called before device_register() exposes device to userspace. Otherwise kernel crashes after it gets null pointer from dev_get_drvdata() when userspace tries to access sysfs entries. [Removed backtrace for length -- broonie] Signed-off-by: Yu Zhao Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- drivers/regulator/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 90215f57270f..9577d8941846 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -4395,13 +4395,13 @@ regulator_register(const struct regulator_desc *regulator_desc, !rdev->desc->fixed_uV) rdev->is_switch = true; + dev_set_drvdata(&rdev->dev, rdev); ret = device_register(&rdev->dev); if (ret != 0) { put_device(&rdev->dev); goto unset_supplies; } - dev_set_drvdata(&rdev->dev, rdev); rdev_init_debugfs(rdev); /* try to resolve regulators supply since a new one was registered */ From d98627d1360d55e3b28f702caca8b6342c4a4e45 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 27 May 2018 22:42:55 +0100 Subject: [PATCH 133/499] drm/i2c: tda9950: fix timeout counter check Currently the check to see if the timeout has reached zero is incorrect and the check is instead checking if the timeout is non-zero and not zero, hence it will break out of the loop on the first iteration and the msleep is never executed. Fix this by breaking from the loop when timeout is zero. Detected by CoverityScan, CID#1469404 ("Logically Dead Code") Fixes: f0316f93897c ("drm/i2c: tda9950: add CEC driver") Signed-off-by: Colin Ian King Signed-off-by: Russell King --- drivers/gpu/drm/i2c/tda9950.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i2c/tda9950.c b/drivers/gpu/drm/i2c/tda9950.c index 3f7396caad48..f2186409f0cf 100644 --- a/drivers/gpu/drm/i2c/tda9950.c +++ b/drivers/gpu/drm/i2c/tda9950.c @@ -307,7 +307,7 @@ static void tda9950_release(struct tda9950_priv *priv) /* Wait up to .5s for it to signal non-busy */ do { csr = tda9950_read(client, REG_CSR); - if (!(csr & CSR_BUSY) || --timeout) + if (!(csr & CSR_BUSY) || !--timeout) break; msleep(10); } while (1); From e0dccce1193f87597548d0db6ecc942fb92c04cd Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 27 Aug 2018 14:28:50 +0200 Subject: [PATCH 134/499] drm/i2c: tda9950: set MAX_RETRIES for errors only The CEC_TX_STATUS_MAX_RETRIES should be set for errors only to prevent the CEC framework from retrying the transmit. If the transmit was successful, then don't set this flag. Found by running 'cec-compliance -A' on a beaglebone box. 
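
The change amounts to qualifying a status flag: MAX_RETRIES means the hardware already retried and gave up, which is only meaningful for a failed transmit. A tiny sketch with invented status constants:

/* Sketch only: annotate failed transmits, leave successful ones untouched. */
#include <stdio.h>

#define STATUS_OK          0x01
#define STATUS_MAX_RETRIES 0x40

static unsigned int finalize_tx_status(unsigned int tx_status)
{
	if (tx_status != STATUS_OK)
		tx_status |= STATUS_MAX_RETRIES;  /* hw already did the retries */
	return tx_status;
}

int main(void)
{
	printf("%#x %#x\n", finalize_tx_status(STATUS_OK),
	       finalize_tx_status(0x20 /* e.g. a NACK condition */));
	return 0;
}
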
Signed-off-by: Hans Verkuil Signed-off-by: Russell King --- drivers/gpu/drm/i2c/tda9950.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i2c/tda9950.c b/drivers/gpu/drm/i2c/tda9950.c index f2186409f0cf..ccd355d0c123 100644 --- a/drivers/gpu/drm/i2c/tda9950.c +++ b/drivers/gpu/drm/i2c/tda9950.c @@ -188,7 +188,8 @@ static irqreturn_t tda9950_irq(int irq, void *data) break; } /* TDA9950 executes all retries for us */ - tx_status |= CEC_TX_STATUS_MAX_RETRIES; + if (tx_status != CEC_TX_STATUS_OK) + tx_status |= CEC_TX_STATUS_MAX_RETRIES; cec_transmit_done(priv->adap, tx_status, arb_lost_cnt, nack_cnt, 0, err_cnt); break; From a173f066c7cfc031acb8f541708041e009fc9812 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 17 Sep 2018 08:20:36 -0700 Subject: [PATCH 135/499] netfilter: bridge: Don't sabotage nf_hook calls from an l3mdev For starters, the bridge netfilter code registers operations that are invoked any time nh_hook is called. Specifically, ip_sabotage_in watches for nested calls for NF_INET_PRE_ROUTING when a bridge is in the stack. Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish, then resets in_prerouting flag to 0 and the packet continues up the stack. The packet eventually makes it to the VRF driver and it invokes nf_hook for NF_INET_PRE_ROUTING in case any rules have been added against the vrf device. Because of the registered operations the call to nf_hook causes ip_sabotage_in to be invoked. That function sees the nf_bridge on the skb and that in_prerouting is not set. Thinking it is an invalid nested call it steals (drops) the packet. Update ip_sabotage_in to recognize that the bridge or one of its upper devices (e.g., vlan) can be enslaved to a VRF (L3 master device) and allow the packet to go through the nf_hook a second time. Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device") Reported-by: D'Souza, Nelson Signed-off-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..37278dc280eb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; } From bab4344975fe2c719eda32de59298d6de26fe126 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Mon, 17 Sep 2018 22:21:36 -0700 Subject: [PATCH 136/499] netfilter: nft_osf: use enum nft_data_types for nft_validate_register_store The function nft_validate_register_store requires a struct of type struct nft_data_types. NFTA_DATA_VALUE is of type enum nft_verdict_attributes. Pass the correct enum type. 
This fixes a warning seen with Clang: net/netfilter/nft_osf.c:52:8: warning: implicit conversion from enumeration type 'enum nft_data_attributes' to different enumeration type 'enum nft_data_types' [-Wenum-conversion] NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); ^~~~~~~~~~~~~~~ Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Link: https://github.com/ClangBuiltLinux/linux/issues/103 Signed-off-by: Stefan Agner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 5af74b37f423..a35fb59ace73 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -49,7 +49,7 @@ static int nft_osf_init(const struct nft_ctx *ctx, priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, - NFTA_DATA_VALUE, NFT_OSF_MAXGENRELEN); + NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN); if (err < 0) return err; From 346fa83d10934cf206e2fd0f514bf8ce186f08fe Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 19 Sep 2018 20:21:11 +0800 Subject: [PATCH 137/499] netfilter: conntrack: get rid of double sizeof sizeof(sizeof()) is quite strange and does not seem to be what is wanted here. The issue is detected with the help of Coccinelle. Fixes: 39215846740a ("netfilter: conntrack: remove nlattr_size pointer from l4proto trackers") Signed-off-by: zhong jiang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b4bdf9eda7b7..247b89784a6f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1213,8 +1213,8 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags)))) + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ + NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { From 8c6ec3613e7b0aade20a3196169c0bab32ed3e3f Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 19 Sep 2018 19:01:37 +0200 Subject: [PATCH 138/499] bnxt_en: don't try to offload VLAN 'modify' action bnxt offload code currently supports only 'push' and 'pop' operation: let .ndo_setup_tc() return -EOPNOTSUPP if VLAN 'modify' action is configured. Fixes: 2ae7408fedfe ("bnxt_en: bnxt: add TC flower filter offload support") Signed-off-by: Davide Caratti Acked-by: Sathya Perla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 092c817f8f11..e1594c9df4c6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -75,17 +75,23 @@ static int bnxt_tc_parse_redir(struct bnxt *bp, return 0; } -static void bnxt_tc_parse_vlan(struct bnxt *bp, - struct bnxt_tc_actions *actions, - const struct tc_action *tc_act) +static int bnxt_tc_parse_vlan(struct bnxt *bp, + struct bnxt_tc_actions *actions, + const struct tc_action *tc_act) { - if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_POP) { + switch (tcf_vlan_action(tc_act)) { + case TCA_VLAN_ACT_POP: actions->flags |= BNXT_TC_ACTION_FLAG_POP_VLAN; - } else if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_PUSH) { + break; + case TCA_VLAN_ACT_PUSH: actions->flags |= BNXT_TC_ACTION_FLAG_PUSH_VLAN; actions->push_vlan_tci = htons(tcf_vlan_push_vid(tc_act)); actions->push_vlan_tpid = tcf_vlan_push_proto(tc_act); + break; + default: + return -EOPNOTSUPP; } + return 0; } static int bnxt_tc_parse_tunnel_set(struct bnxt *bp, @@ -134,7 +140,9 @@ static int bnxt_tc_parse_actions(struct bnxt *bp, /* Push/pop VLAN */ if (is_tcf_vlan(tc_act)) { - bnxt_tc_parse_vlan(bp, actions, tc_act); + rc = bnxt_tc_parse_vlan(bp, actions, tc_act); + if (rc) + return rc; continue; } From d7ab5cdce54da631f0c8c11e506c974536a3581e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 20 Sep 2018 17:27:28 +0800 Subject: [PATCH 139/499] sctp: update dst pmtu with the correct daddr When processing pmtu update from an icmp packet, it calls .update_pmtu with sk instead of skb in sctp_transport_update_pmtu. However for sctp, the daddr in the transport might be different from inet_sock->inet_daddr or sk->sk_v6_daddr, which is used to update or create the route cache. The incorrect daddr will cause a different route cache created for the path. So before calling .update_pmtu, inet_sock->inet_daddr/sk->sk_v6_daddr should be updated with the daddr in the transport, and update it back after it's done. The issue has existed since route exceptions introduction. Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.") Reported-by: ian.periam@dialogic.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. 
Miller --- net/sctp/transport.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 12cac85da994..033696e6f74f 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -260,6 +260,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); + struct sock *sk = t->asoc->base.sk; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { @@ -271,12 +272,19 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) pmtu = SCTP_TRUNC4(pmtu); if (dst) { - dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); + struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); + union sctp_addr addr; + + pf->af->from_sk(&addr, sk); + pf->to_sk_daddr(&t->ipaddr, sk); + dst->ops->update_pmtu(dst, sk, NULL, pmtu); + pf->to_sk_daddr(&addr, sk); + dst = sctp_transport_dst_check(t); } if (!dst) { - t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); + t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); dst = t->dst; } From 60489f085574157c343fc62a32f997fe7346a659 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 20 Sep 2018 09:31:10 +0200 Subject: [PATCH 140/499] spi: spi-mem: Add missing description for data.nbytes field Add a description for spi_mem_op.data.nbytes to the kerneldoc header. Fixes: c36ff266dc82 ("spi: Extend the core to ease integration of SPI memory controllers") Signed-off-by: Boris Brezillon Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 62722fb7472d..0cfbb1ad8d96 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -79,6 +79,8 @@ enum spi_mem_data_dir { * @dummy.buswidth: number of IO lanes used to transmit the dummy bytes * @data.buswidth: number of IO lanes used to send/receive the data * @data.dir: direction of the transfer + * @data.nbytes: number of data bytes to send/receive. Can be zero if the + * operation does not involve transferring data * @data.buf.in: input buffer * @data.buf.out: output buffer */ From c949a8e8b43f2c75567269bcc9a50d704ae3c420 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 20 Sep 2018 09:31:11 +0200 Subject: [PATCH 141/499] spi: spi-mem: Move the DMA-able constraint doc to the kerneldoc header We'd better have that documented in the kerneldoc header, so that it's exposed to the doc generated by Sphinx. Signed-off-by: Boris Brezillon Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 0cfbb1ad8d96..7195fbc234aa 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -81,8 +81,8 @@ enum spi_mem_data_dir { * @data.dir: direction of the transfer * @data.nbytes: number of data bytes to send/receive. Can be zero if the * operation does not involve transferring data - * @data.buf.in: input buffer - * @data.buf.out: output buffer + * @data.buf.in: input buffer (must be DMA-able) + * @data.buf.out: output buffer (must be DMA-able) */ struct spi_mem_op { struct { @@ -105,7 +105,6 @@ struct spi_mem_op { u8 buswidth; enum spi_mem_data_dir dir; unsigned int nbytes; - /* buf.{in,out} must be DMA-able. 
*/ union { void *in; const void *out; From b3027b7746ce1e5a9429715ee6492aca2a6e4cf0 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 19 Sep 2018 17:06:36 +0100 Subject: [PATCH 142/499] MAINTAINERS: Move mobiveil PCI driver entry where it belongs Commit 92f9ccca4c08 ("PCI: mobiveil: Add Mobiveil PCIe Host Bridge IP driver DT bindings") managed to add a MAINTAINERS entry where it does not really belong (ie in the middle of a totally unrelated series of entries and in the wrong alphabetical order). Fix it. Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Cc: Subrahmanya Lingappa --- MAINTAINERS | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f05d2e9ffc84..700408b7bc53 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9716,13 +9716,6 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ S: Maintained F: drivers/media/dvb-frontends/mn88473* -PCI DRIVER FOR MOBIVEIL PCIE IP -M: Subrahmanya Lingappa -L: linux-pci@vger.kernel.org -S: Supported -F: Documentation/devicetree/bindings/pci/mobiveil-pcie.txt -F: drivers/pci/controller/pcie-mobiveil.c - MODULE SUPPORT M: Jessica Yu T: git git://git.kernel.org/pub/scm/linux/kernel/git/jeyu/linux.git modules-next @@ -11137,6 +11130,13 @@ F: include/uapi/linux/switchtec_ioctl.h F: include/linux/switchtec.h F: drivers/ntb/hw/mscc/ +PCI DRIVER FOR MOBIVEIL PCIE IP +M: Subrahmanya Lingappa +L: linux-pci@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/pci/mobiveil-pcie.txt +F: drivers/pci/controller/pcie-mobiveil.c + PCI DRIVER FOR MVEBU (Marvell Armada 370 and Armada XP SOC support) M: Thomas Petazzoni M: Jason Cooper From 9024143e700f89d74b8cdaf316a3499d74fc56fe Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 20 Sep 2018 16:32:52 -0500 Subject: [PATCH 143/499] PCI: dwc: Fix scheduling while atomic issues When programming the inbound/outbound ATUs, we call usleep_range() after each checking PCIE_ATU_ENABLE bit. Unfortunately, the ATU programming can be executed in atomic context: inbound ATU programming could be called through pci_epc_write_header() =>dw_pcie_ep_write_header() =>dw_pcie_prog_inbound_atu() outbound ATU programming could be called through pci_bus_read_config_dword() =>dw_pcie_rd_conf() =>dw_pcie_prog_outbound_atu() Fix this issue by calling mdelay() instead. 
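
The constraint being worked around is generic: usleep_range() and msleep() may schedule and so cannot be used in atomic context, while mdelay() busy-waits and can. A schematic kernel-style poll loop in the spirit of the fix; the register, bit and retry values below are illustrative, not the DesignWare ones.

/* Sketch only: poll a hardware enable bit from possibly-atomic context. */
#include <linux/bits.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/io.h>

#define POLL_RETRIES	5
#define POLL_DELAY_MS	9
#define ENABLE_BIT	BIT(31)

static int wait_for_enable(void __iomem *reg)
{
	unsigned int retries;

	for (retries = 0; retries < POLL_RETRIES; retries++) {
		if (readl(reg) & ENABLE_BIT)
			return 0;
		mdelay(POLL_DELAY_MS);	/* busy-wait: no sleeping under a spinlock */
	}
	return -ETIMEDOUT;
}
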
Fixes: f8aed6ec624f ("PCI: dwc: designware: Add EP mode support") Fixes: d8bbeb39fbf3 ("PCI: designware: Wait for iATU enable") Signed-off-by: Jisheng Zhang [lorenzo.pieralisi@arm.com: commit log update] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware.c | 8 ++++---- drivers/pci/controller/dwc/pcie-designware.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 778c4f76a884..2153956a0b20 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -135,7 +135,7 @@ static void dw_pcie_prog_outbound_atu_unroll(struct dw_pcie *pci, int index, if (val & PCIE_ATU_ENABLE) return; - usleep_range(LINK_WAIT_IATU_MIN, LINK_WAIT_IATU_MAX); + mdelay(LINK_WAIT_IATU); } dev_err(pci->dev, "Outbound iATU is not being enabled\n"); } @@ -178,7 +178,7 @@ void dw_pcie_prog_outbound_atu(struct dw_pcie *pci, int index, int type, if (val & PCIE_ATU_ENABLE) return; - usleep_range(LINK_WAIT_IATU_MIN, LINK_WAIT_IATU_MAX); + mdelay(LINK_WAIT_IATU); } dev_err(pci->dev, "Outbound iATU is not being enabled\n"); } @@ -236,7 +236,7 @@ static int dw_pcie_prog_inbound_atu_unroll(struct dw_pcie *pci, int index, if (val & PCIE_ATU_ENABLE) return 0; - usleep_range(LINK_WAIT_IATU_MIN, LINK_WAIT_IATU_MAX); + mdelay(LINK_WAIT_IATU); } dev_err(pci->dev, "Inbound iATU is not being enabled\n"); @@ -282,7 +282,7 @@ int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, int index, int bar, if (val & PCIE_ATU_ENABLE) return 0; - usleep_range(LINK_WAIT_IATU_MIN, LINK_WAIT_IATU_MAX); + mdelay(LINK_WAIT_IATU); } dev_err(pci->dev, "Inbound iATU is not being enabled\n"); diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 96126fd8403c..9f1a5e399b70 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -26,8 +26,7 @@ /* Parameters for the waiting for iATU enabled routine */ #define LINK_WAIT_MAX_IATU_RETRIES 5 -#define LINK_WAIT_IATU_MIN 9000 -#define LINK_WAIT_IATU_MAX 10000 +#define LINK_WAIT_IATU 9 /* Synopsys-specific PCIe configuration registers */ #define PCIE_PORT_LINK_CONTROL 0x710 From 4eeed3686981ff887bbdd7254139e2eca276534c Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Tue, 18 Sep 2018 10:51:37 +0300 Subject: [PATCH 144/499] RDMA/uverbs: Fix validity check for modify QP Uverbs shouldn't enforce QP state in the command unless the user set the QP state bit in the attribute mask. In addition, only copy qp attr fields which have the corresponding bit set in the attribute mask over to the internal attr structure. 
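
The fix follows the usual rule for attribute-plus-mask interfaces: a field in the command is meaningful only if its bit is set in the mask, so copying or validating it unconditionally lets stale user data through. A small stand-alone sketch of mask-gated copying (invented structure and bit names):

/* Sketch only: copy a field only when its mask bit says it is valid. */
#include <stdint.h>
#include <stdio.h>

#define ATTR_STATE   (1u << 0)
#define ATTR_TIMEOUT (1u << 1)

struct user_cmd { uint32_t attr_mask; int state; int timeout; };
struct attr     { int state; int timeout; };

static void copy_masked(struct attr *dst, const struct user_cmd *cmd)
{
	if (cmd->attr_mask & ATTR_STATE)
		dst->state = cmd->state;
	if (cmd->attr_mask & ATTR_TIMEOUT)
		dst->timeout = cmd->timeout;
	/* fields without a mask bit keep their previous/default values */
}

int main(void)
{
	struct attr a = { .state = -1, .timeout = -1 };
	struct user_cmd cmd = { .attr_mask = ATTR_TIMEOUT, .timeout = 14,
				.state = 12345 /* garbage: must be ignored */ };

	copy_masked(&a, &cmd);
	printf("state=%d timeout=%d\n", a.state, a.timeout);
	return 0;
}
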
Fixes: 88de869bbe4f ("RDMA/uverbs: Ensure validity of current QP state value") Fixes: bc38a6abdd5a ("[PATCH] IB uverbs: core implementation") Signed-off-by: Majd Dibbiny Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 68 ++++++++++++++++++---------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a21d5214afc3..e012ca80f9d1 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2027,33 +2027,55 @@ static int modify_qp(struct ib_uverbs_file *file, if ((cmd->base.attr_mask & IB_QP_CUR_STATE && cmd->base.cur_qp_state > IB_QPS_ERR) || - cmd->base.qp_state > IB_QPS_ERR) { + (cmd->base.attr_mask & IB_QP_STATE && + cmd->base.qp_state > IB_QPS_ERR)) { ret = -EINVAL; goto release_qp; } - attr->qp_state = cmd->base.qp_state; - attr->cur_qp_state = cmd->base.cur_qp_state; - attr->path_mtu = cmd->base.path_mtu; - attr->path_mig_state = cmd->base.path_mig_state; - attr->qkey = cmd->base.qkey; - attr->rq_psn = cmd->base.rq_psn; - attr->sq_psn = cmd->base.sq_psn; - attr->dest_qp_num = cmd->base.dest_qp_num; - attr->qp_access_flags = cmd->base.qp_access_flags; - attr->pkey_index = cmd->base.pkey_index; - attr->alt_pkey_index = cmd->base.alt_pkey_index; - attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify; - attr->max_rd_atomic = cmd->base.max_rd_atomic; - attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic; - attr->min_rnr_timer = cmd->base.min_rnr_timer; - attr->port_num = cmd->base.port_num; - attr->timeout = cmd->base.timeout; - attr->retry_cnt = cmd->base.retry_cnt; - attr->rnr_retry = cmd->base.rnr_retry; - attr->alt_port_num = cmd->base.alt_port_num; - attr->alt_timeout = cmd->base.alt_timeout; - attr->rate_limit = cmd->rate_limit; + if (cmd->base.attr_mask & IB_QP_STATE) + attr->qp_state = cmd->base.qp_state; + if (cmd->base.attr_mask & IB_QP_CUR_STATE) + attr->cur_qp_state = cmd->base.cur_qp_state; + if (cmd->base.attr_mask & IB_QP_PATH_MTU) + attr->path_mtu = cmd->base.path_mtu; + if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE) + attr->path_mig_state = cmd->base.path_mig_state; + if (cmd->base.attr_mask & IB_QP_QKEY) + attr->qkey = cmd->base.qkey; + if (cmd->base.attr_mask & IB_QP_RQ_PSN) + attr->rq_psn = cmd->base.rq_psn; + if (cmd->base.attr_mask & IB_QP_SQ_PSN) + attr->sq_psn = cmd->base.sq_psn; + if (cmd->base.attr_mask & IB_QP_DEST_QPN) + attr->dest_qp_num = cmd->base.dest_qp_num; + if (cmd->base.attr_mask & IB_QP_ACCESS_FLAGS) + attr->qp_access_flags = cmd->base.qp_access_flags; + if (cmd->base.attr_mask & IB_QP_PKEY_INDEX) + attr->pkey_index = cmd->base.pkey_index; + if (cmd->base.attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify; + if (cmd->base.attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + attr->max_rd_atomic = cmd->base.max_rd_atomic; + if (cmd->base.attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic; + if (cmd->base.attr_mask & IB_QP_MIN_RNR_TIMER) + attr->min_rnr_timer = cmd->base.min_rnr_timer; + if (cmd->base.attr_mask & IB_QP_PORT) + attr->port_num = cmd->base.port_num; + if (cmd->base.attr_mask & IB_QP_TIMEOUT) + attr->timeout = cmd->base.timeout; + if (cmd->base.attr_mask & IB_QP_RETRY_CNT) + attr->retry_cnt = cmd->base.retry_cnt; + if (cmd->base.attr_mask & IB_QP_RNR_RETRY) + attr->rnr_retry = cmd->base.rnr_retry; + if (cmd->base.attr_mask & IB_QP_ALT_PATH) { + 
attr->alt_port_num = cmd->base.alt_port_num; + attr->alt_timeout = cmd->base.alt_timeout; + attr->alt_pkey_index = cmd->base.alt_pkey_index; + } + if (cmd->base.attr_mask & IB_QP_RATE_LIMIT) + attr->rate_limit = cmd->rate_limit; if (cmd->base.attr_mask & IB_QP_AV) copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr, From 0dbfaa9f2813787679e296eb5476e40938ab48c8 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 20 Sep 2018 12:58:46 -0700 Subject: [PATCH 145/499] IB/hfi1: Fix SL array bounds check The SL specified by a user needs to be a valid SL. Add a range check to the user specified SL value which protects from running off the end of the SL to SC table. CC: stable@vger.kernel.org Fixes: 7724105686e7 ("IB/hfi1: add driver files") Signed-off-by: Ira Weiny Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 13374c727b14..a7c586a5589d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1582,6 +1582,7 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) struct hfi1_pportdata *ppd; struct hfi1_devdata *dd; u8 sc5; + u8 sl; if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) @@ -1590,8 +1591,13 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) /* test the mapping for validity */ ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); - sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; dd = dd_from_ppd(ppd); + + sl = rdma_ah_get_sl(ah_attr); + if (sl >= ARRAY_SIZE(ibp->sl_to_sc)) + return -EINVAL; + + sc5 = ibp->sl_to_sc[sl]; if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) return -EINVAL; return 0; From 94694d18cf27a6faad91487a38ce516c2b16e7d9 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 20 Sep 2018 12:58:56 -0700 Subject: [PATCH 146/499] IB/hfi1: Invalid user input can result in crash If the number of packets in a user sdma request does not match the actual iovectors being sent, sdma_cleanup can be called on an uninitialized request structure, resulting in a crash similar to this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] __sdma_txclean+0x57/0x1e0 [hfi1] PGD 8000001044f61067 PUD 1052706067 PMD 0 Oops: 0000 [#1] SMP CPU: 30 PID: 69912 Comm: upsm Kdump: loaded Tainted: G OE ------------ 3.10.0-862.el7.x86_64 #1 Hardware name: Intel Corporation S2600KPR/S2600KPR, BIOS SE5C610.86B.01.01.0019.101220160604 10/12/2016 task: ffff8b331c890000 ti: ffff8b2ed1f98000 task.ti: ffff8b2ed1f98000 RIP: 0010:[] [] __sdma_txclean+0x57/0x1e0 [hfi1] RSP: 0018:ffff8b2ed1f9bab0 EFLAGS: 00010286 RAX: 0000000000008b2b RBX: ffff8b2adf6e0000 RCX: 0000000000000000 RDX: 00000000000000a0 RSI: ffff8b2e9eedc540 RDI: ffff8b2adf6e0000 RBP: ffff8b2ed1f9bad8 R08: 0000000000000000 R09: ffffffffc0b04a06 R10: ffff8b331c890190 R11: ffffe6ed00bf1840 R12: ffff8b3315480000 R13: ffff8b33154800f0 R14: 00000000fffffff2 R15: ffff8b2e9eedc540 FS: 00007f035ac47740(0000) GS:ffff8b331e100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000c03fe6000 CR4: 00000000001607e0 Call Trace: [] user_sdma_send_pkts+0xdcd/0x1990 [hfi1] [] ? gup_pud_range+0x140/0x290 [] ? 
hfi1_mmu_rb_insert+0x155/0x1b0 [hfi1] [] hfi1_user_sdma_process_request+0xc5b/0x11b0 [hfi1] [] hfi1_aio_write+0xba/0x110 [hfi1] [] do_sync_readv_writev+0x7b/0xd0 [] do_readv_writev+0xce/0x260 [] ? tty_ldisc_deref+0x19/0x20 [] ? n_tty_ioctl+0xe0/0xe0 [] vfs_writev+0x35/0x60 [] SyS_writev+0x7f/0x110 [] system_call_fastpath+0x1c/0x21 Code: 06 49 c7 47 18 00 00 00 00 0f 87 89 01 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 66 2e 0f 1f 84 00 00 00 00 00 48 8b 4e 10 48 89 fb <48> 8b 51 08 49 89 d4 83 e2 0c 41 81 e4 00 e0 00 00 48 c1 ea 02 RIP [] __sdma_txclean+0x57/0x1e0 [hfi1] RSP CR2: 0000000000000008 There are two exit points from user_sdma_send_pkts(). One (free_tx) merely frees the slab entry and one (free_txreq) cleans the sdma_txreq prior to freeing the slab entry. The free_txreq variation can only be called after one of the sdma_init*() variations has been called. In the panic case, the slab entry had been allocated but not inited. Fix the issue by exiting through free_tx thus avoiding sdma_clean(). Cc: # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn Reviewed-by: Lukasz Odzioba Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index a3a7b33196d6..5c88706121c1 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -828,7 +828,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { if (++req->iov_idx == req->data_iovs) { ret = -EFAULT; - goto free_txreq; + goto free_tx; } iovec = &req->iovs[req->iov_idx]; WARN_ON(iovec->offset); From d623500b3c4efd8d4e945ac9003c6b87b469a9ab Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 20 Sep 2018 12:59:05 -0700 Subject: [PATCH 147/499] IB/hfi1: Fix context recovery when PBC has an UnsupportedVL If a packet stream uses an UnsupportedVL (virtual lane), the send engine will not send the packet, and it will not indicate that an error has occurred. This will cause the packet stream to block. HFI has 8 virtual lanes available for packet streams. Each lane can be enabled or disabled using the UnsupportedVL mask. If a lane is disabled, adding a packet to the send context must be disallowed. The current mask for determining unsupported VLs defaults to 0 (allow all). This is incorrect. Only the VLs that are defined should be allowed. Determine which VLs are disabled (mtu == 0), and set the appropriate unsupported bit in the mask. The correct mask will allow the send engine to error on the invalid VL, and error recovery will work correctly. Cc: # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn Reviewed-by: Lukasz Odzioba Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/pio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index c2c1cba5b23b..cd962c9ea6bc 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -86,6 +86,7 @@ void pio_send_control(struct hfi1_devdata *dd, int op) unsigned long flags; int write = 1; /* write sendctrl back */ int flush = 0; /* re-read sendctrl to make sure it is flushed */ + int i; spin_lock_irqsave(&dd->sendctrl_lock, flags); @@ -95,9 +96,13 @@ void pio_send_control(struct hfi1_devdata *dd, int op) reg |= SEND_CTRL_SEND_ENABLE_SMASK; /* Fall through */ case PSC_DATA_VL_ENABLE: + mask = 0; + for (i = 0; i < ARRAY_SIZE(dd->vld); i++) + if (!dd->vld[i].mtu) + mask |= BIT_ULL(i); /* Disallow sending on VLs not enabled */ - mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) << - SEND_CTRL_UNSUPPORTED_VL_SHIFT; + mask = (mask & SEND_CTRL_UNSUPPORTED_VL_MASK) << + SEND_CTRL_UNSUPPORTED_VL_SHIFT; reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask; break; case PSC_GLOBAL_DISABLE: From b4a4957d3d1c328b733fce783b7264996f866ad2 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 20 Sep 2018 12:59:14 -0700 Subject: [PATCH 148/499] IB/hfi1: Fix destroy_qp hang after a link down rvt_destroy_qp() cannot complete until all in process packets have been released from the underlying hardware. If a link down event occurs, an application can hang with a kernel stack similar to: cat /proc//stack quiesce_qp+0x178/0x250 [hfi1] rvt_reset_qp+0x23d/0x400 [rdmavt] rvt_destroy_qp+0x69/0x210 [rdmavt] ib_destroy_qp+0xba/0x1c0 [ib_core] nvme_rdma_destroy_queue_ib+0x46/0x80 [nvme_rdma] nvme_rdma_free_queue+0x3c/0xd0 [nvme_rdma] nvme_rdma_destroy_io_queues+0x88/0xd0 [nvme_rdma] nvme_rdma_error_recovery_work+0x52/0xf0 [nvme_rdma] process_one_work+0x17a/0x440 worker_thread+0x126/0x3c0 kthread+0xcf/0xe0 ret_from_fork+0x58/0x90 0xffffffffffffffff quiesce_qp() waits until all outstanding packets have been freed. This wait should be momentary. During a link down event, the cleanup handling does not ensure that all packets caught by the link down are flushed properly. This is caused by the fact that the freeze path and the link down event is handled the same. This is not correct. The freeze path waits until the HFI is unfrozen and then restarts PIO. A link down is not a freeze event. The link down path cannot restart the PIO until link is restored. If the PIO path is restarted before the link comes up, the application (QP) using the PIO path will hang (until link is restored). Fix by separating the linkdown path from the freeze path and use the link down path for link down events. Close a race condition sc_disable() by acquiring both the progress and release locks. Close a race condition in sc_stop() by moving the setting of the flag bits under the alloc lock. Cc: # 4.9.x+ Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 6 ++++- drivers/infiniband/hw/hfi1/pio.c | 42 +++++++++++++++++++++++++------ drivers/infiniband/hw/hfi1/pio.h | 2 ++ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 2c19bf772451..e1668bcc2d13 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -6733,6 +6733,7 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) struct hfi1_devdata *dd = ppd->dd; struct send_context *sc; int i; + int sc_flags; if (flags & FREEZE_SELF) write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); @@ -6743,11 +6744,13 @@ void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) /* notify all SDMA engines that they are going into a freeze */ sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN)); + sc_flags = SCF_FROZEN | SCF_HALTED | (flags & FREEZE_LINK_DOWN ? + SCF_LINK_DOWN : 0); /* do halt pre-handling on all enabled send contexts */ for (i = 0; i < dd->num_send_contexts; i++) { sc = dd->send_contexts[i].sc; if (sc && (sc->flags & SCF_ENABLED)) - sc_stop(sc, SCF_FROZEN | SCF_HALTED); + sc_stop(sc, sc_flags); } /* Send context are frozen. Notify user space */ @@ -10674,6 +10677,7 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); handle_linkup_change(dd, 1); + pio_kernel_linkup(dd); /* * After link up, a new link width will have been set. diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index cd962c9ea6bc..752057647f09 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -926,20 +926,18 @@ void sc_free(struct send_context *sc) void sc_disable(struct send_context *sc) { u64 reg; - unsigned long flags; struct pio_buf *pbuf; if (!sc) return; /* do all steps, even if already disabled */ - spin_lock_irqsave(&sc->alloc_lock, flags); + spin_lock_irq(&sc->alloc_lock); reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL)); reg &= ~SC(CTRL_CTXT_ENABLE_SMASK); sc->flags &= ~SCF_ENABLED; sc_wait_for_packet_egress(sc, 1); write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg); - spin_unlock_irqrestore(&sc->alloc_lock, flags); /* * Flush any waiters. Once the context is disabled, @@ -949,7 +947,7 @@ void sc_disable(struct send_context *sc) * proceed with the flush. */ udelay(1); - spin_lock_irqsave(&sc->release_lock, flags); + spin_lock(&sc->release_lock); if (sc->sr) { /* this context has a shadow ring */ while (sc->sr_tail != sc->sr_head) { pbuf = &sc->sr[sc->sr_tail].pbuf; @@ -960,7 +958,8 @@ void sc_disable(struct send_context *sc) sc->sr_tail = 0; } } - spin_unlock_irqrestore(&sc->release_lock, flags); + spin_unlock(&sc->release_lock); + spin_unlock_irq(&sc->alloc_lock); } /* return SendEgressCtxtStatus.PacketOccupancy */ @@ -1183,11 +1182,39 @@ void pio_kernel_unfreeze(struct hfi1_devdata *dd) sc = dd->send_contexts[i].sc; if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) continue; + if (sc->flags & SCF_LINK_DOWN) + continue; sc_enable(sc); /* will clear the sc frozen flag */ } } +/** + * pio_kernel_linkup() - Re-enable send contexts after linkup event + * @dd: valid devive data + * + * When the link goes down, the freeze path is taken. 
However, a link down + * event is different from a freeze because if the send context is re-enabled + * whowever is sending data will start sending data again, which will hang + * any QP that is sending data. + * + * The freeze path now looks at the type of event that occurs and takes this + * path for link down event. + */ +void pio_kernel_linkup(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_LINK_DOWN) || sc->type == SC_USER) + continue; + + sc_enable(sc); /* will clear the sc link down flag */ + } +} + /* * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear. * Returns: @@ -1387,11 +1414,10 @@ void sc_stop(struct send_context *sc, int flag) { unsigned long flags; - /* mark the context */ - sc->flags |= flag; - /* stop buffer allocations */ spin_lock_irqsave(&sc->alloc_lock, flags); + /* mark the context */ + sc->flags |= flag; sc->flags &= ~SCF_ENABLED; spin_unlock_irqrestore(&sc->alloc_lock, flags); wake_up(&sc->halt_wait); diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h index 058b08f459ab..aaf372c3e5d6 100644 --- a/drivers/infiniband/hw/hfi1/pio.h +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -139,6 +139,7 @@ struct send_context { #define SCF_IN_FREE 0x02 #define SCF_HALTED 0x04 #define SCF_FROZEN 0x08 +#define SCF_LINK_DOWN 0x10 struct send_context_info { struct send_context *sc; /* allocated working context */ @@ -306,6 +307,7 @@ void set_pio_integrity(struct send_context *sc); void pio_reset_all(struct hfi1_devdata *dd); void pio_freeze(struct hfi1_devdata *dd); void pio_kernel_unfreeze(struct hfi1_devdata *dd); +void pio_kernel_linkup(struct hfi1_devdata *dd); /* global PIO send control operations */ #define PSC_GLOBAL_ENABLE 0 From d87161bea405e3260377026ca8a704a3f68bd67a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 13 Sep 2018 14:28:48 +0300 Subject: [PATCH 149/499] scsi: ufs: Disable blk-mq for now blk-mq does not support runtime pm, so disable blk-mq support for now. Fixes: d5038a13eca7 ("scsi: core: switch to scsi-mq by default") Signed-off-by: Adrian Hunter Acked-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 9d5d2ca7fc4f..c55f38ec391c 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7940,6 +7940,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) err = -ENOMEM; goto out_error; } + + /* + * Do not use blk-mq at this time because blk-mq does not support + * runtime pm. + */ + host->use_blk_mq = false; + hba = shost_priv(host); hba->host = host; hba->dev = dev; From 9e210178267b80c4eeb832fade7e146a18c84915 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 13 Sep 2018 15:41:10 -0700 Subject: [PATCH 150/499] scsi: lpfc: Synchronize access to remoteport via rport The driver currently uses the ndlp to get the local rport which is then used to get the nvme transport remoteport pointer. There can be cases where a stale remoteport pointer is obtained as synchronization isn't done through the different dereferences. Correct by using locks to synchronize the dereferences. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. 
Petersen --- drivers/scsi/lpfc/lpfc_attr.c | 15 ++++++++++----- drivers/scsi/lpfc/lpfc_debugfs.c | 10 +++++----- drivers/scsi/lpfc/lpfc_nvme.c | 11 ++++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 057a60abe664..1a6ed9b0a249 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -360,12 +360,12 @@ lpfc_nvme_info_show(struct device *dev, struct device_attribute *attr, goto buffer_done; list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) { + nrport = NULL; + spin_lock(&vport->phba->hbalock); rport = lpfc_ndlp_get_nrport(ndlp); - if (!rport) - continue; - - /* local short-hand pointer. */ - nrport = rport->remoteport; + if (rport) + nrport = rport->remoteport; + spin_unlock(&vport->phba->hbalock); if (!nrport) continue; @@ -3386,6 +3386,7 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport) struct lpfc_nodelist *ndlp; #if (IS_ENABLED(CONFIG_NVME_FC)) struct lpfc_nvme_rport *rport; + struct nvme_fc_remote_port *remoteport = NULL; #endif shost = lpfc_shost_from_vport(vport); @@ -3396,8 +3397,12 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport) if (ndlp->rport) ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; #if (IS_ENABLED(CONFIG_NVME_FC)) + spin_lock(&vport->phba->hbalock); rport = lpfc_ndlp_get_nrport(ndlp); if (rport) + remoteport = rport->remoteport; + spin_unlock(&vport->phba->hbalock); + if (remoteport) nvme_fc_set_remoteport_devloss(rport->remoteport, vport->cfg_devloss_tmo); #endif diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 9df0c051349f..aec5b10a8c85 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -551,7 +551,7 @@ lpfc_debugfs_nodelist_data(struct lpfc_vport *vport, char *buf, int size) unsigned char *statep; struct nvme_fc_local_port *localport; struct lpfc_nvmet_tgtport *tgtp; - struct nvme_fc_remote_port *nrport; + struct nvme_fc_remote_port *nrport = NULL; struct lpfc_nvme_rport *rport; cnt = (LPFC_NODELIST_SIZE / LPFC_NODELIST_ENTRY_SIZE); @@ -696,11 +696,11 @@ lpfc_debugfs_nodelist_data(struct lpfc_vport *vport, char *buf, int size) len += snprintf(buf + len, size - len, "\tRport List:\n"); list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) { /* local short-hand pointer. 
*/ + spin_lock(&phba->hbalock); rport = lpfc_ndlp_get_nrport(ndlp); - if (!rport) - continue; - - nrport = rport->remoteport; + if (rport) + nrport = rport->remoteport; + spin_unlock(&phba->hbalock); if (!nrport) continue; diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index 028462e5994d..918ae18ef8a8 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -2725,7 +2725,9 @@ lpfc_nvme_register_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) rpinfo.port_name = wwn_to_u64(ndlp->nlp_portname.u.wwn); rpinfo.node_name = wwn_to_u64(ndlp->nlp_nodename.u.wwn); + spin_lock_irq(&vport->phba->hbalock); oldrport = lpfc_ndlp_get_nrport(ndlp); + spin_unlock_irq(&vport->phba->hbalock); if (!oldrport) lpfc_nlp_get(ndlp); @@ -2840,7 +2842,7 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) struct nvme_fc_local_port *localport; struct lpfc_nvme_lport *lport; struct lpfc_nvme_rport *rport; - struct nvme_fc_remote_port *remoteport; + struct nvme_fc_remote_port *remoteport = NULL; localport = vport->localport; @@ -2854,11 +2856,14 @@ lpfc_nvme_unregister_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp) if (!lport) goto input_err; + spin_lock_irq(&vport->phba->hbalock); rport = lpfc_ndlp_get_nrport(ndlp); - if (!rport) + if (rport) + remoteport = rport->remoteport; + spin_unlock_irq(&vport->phba->hbalock); + if (!remoteport) goto input_err; - remoteport = rport->remoteport; lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC, "6033 Unreg nvme remoteport %p, portname x%llx, " "port_id x%06x, portstate x%x port type x%x\n", From 10bc6a6042c900934a097988b5d50e3cf3f91781 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 20 Sep 2018 22:47:09 +0200 Subject: [PATCH 151/499] r8169: fix autoneg issue on resume with RTL8168E It was reported that chip version 33 (RTL8168E) ends up with 10MBit/Half on a 1GBit link after resuming from S3 (with different link partners). For whatever reason the PHY on this chip doesn't properly start a renegotiation when soft-reset. Explicitly requesting a renegotiation fixes this. Fixes: a2965f12fde6 ("r8169: remove rtl8169_set_speed_xmii") Signed-off-by: Heiner Kallweit Reported-by: Neil MacLeod Tested-by: Neil MacLeod Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index bb529ff2ca81..ab30aaeac6d3 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4071,6 +4071,15 @@ static void rtl8169_init_phy(struct net_device *dev, struct rtl8169_private *tp) phy_speed_up(dev->phydev); genphy_soft_reset(dev->phydev); + + /* It was reported that chip version 33 ends up with 10MBit/Half on a + * 1GBit link after resuming from S3. For whatever reason the PHY on + * this chip doesn't properly start a renegotiation when soft-reset. + * Explicitly requesting a renegotiation fixes this. 
+ */ + if (tp->mac_version == RTL_GIGA_MAC_VER_33 && + dev->phydev->autoneg == AUTONEG_ENABLE) + phy_restart_aneg(dev->phydev); } static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr) From 13cc6f48c7434ce46ba6dbc90003a136a263d75a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Horia=20Geant=C4=83?= Date: Fri, 14 Sep 2018 18:34:28 +0300 Subject: [PATCH 152/499] crypto: caam/jr - fix ablkcipher_edesc pointer arithmetic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases the zero-length hw_desc array at the end of ablkcipher_edesc struct requires for 4B of tail padding. Due to tail padding and the way pointers to S/G table and IV are computed: edesc->sec4_sg = (void *)edesc + sizeof(struct ablkcipher_edesc) + desc_bytes; iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes; first 4 bytes of IV are overwritten by S/G table. Update computation of pointer to S/G table to rely on offset of hw_desc member and not on sizeof() operator. Cc: # 4.13+ Fixes: 115957bb3e59 ("crypto: caam - fix IV DMA mapping and updating") Signed-off-by: Horia Geantă Signed-off-by: Herbert Xu --- drivers/crypto/caam/caamalg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c index d67667970f7e..ec40f991e6c6 100644 --- a/drivers/crypto/caam/caamalg.c +++ b/drivers/crypto/caam/caamalg.c @@ -1553,8 +1553,8 @@ static struct ablkcipher_edesc *ablkcipher_edesc_alloc(struct ablkcipher_request edesc->src_nents = src_nents; edesc->dst_nents = dst_nents; edesc->sec4_sg_bytes = sec4_sg_bytes; - edesc->sec4_sg = (void *)edesc + sizeof(struct ablkcipher_edesc) + - desc_bytes; + edesc->sec4_sg = (struct sec4_sg_entry *)((u8 *)edesc->hw_desc + + desc_bytes); edesc->iv_dir = DMA_TO_DEVICE; /* Make sure IV is located in a DMAable area */ @@ -1757,8 +1757,8 @@ static struct ablkcipher_edesc *ablkcipher_giv_edesc_alloc( edesc->src_nents = src_nents; edesc->dst_nents = dst_nents; edesc->sec4_sg_bytes = sec4_sg_bytes; - edesc->sec4_sg = (void *)edesc + sizeof(struct ablkcipher_edesc) + - desc_bytes; + edesc->sec4_sg = (struct sec4_sg_entry *)((u8 *)edesc->hw_desc + + desc_bytes); edesc->iv_dir = DMA_FROM_DEVICE; /* Make sure IV is located in a DMAable area */ From 5b3686c7aaade973b8806dba4ecc99bec8c988f3 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 9 Aug 2018 14:44:29 +0800 Subject: [PATCH 153/499] ieee802154: Use kmemdup instead of duplicating it in ca8210_test_int_driver_write Replace calls to kmalloc followed by a memcpy with a direct call to kmemdup. 
Signed-off-by: YueHaibing Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index 58299fb666ed..e21279dde85c 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -634,10 +634,9 @@ static int ca8210_test_int_driver_write( for (i = 0; i < len; i++) dev_dbg(&priv->spi->dev, "%#03x\n", buf[i]); - fifo_buffer = kmalloc(len, GFP_KERNEL); + fifo_buffer = kmemdup(buf, len, GFP_KERNEL); if (!fifo_buffer) return -ENOMEM; - memcpy(fifo_buffer, buf, len); kfifo_in(&test->up_fifo, &fifo_buffer, 4); wake_up_interruptible(&priv->test.readq); From 69be1984ded00a11b1ed0888c6d8e4f35370372f Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Mon, 16 Jul 2018 11:07:07 +0100 Subject: [PATCH 154/499] drm: mali-dp: Call drm_crtc_vblank_reset on device init Currently, if userspace calls drm_wait_vblank before the crtc is activated, the crtc vblank_enable hook is called, which in the case of the malidp driver triggers some warnings. This happens because on device init we don't inform the drm core about the vblank state by calling drm_crtc_vblank_on/off/reset, which together with drm_vblank_get have some magic that prevents calling drm_vblank_enable when the crtc is off. Signed-off-by: Alexandru Gheorghe Acked-by: Liviu Dudau Signed-off-by: Liviu Dudau --- drivers/gpu/drm/arm/malidp_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/arm/malidp_drv.c b/drivers/gpu/drm/arm/malidp_drv.c index 08b5bb219816..94d6dabec2dc 100644 --- a/drivers/gpu/drm/arm/malidp_drv.c +++ b/drivers/gpu/drm/arm/malidp_drv.c @@ -754,6 +754,7 @@ static int malidp_bind(struct device *dev) drm->irq_enabled = true; ret = drm_vblank_init(drm, drm->mode_config.num_crtc); + drm_crtc_vblank_reset(&malidp->crtc); if (ret < 0) { DRM_ERROR("failed to initialise vblank\n"); goto vblank_fail; From 89578d04b52c872aef6b1257b5f6caf6bcc35abe Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Wed, 22 Aug 2018 16:18:19 +0100 Subject: [PATCH 155/499] drm/malidp: Fix writeback in NV12 When we want to write back to memory in NV12 format we need to program the RGB2YUV coefficients. Currently, we don't program the coefficients and NV12 doesn't work at all. This patch fixes that by programming a sane default (bt709, limited range) as the rgb2yuv coefficients. In the long run we probably need to think of a way for userspace to be able to program that, but for now I think this is better than not working at all or not advertising NV12 as a supported format for memwrite.
Changes since v1: - Write the rgb2yuv coefficients only once, since we don't change them at all, just write them the first time NV12 is programmed, suggested by Brian Starkey, here [1] [1] https://lists.freedesktop.org/archives/dri-devel/2018-August/186819.html Signed-off-by: Alexandru Gheorghe Acked-by: Liviu Dudau Signed-off-by: Liviu Dudau --- drivers/gpu/drm/arm/malidp_hw.c | 25 +++++++++++++++++++++++-- drivers/gpu/drm/arm/malidp_hw.h | 3 ++- drivers/gpu/drm/arm/malidp_mw.c | 25 +++++++++++++++++++++---- drivers/gpu/drm/arm/malidp_regs.h | 2 ++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/arm/malidp_hw.c b/drivers/gpu/drm/arm/malidp_hw.c index c94a4422e0e9..2781e462c1ed 100644 --- a/drivers/gpu/drm/arm/malidp_hw.c +++ b/drivers/gpu/drm/arm/malidp_hw.c @@ -384,7 +384,8 @@ static long malidp500_se_calc_mclk(struct malidp_hw_device *hwdev, static int malidp500_enable_memwrite(struct malidp_hw_device *hwdev, dma_addr_t *addrs, s32 *pitches, - int num_planes, u16 w, u16 h, u32 fmt_id) + int num_planes, u16 w, u16 h, u32 fmt_id, + const s16 *rgb2yuv_coeffs) { u32 base = MALIDP500_SE_MEMWRITE_BASE; u32 de_base = malidp_get_block_base(hwdev, MALIDP_DE_BLOCK); @@ -416,6 +417,16 @@ static int malidp500_enable_memwrite(struct malidp_hw_device *hwdev, malidp_hw_write(hwdev, MALIDP_DE_H_ACTIVE(w) | MALIDP_DE_V_ACTIVE(h), MALIDP500_SE_MEMWRITE_OUT_SIZE); + + if (rgb2yuv_coeffs) { + int i; + + for (i = 0; i < MALIDP_COLORADJ_NUM_COEFFS; i++) { + malidp_hw_write(hwdev, rgb2yuv_coeffs[i], + MALIDP500_SE_RGB_YUV_COEFFS + i * 4); + } + } + malidp_hw_setbits(hwdev, MALIDP_SE_MEMWRITE_EN, MALIDP500_SE_CONTROL); return 0; @@ -658,7 +669,8 @@ static long malidp550_se_calc_mclk(struct malidp_hw_device *hwdev, static int malidp550_enable_memwrite(struct malidp_hw_device *hwdev, dma_addr_t *addrs, s32 *pitches, - int num_planes, u16 w, u16 h, u32 fmt_id) + int num_planes, u16 w, u16 h, u32 fmt_id, + const s16 *rgb2yuv_coeffs) { u32 base = MALIDP550_SE_MEMWRITE_BASE; u32 de_base = malidp_get_block_base(hwdev, MALIDP_DE_BLOCK); @@ -689,6 +701,15 @@ static int malidp550_enable_memwrite(struct malidp_hw_device *hwdev, malidp_hw_setbits(hwdev, MALIDP550_SE_MEMWRITE_ONESHOT | MALIDP_SE_MEMWRITE_EN, MALIDP550_SE_CONTROL); + if (rgb2yuv_coeffs) { + int i; + + for (i = 0; i < MALIDP_COLORADJ_NUM_COEFFS; i++) { + malidp_hw_write(hwdev, rgb2yuv_coeffs[i], + MALIDP550_SE_RGB_YUV_COEFFS + i * 4); + } + } + return 0; } diff --git a/drivers/gpu/drm/arm/malidp_hw.h b/drivers/gpu/drm/arm/malidp_hw.h index ad2e96915d44..9fc94c08190f 100644 --- a/drivers/gpu/drm/arm/malidp_hw.h +++ b/drivers/gpu/drm/arm/malidp_hw.h @@ -191,7 +191,8 @@ struct malidp_hw { * @param fmt_id - internal format ID of output buffer */ int (*enable_memwrite)(struct malidp_hw_device *hwdev, dma_addr_t *addrs, - s32 *pitches, int num_planes, u16 w, u16 h, u32 fmt_id); + s32 *pitches, int num_planes, u16 w, u16 h, u32 fmt_id, + const s16 *rgb2yuv_coeffs); /* * Disable the writing to memory of the next frame's content. 
diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c index ba6ae66387c9..91472e5e0c8b 100644 --- a/drivers/gpu/drm/arm/malidp_mw.c +++ b/drivers/gpu/drm/arm/malidp_mw.c @@ -26,6 +26,8 @@ struct malidp_mw_connector_state { s32 pitches[2]; u8 format; u8 n_planes; + bool rgb2yuv_initialized; + const s16 *rgb2yuv_coeffs; }; static int malidp_mw_connector_get_modes(struct drm_connector *connector) @@ -84,7 +86,7 @@ static void malidp_mw_connector_destroy(struct drm_connector *connector) static struct drm_connector_state * malidp_mw_connector_duplicate_state(struct drm_connector *connector) { - struct malidp_mw_connector_state *mw_state; + struct malidp_mw_connector_state *mw_state, *mw_current_state; if (WARN_ON(!connector->state)) return NULL; @@ -93,7 +95,10 @@ malidp_mw_connector_duplicate_state(struct drm_connector *connector) if (!mw_state) return NULL; - /* No need to preserve any of our driver-local data */ + mw_current_state = to_mw_state(connector->state); + mw_state->rgb2yuv_coeffs = mw_current_state->rgb2yuv_coeffs; + mw_state->rgb2yuv_initialized = mw_current_state->rgb2yuv_initialized; + __drm_atomic_helper_connector_duplicate_state(connector, &mw_state->base); return &mw_state->base; @@ -108,6 +113,13 @@ static const struct drm_connector_funcs malidp_mw_connector_funcs = { .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; +static const s16 rgb2yuv_coeffs_bt709_limited[MALIDP_COLORADJ_NUM_COEFFS] = { + 47, 157, 16, + -26, -87, 112, + 112, -102, -10, + 16, 128, 128 +}; + static int malidp_mw_encoder_atomic_check(struct drm_encoder *encoder, struct drm_crtc_state *crtc_state, @@ -157,6 +169,9 @@ malidp_mw_encoder_atomic_check(struct drm_encoder *encoder, } mw_state->n_planes = n_planes; + if (fb->format->is_yuv) + mw_state->rgb2yuv_coeffs = rgb2yuv_coeffs_bt709_limited; + return 0; } @@ -239,10 +254,12 @@ void malidp_mw_atomic_commit(struct drm_device *drm, drm_writeback_queue_job(mw_conn, conn_state->writeback_job); conn_state->writeback_job = NULL; - hwdev->hw->enable_memwrite(hwdev, mw_state->addrs, mw_state->pitches, mw_state->n_planes, - fb->width, fb->height, mw_state->format); + fb->width, fb->height, mw_state->format, + !mw_state->rgb2yuv_initialized ? 
+ mw_state->rgb2yuv_coeffs : NULL); + mw_state->rgb2yuv_initialized = !!mw_state->rgb2yuv_coeffs; } else { DRM_DEV_DEBUG_DRIVER(drm->dev, "Disable memwrite\n"); hwdev->hw->disable_memwrite(hwdev); diff --git a/drivers/gpu/drm/arm/malidp_regs.h b/drivers/gpu/drm/arm/malidp_regs.h index 3579d36b2a71..6ffe849774f2 100644 --- a/drivers/gpu/drm/arm/malidp_regs.h +++ b/drivers/gpu/drm/arm/malidp_regs.h @@ -205,6 +205,7 @@ #define MALIDP500_SE_BASE 0x00c00 #define MALIDP500_SE_CONTROL 0x00c0c #define MALIDP500_SE_MEMWRITE_OUT_SIZE 0x00c2c +#define MALIDP500_SE_RGB_YUV_COEFFS 0x00C74 #define MALIDP500_SE_MEMWRITE_BASE 0x00e00 #define MALIDP500_DC_IRQ_BASE 0x00f00 #define MALIDP500_CONFIG_VALID 0x00f00 @@ -238,6 +239,7 @@ #define MALIDP550_SE_CONTROL 0x08010 #define MALIDP550_SE_MEMWRITE_ONESHOT (1 << 7) #define MALIDP550_SE_MEMWRITE_OUT_SIZE 0x08030 +#define MALIDP550_SE_RGB_YUV_COEFFS 0x08078 #define MALIDP550_SE_MEMWRITE_BASE 0x08100 #define MALIDP550_DC_BASE 0x0c000 #define MALIDP550_DC_CONTROL 0x0c010 From 98e616fe7c94f9c787092b6364405d99bdf42153 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 8 Sep 2018 21:44:08 +0800 Subject: [PATCH 156/499] ieee802154: remove unecessary condition check before debugfs_remove_recursive debugfs_remove_recursive has taken IS_ERR_OR_NULL into account. So just remove the condition check before debugfs_remove_recursive. Signed-off-by: zhong jiang Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/adf7242.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/adf7242.c b/drivers/net/ieee802154/adf7242.c index 23a52b9293f3..cd1d8faccca5 100644 --- a/drivers/net/ieee802154/adf7242.c +++ b/drivers/net/ieee802154/adf7242.c @@ -1308,8 +1308,7 @@ static int adf7242_remove(struct spi_device *spi) { struct adf7242_local *lp = spi_get_drvdata(spi); - if (!IS_ERR_OR_NULL(lp->debugfs_root)) - debugfs_remove_recursive(lp->debugfs_root); + debugfs_remove_recursive(lp->debugfs_root); cancel_delayed_work_sync(&lp->work); destroy_workqueue(lp->wqueue); From 652ef42c134da1bbb03bd4c9b4291dfaf8d7febb Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 20 Sep 2018 12:08:54 +0200 Subject: [PATCH 157/499] net: mscc: fix the frame extraction into the skb When extracting frames from the Ocelot switch, the frame check sequence (FCS) is present at the end of the data extracted. The FCS was put into the sk buffer which introduced some issues (as length related ones), as the FCS shouldn't be part of an Rx sk buffer. This patch fixes the Ocelot switch extraction behaviour by discarding the FCS. Fixes: a556c76adc05 ("net: mscc: Add initial Ocelot switch support") Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mscc/ocelot_board.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index 26bb3b18f3be..3cdf63e35b53 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -91,7 +91,7 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) struct sk_buff *skb; struct net_device *dev; u32 *buf; - int sz, len; + int sz, len, buf_len; u32 ifh[4]; u32 val; struct frame_info info; @@ -116,14 +116,20 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) err = -ENOMEM; break; } - buf = (u32 *)skb_put(skb, info.len); + buf_len = info.len - ETH_FCS_LEN; + buf = (u32 *)skb_put(skb, buf_len); len = 0; do { sz = ocelot_rx_frame_word(ocelot, grp, false, &val); *buf++ = val; len += sz; - } while ((sz == 4) && (len < info.len)); + } while (len < buf_len); + + /* Read the FCS and discard it */ + sz = ocelot_rx_frame_word(ocelot, grp, false, &val); + /* Update the statistics if part of the FCS was read before */ + len -= ETH_FCS_LEN - sz; if (sz < 0) { err = sz; From 1816494330a83f2a064499d8ed2797045641f92c Mon Sep 17 00:00:00 2001 From: Vincent Pelletier Date: Sun, 9 Sep 2018 04:09:26 +0000 Subject: [PATCH 158/499] scsi: target: iscsi: Use hex2bin instead of a re-implementation This change has the following effects, in order of descreasing importance: 1) Prevent a stack buffer overflow 2) Do not append an unnecessary NULL to an anyway binary buffer, which is writing one byte past client_digest when caller is: chap_string_to_hex(client_digest, chap_r, strlen(chap_r)); The latter was found by KASAN (see below) when input value hes expected size (32 hex chars), and further analysis revealed a stack buffer overflow can happen when network-received value is longer, allowing an unauthenticated remote attacker to smash up to 17 bytes after destination buffer (16 bytes attacker-controlled and one null). As switching to hex2bin requires specifying destination buffer length, and does not internally append any null, it solves both issues. This addresses CVE-2018-14633. Beyond this: - Validate received value length and check hex2bin accepted the input, to log this rejection reason instead of just failing authentication. - Only log received CHAP_R and CHAP_C values once they passed sanity checks. ================================================================== BUG: KASAN: stack-out-of-bounds in chap_string_to_hex+0x32/0x60 [iscsi_target_mod] Write of size 1 at addr ffff8801090ef7c8 by task kworker/0:0/1021 CPU: 0 PID: 1021 Comm: kworker/0:0 Tainted: G O 4.17.8kasan.sess.connops+ #2 Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 05/19/2014 Workqueue: events iscsi_target_do_login_rx [iscsi_target_mod] Call Trace: dump_stack+0x71/0xac print_address_description+0x65/0x22e ? chap_string_to_hex+0x32/0x60 [iscsi_target_mod] kasan_report.cold.6+0x241/0x2fd chap_string_to_hex+0x32/0x60 [iscsi_target_mod] chap_server_compute_md5.isra.2+0x2cb/0x860 [iscsi_target_mod] ? chap_binaryhex_to_asciihex.constprop.5+0x50/0x50 [iscsi_target_mod] ? ftrace_caller_op_ptr+0xe/0xe ? __orc_find+0x6f/0xc0 ? unwind_next_frame+0x231/0x850 ? kthread+0x1a0/0x1c0 ? ret_from_fork+0x35/0x40 ? ret_from_fork+0x35/0x40 ? iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? deref_stack_reg+0xd0/0xd0 ? iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? is_module_text_address+0xa/0x11 ? 
kernel_text_address+0x4c/0x110 ? __save_stack_trace+0x82/0x100 ? ret_from_fork+0x35/0x40 ? save_stack+0x8c/0xb0 ? 0xffffffffc1660000 ? iscsi_target_do_login+0x155/0x8d0 [iscsi_target_mod] ? iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? process_one_work+0x35c/0x640 ? worker_thread+0x66/0x5d0 ? kthread+0x1a0/0x1c0 ? ret_from_fork+0x35/0x40 ? iscsi_update_param_value+0x80/0x80 [iscsi_target_mod] ? iscsit_release_cmd+0x170/0x170 [iscsi_target_mod] chap_main_loop+0x172/0x570 [iscsi_target_mod] ? chap_server_compute_md5.isra.2+0x860/0x860 [iscsi_target_mod] ? rx_data+0xd6/0x120 [iscsi_target_mod] ? iscsit_print_session_params+0xd0/0xd0 [iscsi_target_mod] ? cyc2ns_read_begin.part.2+0x90/0x90 ? _raw_spin_lock_irqsave+0x25/0x50 ? memcmp+0x45/0x70 iscsi_target_do_login+0x875/0x8d0 [iscsi_target_mod] ? iscsi_target_check_first_request.isra.5+0x1a0/0x1a0 [iscsi_target_mod] ? del_timer+0xe0/0xe0 ? memset+0x1f/0x40 ? flush_sigqueue+0x29/0xd0 iscsi_target_do_login_rx+0x3bc/0x4c0 [iscsi_target_mod] ? iscsi_target_nego_release+0x80/0x80 [iscsi_target_mod] ? iscsi_target_restore_sock_callbacks+0x130/0x130 [iscsi_target_mod] process_one_work+0x35c/0x640 worker_thread+0x66/0x5d0 ? flush_rcu_work+0x40/0x40 kthread+0x1a0/0x1c0 ? kthread_bind+0x30/0x30 ret_from_fork+0x35/0x40 The buggy address belongs to the page: page:ffffea0004243bc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x17fffc000000000() raw: 017fffc000000000 0000000000000000 0000000000000000 00000000ffffffff raw: ffffea0004243c20 ffffea0004243ba0 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801090ef680: f2 f2 f2 f2 f2 f2 f2 01 f2 f2 f2 f2 f2 f2 f2 00 ffff8801090ef700: f2 f2 f2 f2 f2 f2 f2 00 02 f2 f2 f2 f2 f2 f2 00 >ffff8801090ef780: 00 f2 f2 f2 f2 f2 f2 00 00 f2 f2 f2 f2 f2 f2 00 ^ ffff8801090ef800: 00 f2 f2 f2 f2 f2 f2 00 00 00 00 02 f2 f2 f2 f2 ffff8801090ef880: f2 f2 f2 00 00 00 00 00 00 00 00 f2 f2 f2 f2 00 ================================================================== Signed-off-by: Vincent Pelletier Reviewed-by: Mike Christie Signed-off-by: Martin K. 
Petersen --- drivers/target/iscsi/iscsi_target_auth.c | 30 +++++++++++------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c index 9518ffd8b8ba..6c3b4c022894 100644 --- a/drivers/target/iscsi/iscsi_target_auth.c +++ b/drivers/target/iscsi/iscsi_target_auth.c @@ -26,18 +26,6 @@ #include "iscsi_target_nego.h" #include "iscsi_target_auth.h" -static int chap_string_to_hex(unsigned char *dst, unsigned char *src, int len) -{ - int j = DIV_ROUND_UP(len, 2), rc; - - rc = hex2bin(dst, src, j); - if (rc < 0) - pr_debug("CHAP string contains non hex digit symbols\n"); - - dst[j] = '\0'; - return j; -} - static void chap_binaryhex_to_asciihex(char *dst, char *src, int src_len) { int i; @@ -248,9 +236,16 @@ static int chap_server_compute_md5( pr_err("Could not find CHAP_R.\n"); goto out; } + if (strlen(chap_r) != MD5_SIGNATURE_SIZE * 2) { + pr_err("Malformed CHAP_R\n"); + goto out; + } + if (hex2bin(client_digest, chap_r, MD5_SIGNATURE_SIZE) < 0) { + pr_err("Malformed CHAP_R\n"); + goto out; + } pr_debug("[server] Got CHAP_R=%s\n", chap_r); - chap_string_to_hex(client_digest, chap_r, strlen(chap_r)); tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(tfm)) { @@ -349,9 +344,7 @@ static int chap_server_compute_md5( pr_err("Could not find CHAP_C.\n"); goto out; } - pr_debug("[server] Got CHAP_C=%s\n", challenge); - challenge_len = chap_string_to_hex(challenge_binhex, challenge, - strlen(challenge)); + challenge_len = DIV_ROUND_UP(strlen(challenge), 2); if (!challenge_len) { pr_err("Unable to convert incoming challenge\n"); goto out; @@ -360,6 +353,11 @@ static int chap_server_compute_md5( pr_err("CHAP_C exceeds maximum binary size of 1024 bytes\n"); goto out; } + if (hex2bin(challenge_binhex, challenge, challenge_len) < 0) { + pr_err("Malformed CHAP_C\n"); + goto out; + } + pr_debug("[server] Got CHAP_C=%s\n", challenge); /* * During mutual authentication, the CHAP_C generated by the * initiator must not match the original CHAP_C generated by From 8c39e2699f8acb2e29782a834e56306da24937fe Mon Sep 17 00:00:00 2001 From: Vincent Pelletier Date: Sun, 9 Sep 2018 04:09:27 +0000 Subject: [PATCH 159/499] scsi: target: iscsi: Use bin2hex instead of a re-implementation Signed-off-by: Vincent Pelletier Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/target/iscsi/iscsi_target_auth.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c index 6c3b4c022894..4e680d753941 100644 --- a/drivers/target/iscsi/iscsi_target_auth.c +++ b/drivers/target/iscsi/iscsi_target_auth.c @@ -26,15 +26,6 @@ #include "iscsi_target_nego.h" #include "iscsi_target_auth.h" -static void chap_binaryhex_to_asciihex(char *dst, char *src, int src_len) -{ - int i; - - for (i = 0; i < src_len; i++) { - sprintf(&dst[i*2], "%02x", (int) src[i] & 0xff); - } -} - static int chap_gen_challenge( struct iscsi_conn *conn, int caller, @@ -50,7 +41,7 @@ static int chap_gen_challenge( ret = get_random_bytes_wait(chap->challenge, CHAP_CHALLENGE_LENGTH); if (unlikely(ret)) return ret; - chap_binaryhex_to_asciihex(challenge_asciihex, chap->challenge, + bin2hex(challenge_asciihex, chap->challenge, CHAP_CHALLENGE_LENGTH); /* * Set CHAP_C, and copy the generated challenge into c_str. 
@@ -289,7 +280,7 @@ static int chap_server_compute_md5( goto out; } - chap_binaryhex_to_asciihex(response, server_digest, MD5_SIGNATURE_SIZE); + bin2hex(response, server_digest, MD5_SIGNATURE_SIZE); pr_debug("[server] MD5 Server Digest: %s\n", response); if (memcmp(server_digest, client_digest, MD5_SIGNATURE_SIZE) != 0) { @@ -411,7 +402,7 @@ static int chap_server_compute_md5( /* * Convert response from binary hex to ascii hext. */ - chap_binaryhex_to_asciihex(response, digest, MD5_SIGNATURE_SIZE); + bin2hex(response, digest, MD5_SIGNATURE_SIZE); *nr_out_len += sprintf(nr_out_ptr + *nr_out_len, "CHAP_R=0x%s", response); *nr_out_len += 1; From 318ddb34b2052f838aa243d07173e2badf3e630e Mon Sep 17 00:00:00 2001 From: Wen Xiong Date: Thu, 20 Sep 2018 19:32:12 -0500 Subject: [PATCH 160/499] scsi: ipr: System hung while dlpar adding primary ipr adapter back While dlpar adding the primary ipr adapter back, the driver goes through adapter initialization and then schedules ipr_worker_thread to start the disk scan by dropping the host lock and calling scsi_add_device. The driver then gets the adapter reset request again, so it does scsi_block_requests; this causes scsi_add_device to hang until we unblock. But we can't run ipr_worker_thread to do the unblock because it is stuck in scsi_add_device. This patch fixes the issue. [mkp: typo and whitespace fixes] Signed-off-by: Wen Xiong Acked-by: Brian King Signed-off-by: Martin K. Petersen --- drivers/scsi/ipr.c | 110 ++++++++++++++++++++++++++------------------- drivers/scsi/ipr.h | 1 + 2 files changed, 64 insertions(+), 47 deletions(-) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index f2ec80b0ffc0..271990bc065b 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -3335,64 +3335,19 @@ static void ipr_release_dump(struct kref *kref) LEAVE; } -/** - * ipr_worker_thread - Worker thread - * @work: ioa config struct - * - * Called at task level from a work thread. This function takes care - * of adding and removing device from the mid-layer as configuration - * changes are detected by the adapter.
- * - * Return value: - * nothing - **/ -static void ipr_worker_thread(struct work_struct *work) +static void ipr_add_remove_thread(struct work_struct *work) { unsigned long lock_flags; struct ipr_resource_entry *res; struct scsi_device *sdev; - struct ipr_dump *dump; struct ipr_ioa_cfg *ioa_cfg = - container_of(work, struct ipr_ioa_cfg, work_q); + container_of(work, struct ipr_ioa_cfg, scsi_add_work_q); u8 bus, target, lun; int did_work; ENTER; spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); - if (ioa_cfg->sdt_state == READ_DUMP) { - dump = ioa_cfg->dump; - if (!dump) { - spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); - return; - } - kref_get(&dump->kref); - spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); - ipr_get_ioa_dump(ioa_cfg, dump); - kref_put(&dump->kref, ipr_release_dump); - - spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); - if (ioa_cfg->sdt_state == DUMP_OBTAINED && !ioa_cfg->dump_timeout) - ipr_initiate_ioa_reset(ioa_cfg, IPR_SHUTDOWN_NONE); - spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); - return; - } - - if (ioa_cfg->scsi_unblock) { - ioa_cfg->scsi_unblock = 0; - ioa_cfg->scsi_blocked = 0; - spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); - scsi_unblock_requests(ioa_cfg->host); - spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); - if (ioa_cfg->scsi_blocked) - scsi_block_requests(ioa_cfg->host); - } - - if (!ioa_cfg->scan_enabled) { - spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); - return; - } - restart: do { did_work = 0; @@ -3439,6 +3394,66 @@ static void ipr_worker_thread(struct work_struct *work) LEAVE; } +/** + * ipr_worker_thread - Worker thread + * @work: ioa config struct + * + * Called at task level from a work thread. This function takes care + * of adding and removing device from the mid-layer as configuration + * changes are detected by the adapter. 
+ * + * Return value: + * nothing + **/ +static void ipr_worker_thread(struct work_struct *work) +{ + unsigned long lock_flags; + struct ipr_dump *dump; + struct ipr_ioa_cfg *ioa_cfg = + container_of(work, struct ipr_ioa_cfg, work_q); + + ENTER; + spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); + + if (ioa_cfg->sdt_state == READ_DUMP) { + dump = ioa_cfg->dump; + if (!dump) { + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + return; + } + kref_get(&dump->kref); + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + ipr_get_ioa_dump(ioa_cfg, dump); + kref_put(&dump->kref, ipr_release_dump); + + spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); + if (ioa_cfg->sdt_state == DUMP_OBTAINED && !ioa_cfg->dump_timeout) + ipr_initiate_ioa_reset(ioa_cfg, IPR_SHUTDOWN_NONE); + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + return; + } + + if (ioa_cfg->scsi_unblock) { + ioa_cfg->scsi_unblock = 0; + ioa_cfg->scsi_blocked = 0; + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + scsi_unblock_requests(ioa_cfg->host); + spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); + if (ioa_cfg->scsi_blocked) + scsi_block_requests(ioa_cfg->host); + } + + if (!ioa_cfg->scan_enabled) { + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + return; + } + + schedule_work(&ioa_cfg->scsi_add_work_q); + + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + LEAVE; +} + #ifdef CONFIG_SCSI_IPR_TRACE /** * ipr_read_trace - Dump the adapter trace @@ -9933,6 +9948,7 @@ static void ipr_init_ioa_cfg(struct ipr_ioa_cfg *ioa_cfg, INIT_LIST_HEAD(&ioa_cfg->free_res_q); INIT_LIST_HEAD(&ioa_cfg->used_res_q); INIT_WORK(&ioa_cfg->work_q, ipr_worker_thread); + INIT_WORK(&ioa_cfg->scsi_add_work_q, ipr_add_remove_thread); init_waitqueue_head(&ioa_cfg->reset_wait_q); init_waitqueue_head(&ioa_cfg->msi_wait_q); init_waitqueue_head(&ioa_cfg->eeh_wait_q); diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h index 68afbbde54d3..f6baa2351313 100644 --- a/drivers/scsi/ipr.h +++ b/drivers/scsi/ipr.h @@ -1575,6 +1575,7 @@ struct ipr_ioa_cfg { u8 saved_mode_page_len; struct work_struct work_q; + struct work_struct scsi_add_work_q; struct workqueue_struct *reset_work_q; wait_queue_head_t reset_wait_q; From f1f1fadacaf08b7cf11714c0c29f8fa4d4ef68a9 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 21 Sep 2018 09:01:01 +0200 Subject: [PATCH 161/499] scsi: sd: don't crash the host on invalid commands When sd_init_command() gets a command with an unknown req_op() it crashes the system via BUG(). This makes debugging the actual reason for the broken request cmd_flags pretty hard as the system is down before it's able to write out debugging data on the serial console or the trace buffer. Change the BUG() to a WARN_ON() and return BLKPREP_KILL to fail gracefully and return an I/O error to the producer of the request. Signed-off-by: Johannes Thumshirn Cc: Hannes Reinecke Cc: Bart Van Assche Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Martin K.
Petersen --- drivers/scsi/sd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5e4f10d28065..4a57ffecc7e6 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1276,7 +1276,8 @@ static int sd_init_command(struct scsi_cmnd *cmd) case REQ_OP_ZONE_RESET: return sd_zbc_setup_reset_cmnd(cmd); default: - BUG(); + WARN_ON_ONCE(1); + return BLKPREP_KILL; } } From 5607fff303636d48b88414c6be353d9fed700af2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:44 -0700 Subject: [PATCH 162/499] bpf: sockmap only allow ESTABLISHED sock state After this patch we only allow socks that are in ESTABLISHED state or are being added via a sock_ops event that is transitioning into an ESTABLISHED state. By allowing sock_ops events we allow users to manage sockmaps directly from sock ops programs. The two supported sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB. Similar to TLS ULP this ensures sk_user_data is correct. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..1f97b559892a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2453,6 +2457,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2543,10 +2557,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. 
+ */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2565,6 +2591,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } From b05545e15e1ff1d6a6a8593971275f9cc3e6b92b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:49 -0700 Subject: [PATCH 163/499] bpf: sockmap, fix transition through disconnect without close It is possible (via shutdown()) for TCP socks to go trough TCP_CLOSE state via tcp_disconnect() without actually calling tcp_close which would then call our bpf_tcp_close() callback. Because of this a user could disconnect a socket then put it in a LISTEN state which would break our assumptions about sockets always being ESTABLISHED state. To resolve this rely on the unhash hook, which is called in the disconnect case, to remove the sock from the sockmap. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 60 ++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1f97b559892a..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. 
However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); From 5028027844cfc6168e39650abecd817ba64c9d98 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:54 -0700 Subject: [PATCH 164/499] bpf: test_maps, only support ESTABLISHED socks Ensure that sockets added to a sock{map|hash} that is not in the ESTABLISHED state is rejected. Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_maps.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 6f54f84144a0..9b552c0fc47d 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data) /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (err) { + if (i < 2 && !err) { + printf("Allowed update sockmap '%i:%i' not in ESTABLISHED\n", + i, sfd[i]); + goto out_sockmap; + } else if (i >= 2 && err) { printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; @@ -741,7 +745,7 @@ static void test_sockmap(int tasks, void *data) } /* Test map update elem afterwards fd lives in fd and map_fd */ - for (i = 0; i < 6; i++) { + for (i = 2; i < 6; i++) { err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY); if (err) { printf("Failed map_fd_rx update sockmap %i '%i:%i'\n", @@ -845,7 +849,7 @@ static void test_sockmap(int tasks, void *data) } /* Delete the elems without programs */ - for (i = 0; i < 6; i++) { + for (i = 2; i < 6; i++) { err = bpf_map_delete_elem(fd, &i); if (err) { printf("Failed delete sockmap %i '%i:%i'\n", From f88b4c01b97e09535505cf3c327fdbce55c27f00 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Thu, 20 Sep 2018 14:29:45 -0600 Subject: [PATCH 165/499] netlabel: check for IPV4MASK in addrinfo_get netlbl_unlabel_addrinfo_get() assumes that if it finds the NLBL_UNLABEL_A_IPV4ADDR attribute, it must also have the NLBL_UNLABEL_A_IPV4MASK attribute as well. However, this is not necessarily the case as the current checks in netlbl_unlabel_staticadd() and friends are not sufficent to enforce this. 
If passed a netlink message with NLBL_UNLABEL_A_IPV4ADDR, NLBL_UNLABEL_A_IPV6ADDR, and NLBL_UNLABEL_A_IPV6MASK attributes, these functions will all call netlbl_unlabel_addrinfo_get() which will then attempt dereference NULL when fetching the non-existent NLBL_UNLABEL_A_IPV4MASK attribute: Unable to handle kernel NULL pointer dereference at virtual address 0 Process unlab (pid: 31762, stack limit = 0xffffff80502d8000) Call trace: netlbl_unlabel_addrinfo_get+0x44/0xd8 netlbl_unlabel_staticremovedef+0x98/0xe0 genl_rcv_msg+0x354/0x388 netlink_rcv_skb+0xac/0x118 genl_rcv+0x34/0x48 netlink_unicast+0x158/0x1f0 netlink_sendmsg+0x32c/0x338 sock_sendmsg+0x44/0x60 ___sys_sendmsg+0x1d0/0x2a8 __sys_sendmsg+0x64/0xb4 SyS_sendmsg+0x34/0x4c el0_svc_naked+0x34/0x38 Code: 51001149 7100113f 540000a0 f9401508 (79400108) ---[ end trace f6438a488e737143 ]--- Kernel panic - not syncing: Fatal exception Signed-off-by: Sean Tranchetti Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index c070dfc0190a..c92894c3e40a 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -781,7 +781,8 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, { u32 addr_len; - if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { + if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && + info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) From 86f9bd1ff61c413a2a251fa736463295e4e24733 Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Fri, 21 Sep 2018 00:45:27 +0000 Subject: [PATCH 166/499] net/ipv6: Display all addresses in output of /proc/net/if_inet6 The backend handling for /proc/net/if_inet6 in addrconf.c doesn't properly handle starting/stopping the iteration. The problem is that at some point during the iteration, an overflow is detected and the process is subsequently stopped. The item being shown via seq_printf() when the overflow occurs is not actually shown, though. When start() is subsequently called to resume iterating, it returns the next item, and thus the item that was being processed when the overflow occurred never gets printed. Alter the meaning of the private data member "offset". Currently, when it is not 0 (which only happens at the very beginning), "offset" represents the next hlist item to be printed. After this change, "offset" always represents the current item. This is also consistent with the private data member "bucket", which represents the current bucket, and also the use of "pos" as defined in seq_file.txt: The pos passed to start() will always be either zero, or the most recent pos used in the previous session. Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d51a8c0b3372..c63ccce6425f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4201,7 +4201,6 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) p++; continue; } - state->offset++; return ifa; } @@ -4225,13 +4224,12 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, return ifa; } + state->offset = 0; while (++state->bucket < IN6_ADDR_HSIZE) { - state->offset = 0; hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; - state->offset++; return ifa; } } From 54be5b8ce33f8d1a05b258070c81ed98f935883d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 21 Sep 2018 02:53:17 +0000 Subject: [PATCH 167/499] PCI: hv: Fix return value check in hv_pci_assign_slots() In case of error, the function pci_create_slot() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: a15f2c08c708 ("PCI: hv: support reporting serial number as slot information") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/pci/controller/pci-hyperv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index ee80e79db21a..9ba4d12c179c 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1484,8 +1484,10 @@ static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); hpdev->pci_slot = pci_create_slot(hbus->pci_bus, slot_nr, name, NULL); - if (!hpdev->pci_slot) + if (IS_ERR(hpdev->pci_slot)) { pr_warn("pci_create slot %s failed\n", name); + hpdev->pci_slot = NULL; + } } } From 72b462798c2a258725dd741d1c6c139446e20d12 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 10:53:47 +0800 Subject: [PATCH 168/499] net: seeq: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- drivers/net/ethernet/seeq/ether3.c | 5 +++-- drivers/net/ethernet/seeq/sgiseeq.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index c5bc124b41a9..d1bb73bf9914 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -77,7 +77,8 @@ static void ether3_setmulticastlist(struct net_device *dev); static int ether3_rx(struct net_device *dev, unsigned int maxcnt); static void ether3_tx(struct net_device *dev); static int ether3_open (struct net_device *dev); -static int ether3_sendpacket (struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t ether3_sendpacket(struct sk_buff *skb, + struct net_device *dev); static irqreturn_t ether3_interrupt (int irq, void *dev_id); static int ether3_close (struct net_device *dev); static void ether3_setmulticastlist (struct net_device *dev); @@ -481,7 +482,7 @@ static void ether3_timeout(struct net_device *dev) /* * Transmit a packet */ -static int +static netdev_tx_t ether3_sendpacket(struct sk_buff *skb, struct net_device *dev) { unsigned long flags; diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c index 573691bc3b71..70cce63a6081 100644 --- a/drivers/net/ethernet/seeq/sgiseeq.c +++ b/drivers/net/ethernet/seeq/sgiseeq.c @@ -578,7 +578,8 @@ static inline int sgiseeq_reset(struct net_device *dev) return 0; } -static int sgiseeq_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t +sgiseeq_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct sgiseeq_private *sp = netdev_priv(dev); struct hpc3_ethregs *hregs = sp->hregs; From f3bf939f3d4552becb115d76e17b3e5957bab4a2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:02:37 +0800 Subject: [PATCH 169/499] net: cirrus: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/cirrus/ep93xx_eth.c | 2 +- drivers/net/ethernet/cirrus/mac89x0.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c index e2a702996db4..13dfdfca49fc 100644 --- a/drivers/net/ethernet/cirrus/ep93xx_eth.c +++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c @@ -332,7 +332,7 @@ static int ep93xx_poll(struct napi_struct *napi, int budget) return rx; } -static int ep93xx_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ep93xx_xmit(struct sk_buff *skb, struct net_device *dev) { struct ep93xx_priv *ep = netdev_priv(dev); struct ep93xx_tdesc *txd; diff --git a/drivers/net/ethernet/cirrus/mac89x0.c b/drivers/net/ethernet/cirrus/mac89x0.c index 3f8fe8fd79cc..6324e80960c3 100644 --- a/drivers/net/ethernet/cirrus/mac89x0.c +++ b/drivers/net/ethernet/cirrus/mac89x0.c @@ -113,7 +113,7 @@ struct net_local { /* Index to functions, as function prototypes. 
*/ static int net_open(struct net_device *dev); -static int net_send_packet(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t net_send_packet(struct sk_buff *skb, struct net_device *dev); static irqreturn_t net_interrupt(int irq, void *dev_id); static void set_multicast_list(struct net_device *dev); static void net_rx(struct net_device *dev); @@ -324,7 +324,7 @@ net_open(struct net_device *dev) return 0; } -static int +static netdev_tx_t net_send_packet(struct sk_buff *skb, struct net_device *dev) { struct net_local *lp = netdev_priv(dev); From 28d304efb88f18aadfe9219f0e4b2b0ef9558a3b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:05:50 +0800 Subject: [PATCH 170/499] net: sgi: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/sgi/ioc3-eth.c | 4 ++-- drivers/net/ethernet/sgi/meth.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 18d533fdf14c..3140999642ba 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -99,7 +99,7 @@ struct ioc3_private { static int ioc3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void ioc3_set_multicast_list(struct net_device *dev); -static int ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev); static void ioc3_timeout(struct net_device *dev); static inline unsigned int ioc3_hash(const unsigned char *addr); static inline void ioc3_stop(struct ioc3_private *ip); @@ -1390,7 +1390,7 @@ static struct pci_driver ioc3_driver = { .remove = ioc3_remove_one, }; -static int ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned long data; struct ioc3_private *ip = netdev_priv(dev); diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c index ea55abd62ec7..703fbbefea44 100644 --- a/drivers/net/ethernet/sgi/meth.c +++ b/drivers/net/ethernet/sgi/meth.c @@ -697,7 +697,7 @@ static void meth_add_to_tx_ring(struct meth_private *priv, struct sk_buff *skb) /* * Transmit a packet (called by the kernel) */ -static int meth_tx(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t meth_tx(struct sk_buff *skb, struct net_device *dev) { struct meth_private *priv = netdev_priv(dev); unsigned long flags; From f0f25516e3b9efff2a34059347d13361d26e314a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:35:11 +0800 Subject: [PATCH 171/499] net: wiznet: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- drivers/net/ethernet/wiznet/w5100.c | 2 +- drivers/net/ethernet/wiznet/w5300.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index 2bdfb39215e9..d8ba512f166a 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -835,7 +835,7 @@ static void w5100_tx_work(struct work_struct *work) w5100_tx_skb(priv->ndev, skb); } -static int w5100_start_tx(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t w5100_start_tx(struct sk_buff *skb, struct net_device *ndev) { struct w5100_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/wiznet/w5300.c b/drivers/net/ethernet/wiznet/w5300.c index 56ae573001e8..80fdbff67d82 100644 --- a/drivers/net/ethernet/wiznet/w5300.c +++ b/drivers/net/ethernet/wiznet/w5300.c @@ -365,7 +365,7 @@ static void w5300_tx_timeout(struct net_device *ndev) netif_wake_queue(ndev); } -static int w5300_start_tx(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t w5300_start_tx(struct sk_buff *skb, struct net_device *ndev) { struct w5300_priv *priv = netdev_priv(ndev); From 648c361a568dcd098a3496c6c66bca4c8473e52b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:44:05 +0800 Subject: [PATCH 172/499] net: i825xx: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/i825xx/ether1.c | 5 +++-- drivers/net/ethernet/i825xx/lib82596.c | 4 ++-- drivers/net/ethernet/i825xx/sun3_82586.c | 6 ++++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index dc983450354b..35f6291a3672 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -64,7 +64,8 @@ static unsigned int net_debug = NET_DEBUG; #define RX_AREA_END 0x0fc00 static int ether1_open(struct net_device *dev); -static int ether1_sendpacket(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t ether1_sendpacket(struct sk_buff *skb, + struct net_device *dev); static irqreturn_t ether1_interrupt(int irq, void *dev_id); static int ether1_close(struct net_device *dev); static void ether1_setmulticastlist(struct net_device *dev); @@ -667,7 +668,7 @@ ether1_timeout(struct net_device *dev) netif_wake_queue(dev); } -static int +static netdev_tx_t ether1_sendpacket (struct sk_buff *skb, struct net_device *dev) { int tmp, tst, nopaddr, txaddr, tbdaddr, dataddr; diff --git a/drivers/net/ethernet/i825xx/lib82596.c b/drivers/net/ethernet/i825xx/lib82596.c index f00a1dc2128c..2f7ae118217f 100644 --- a/drivers/net/ethernet/i825xx/lib82596.c +++ b/drivers/net/ethernet/i825xx/lib82596.c @@ -347,7 +347,7 @@ static const char init_setup[] = 0x7f /* *multi IA */ }; static int i596_open(struct net_device *dev); -static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t i596_start_xmit(struct sk_buff *skb, struct net_device *dev); static irqreturn_t i596_interrupt(int irq, void *dev_id); static int i596_close(struct net_device *dev); static void i596_add_cmd(struct net_device *dev, struct i596_cmd *cmd); @@ -966,7 +966,7 @@ static void i596_tx_timeout (struct 
net_device *dev) } -static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t i596_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct i596_private *lp = netdev_priv(dev); struct tx_cmd *tx_cmd; diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c index 8bb15a8c2a40..1a86184d44c0 100644 --- a/drivers/net/ethernet/i825xx/sun3_82586.c +++ b/drivers/net/ethernet/i825xx/sun3_82586.c @@ -121,7 +121,8 @@ static int sun3_82586_probe1(struct net_device *dev,int ioaddr); static irqreturn_t sun3_82586_interrupt(int irq,void *dev_id); static int sun3_82586_open(struct net_device *dev); static int sun3_82586_close(struct net_device *dev); -static int sun3_82586_send_packet(struct sk_buff *,struct net_device *); +static netdev_tx_t sun3_82586_send_packet(struct sk_buff *, + struct net_device *); static struct net_device_stats *sun3_82586_get_stats(struct net_device *dev); static void set_multicast_list(struct net_device *dev); static void sun3_82586_timeout(struct net_device *dev); @@ -1002,7 +1003,8 @@ static void sun3_82586_timeout(struct net_device *dev) * send frame */ -static int sun3_82586_send_packet(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t +sun3_82586_send_packet(struct sk_buff *skb, struct net_device *dev) { int len,i; #ifndef NO_NOPCOMMANDS From e6ce3822a9f264b3f24c1acdc624131fb014f2f0 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:46:37 +0800 Subject: [PATCH 173/499] net: apple: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. 
Miller --- drivers/net/ethernet/apple/bmac.c | 4 ++-- drivers/net/ethernet/apple/mace.c | 4 ++-- drivers/net/ethernet/apple/macmace.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index 024998d6d8c6..6a8e2567f2bd 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -154,7 +154,7 @@ static irqreturn_t bmac_txdma_intr(int irq, void *dev_id); static irqreturn_t bmac_rxdma_intr(int irq, void *dev_id); static void bmac_set_timeout(struct net_device *dev); static void bmac_tx_timeout(struct timer_list *t); -static int bmac_output(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t bmac_output(struct sk_buff *skb, struct net_device *dev); static void bmac_start(struct net_device *dev); #define DBDMA_SET(x) ( ((x) | (x) << 16) ) @@ -1456,7 +1456,7 @@ bmac_start(struct net_device *dev) spin_unlock_irqrestore(&bp->lock, flags); } -static int +static netdev_tx_t bmac_output(struct sk_buff *skb, struct net_device *dev) { struct bmac_data *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/apple/mace.c b/drivers/net/ethernet/apple/mace.c index 0b5429d76bcf..68b9ee489489 100644 --- a/drivers/net/ethernet/apple/mace.c +++ b/drivers/net/ethernet/apple/mace.c @@ -78,7 +78,7 @@ struct mace_data { static int mace_open(struct net_device *dev); static int mace_close(struct net_device *dev); -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev); static void mace_set_multicast(struct net_device *dev); static void mace_reset(struct net_device *dev); static int mace_set_address(struct net_device *dev, void *addr); @@ -525,7 +525,7 @@ static inline void mace_set_timeout(struct net_device *dev) mp->timeout_active = 1; } -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev) { struct mace_data *mp = netdev_priv(dev); volatile struct dbdma_regs __iomem *td = mp->tx_dma; diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c index 137cbb470af2..376f2c2613e7 100644 --- a/drivers/net/ethernet/apple/macmace.c +++ b/drivers/net/ethernet/apple/macmace.c @@ -89,7 +89,7 @@ struct mace_frame { static int mace_open(struct net_device *dev); static int mace_close(struct net_device *dev); -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev); static void mace_set_multicast(struct net_device *dev); static int mace_set_address(struct net_device *dev, void *addr); static void mace_reset(struct net_device *dev); @@ -444,7 +444,7 @@ static int mace_close(struct net_device *dev) * Transmit a frame */ -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev) { struct mace_data *mp = netdev_priv(dev); unsigned long flags; From 83fe9a966111b51a34f10c35e568e45bff34de48 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Sep 2018 11:07:55 +0300 Subject: [PATCH 174/499] devlink: double free in devlink_resource_fill() Smatch reports that devlink_dpipe_send_and_alloc_skb() frees the skb on error so this is a double free. We fixed a bunch of these bugs in commit 7fe4d6dcbcb4 ("devlink: Remove redundant free on error path") but we accidentally overlooked this one. 
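The underlying ownership rule can be illustrated with a small sketch (hypothetical helper names, not the actual devlink code): once a callee is documented to free its buffer on failure, the caller's error path must return without freeing it again.

/* Sketch only: send_or_free() consumes 'skb' when it fails.
 * do_send() and consume_result() are stand-ins, not real kernel APIs. */
static int send_or_free(struct sk_buff *skb)
{
	int err = do_send(skb);

	if (err)
		kfree_skb(skb);		/* the error path owns and releases the buffer */
	return err;
}

static int caller(struct sk_buff *skb)
{
	int err = send_or_free(skb);

	if (err)
		return err;		/* correct: no second kfree_skb(skb) here */
	return consume_result(skb);	/* success path keeps using the buffer */
}

The fix below applies exactly this rule: the error label that freed the skb after devlink_dpipe_send_and_alloc_skb() had already freed it is dropped, and the helper's error code is returned directly.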
Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/devlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index 65fc366a78a4..8c0ed225e280 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2592,7 +2592,7 @@ static int devlink_resource_fill(struct genl_info *info, if (!nlh) { err = devlink_dpipe_send_and_alloc_skb(&skb, info); if (err) - goto err_skb_send_alloc; + return err; goto send_done; } return genlmsg_reply(skb, info); @@ -2600,7 +2600,6 @@ static int devlink_resource_fill(struct genl_info *info, nla_put_failure: err = -EMSGSIZE; err_resource_put: -err_skb_send_alloc: nlmsg_free(skb); return err; } From 8ac1ee6f4d62e781e3b3fd8b9c42b70371427669 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 21 Sep 2018 02:44:12 -0700 Subject: [PATCH 175/499] net/mlx4: Use cpumask_available for eq->affinity_mask Clang warns that the address of a pointer will always evaluated as true in a boolean context: drivers/net/ethernet/mellanox/mlx4/eq.c:243:11: warning: address of array 'eq->affinity_mask' will always evaluate to 'true' [-Wpointer-bool-conversion] if (!eq->affinity_mask || cpumask_empty(eq->affinity_mask)) ~~~~~^~~~~~~~~~~~~ 1 warning generated. Use cpumask_available, introduced in commit f7e30f01a9e2 ("cpumask: Add helper cpumask_available()"), which does the proper checking and avoids this warning. Link: https://github.com/ClangBuiltLinux/linux/issues/86 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/eq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 1f3372c1802e..2df92dbd38e1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -240,7 +240,8 @@ static void mlx4_set_eq_affinity_hint(struct mlx4_priv *priv, int vec) struct mlx4_dev *dev = &priv->dev; struct mlx4_eq *eq = &priv->eq_table.eq[vec]; - if (!eq->affinity_mask || cpumask_empty(eq->affinity_mask)) + if (!cpumask_available(eq->affinity_mask) || + cpumask_empty(eq->affinity_mask)) return; hint_err = irq_set_affinity_hint(eq->irq, eq->affinity_mask); From 8360ed6745df6de2d8ddff8e2116789258fe5890 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 21 Sep 2018 11:04:51 -0700 Subject: [PATCH 176/499] RDS: IB: Use DEFINE_PER_CPU_SHARED_ALIGNED for rds_ib_stats Clang warns when two declarations' section attributes don't match. 
net/rds/ib_stats.c:40:1: warning: section does not match previous declaration [-Wsection] DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); ^ ./include/linux/percpu-defs.h:142:2: note: expanded from macro 'DEFINE_PER_CPU_SHARED_ALIGNED' DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ^ ./include/linux/percpu-defs.h:93:9: note: expanded from macro 'DEFINE_PER_CPU_SECTION' extern __PCPU_ATTRS(sec) __typeof__(type) name; \ ^ ./include/linux/percpu-defs.h:49:26: note: expanded from macro '__PCPU_ATTRS' __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ ^ net/rds/ib.h:446:1: note: previous attribute is here DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); ^ ./include/linux/percpu-defs.h:111:2: note: expanded from macro 'DECLARE_PER_CPU' DECLARE_PER_CPU_SECTION(type, name, "") ^ ./include/linux/percpu-defs.h:87:9: note: expanded from macro 'DECLARE_PER_CPU_SECTION' extern __PCPU_ATTRS(sec) __typeof__(type) name ^ ./include/linux/percpu-defs.h:49:26: note: expanded from macro '__PCPU_ATTRS' __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ ^ 1 warning generated. The initial definition was added in commit ec16227e1414 ("RDS/IB: Infiniband transport") and the cache aligned definition was added in commit e6babe4cc4ce ("RDS/IB: Stats and sysctls") right after. The definition probably should have been updated in net/rds/ib.h, which is what this patch does. Link: https://github.com/ClangBuiltLinux/linux/issues/114 Signed-off-by: Nathan Chancellor Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index 73427ff439f9..fd483760c910 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -443,7 +443,7 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ -DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) #define rds_ib_stats_add(member, count) \ rds_stats_add_which(rds_ib_stats, member, count) From 41c9b1be335b5afc3b5fb71c5d16f9d5939cd13f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 10 Sep 2018 16:18:29 -0700 Subject: [PATCH 177/499] device-dax: Add missing address_space_operations With address_space_operations missing for device dax, namely the .set_page_dirty, we hit a kernel warning when running destructive ndctl unit test: make TESTS=device-dax check WARNING: CPU: 3 PID: 7380 at fs/buffer.c:581 __set_page_dirty+0xb1/0xc0 Setting address_space_operations to noop_set_page_dirty and noop_invalidatepage for device dax to prevent fallback to __set_page_dirty_buffers() and block_invalidatepage() respectively. 
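Why wiring up these no-op operations is sufficient can be seen from a simplified sketch of the generic dispatch (illustrative only, not the verbatim mm/page-writeback.c code): the VFS falls back to the buffer-head helper only when the mapping supplies no handler of its own.

int set_page_dirty(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (mapping) {
		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;

		/* Fallback used only when no handler is installed; for
		 * device-dax pages this fallback is what hits the warning
		 * in fs/buffer.c. */
		if (!spd)
			spd = __set_page_dirty_buffers;
		return (*spd)(page);
	}
	return !TestSetPageDirty(page);	/* anonymous pages, simplified */
}

With dev_dax_aops installed on the device-dax inode, set_page_dirty() resolves to noop_set_page_dirty() instead of the buffer-head fallback, so the warning is no longer reachable from this path.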
Fixes: 2232c6382a ("device-dax: Enable page_mapping()") Acked-by: Jeff Moyer Reported-by: Vishal Verma Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/dax/device.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index bbe4d72ca105..948806e57cee 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -535,6 +535,11 @@ static unsigned long dax_get_unmapped_area(struct file *filp, return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } +static const struct address_space_operations dev_dax_aops = { + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + static int dax_open(struct inode *inode, struct file *filp) { struct dax_device *dax_dev = inode_dax(inode); @@ -544,6 +549,7 @@ static int dax_open(struct inode *inode, struct file *filp) dev_dbg(&dev_dax->dev, "trace\n"); inode->i_mapping = __dax_inode->i_mapping; inode->i_mapping->host = __dax_inode; + inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); filp->private_data = dev_dax; From 474ff2600889e16280dbc6ada8bfecb216169a70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Sat, 22 Sep 2018 01:34:01 -0700 Subject: [PATCH 178/499] net-ethtool: ETHTOOL_GUFO did not and should not require CAP_NET_ADMIN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So it should not fail with EPERM even though it is no longer implemented... This is a fix for: (userns)$ egrep ^Cap /proc/self/status CapInh: 0000003fffffffff CapPrm: 0000003fffffffff CapEff: 0000003fffffffff CapBnd: 0000003fffffffff CapAmb: 0000003fffffffff (userns)$ tcpdump -i usb_rndis0 tcpdump: WARNING: usb_rndis0: SIOCETHTOOL(ETHTOOL_GUFO) ioctl failed: Operation not permitted Warning: Kernel filter failed: Bad file descriptor tcpdump: can't remove kernel filter: Bad file descriptor With this change it returns EOPNOTSUPP instead of EPERM. See also https://github.com/the-tcpdump-group/libpcap/issues/689 Fixes: 08a00fea6de2 "net: Remove references to NETIF_F_UFO from ethtool." Cc: David S. Miller Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c9993c6c2fd4..234a0ec2e932 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2624,6 +2624,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: + case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: From a4986b08287e0868fad6dd1b6e8ed840b2f1003f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 19 Sep 2018 08:58:02 +0200 Subject: [PATCH 179/499] MAINTAINERS: update the Annapurna Labs maintainer email Free Electrons became Bootlin. Update my email accordingly. Signed-off-by: Antoine Tenart Signed-off-by: Olof Johansson --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a5b256b25905..d8f42ec6d1c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1251,7 +1251,7 @@ N: meson ARM/Annapurna Labs ALPINE ARCHITECTURE M: Tsahee Zidenberg -M: Antoine Tenart +M: Antoine Tenart L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-alpine/ From 16fdf8ba98391650ce4bc4f3f71629d8a413bc21 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Sun, 23 Sep 2018 12:25:15 -0700 Subject: [PATCH 180/499] rds: Fix build regression. Use DECLARE_* not DEFINE_* Fixes: 8360ed6745df ("RDS: IB: Use DEFINE_PER_CPU_SHARED_ALIGNED for rds_ib_stats") Signed-off-by: David S. Miller --- net/rds/ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index fd483760c910..71ff356ee702 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -443,7 +443,7 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) #define rds_ib_stats_add(member, count) \ rds_stats_add_which(rds_ib_stats, member, count) From 4451d3f59f2a6f95e5d205c2d04ea072955d080d Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Wed, 19 Sep 2018 15:13:31 -0700 Subject: [PATCH 181/499] clocksource/drivers/fttmr010: Fix set_next_event handler Currently, the aspeed MATCH1 register is updated to in set_next_event handler, with the assumption that COUNT register value is preserved when the timer is disabled and it continues decrementing after the timer is enabled. But the assumption is wrong: RELOAD register is loaded into COUNT register when the aspeed timer is enabled, which means the next event may be delayed because timer interrupt won't be generated until <0xFFFFFFFF - current_count + cycles>. The problem can be fixed by updating RELOAD register to , and COUNT register will be re-loaded when the timer is enabled and interrupt is generated when COUNT register overflows. The test result on Facebook Backpack-CMM BMC hardware (AST2500) shows the issue is fixed: without the patch, usleep(100) suspends the process for several milliseconds (and sometimes even over 40 milliseconds); after applying the fix, usleep(100) takes averagely 240 microseconds to return under the same workload level. Signed-off-by: Tao Ren Reviewed-by: Linus Walleij Tested-by: Lei YU Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-fttmr010.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/clocksource/timer-fttmr010.c b/drivers/clocksource/timer-fttmr010.c index c020038ebfab..cf93f6419b51 100644 --- a/drivers/clocksource/timer-fttmr010.c +++ b/drivers/clocksource/timer-fttmr010.c @@ -130,13 +130,17 @@ static int fttmr010_timer_set_next_event(unsigned long cycles, cr &= ~fttmr010->t1_enable_val; writel(cr, fttmr010->base + TIMER_CR); - /* Setup the match register forward/backward in time */ - cr = readl(fttmr010->base + TIMER1_COUNT); - if (fttmr010->count_down) - cr -= cycles; - else - cr += cycles; - writel(cr, fttmr010->base + TIMER1_MATCH1); + if (fttmr010->count_down) { + /* + * ASPEED Timer Controller will load TIMER1_LOAD register + * into TIMER1_COUNT register when the timer is re-enabled. 
+ */ + writel(cycles, fttmr010->base + TIMER1_LOAD); + } else { + /* Setup the match register forward in time */ + cr = readl(fttmr010->base + TIMER1_COUNT); + writel(cr + cycles, fttmr010->base + TIMER1_MATCH1); + } /* Start */ cr = readl(fttmr010->base + TIMER_CR); From ac3d9dd034e565df2c034ab2ca71f0a9f69153c1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:38 -0700 Subject: [PATCH 182/499] netpoll: make ndo_poll_controller() optional As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. It seems that all networking drivers that do use NAPI for their TX completions, should not provide a ndo_poll_controller(). NAPI drivers have netpoll support already handled in core networking stack, since netpoll_poll_dev() uses poll_napi(dev) to iterate through registered NAPI contexts for a device. This patch allows netpoll_poll_dev() to process NAPI contexts even for drivers not providing ndo_poll_controller(), allowing for following patches in NAPI drivers. Also we export netpoll_poll_dev() so that it can be called by bonding/team drivers in following patches. Reported-by: Song Liu Signed-off-by: Eric Dumazet Tested-by: Song Liu Signed-off-by: David S. Miller --- include/linux/netpoll.h | 5 +++-- net/core/netpoll.c | 19 +++++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 67662d01130a..3ef82d3a78db 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -49,8 +49,9 @@ struct netpoll_info { }; #ifdef CONFIG_NETPOLL -extern void netpoll_poll_disable(struct net_device *dev); -extern void netpoll_poll_enable(struct net_device *dev); +void netpoll_poll_dev(struct net_device *dev); +void netpoll_poll_disable(struct net_device *dev); +void netpoll_poll_enable(struct net_device *dev); #else static inline void netpoll_poll_disable(struct net_device *dev) { return; } static inline void netpoll_poll_enable(struct net_device *dev) { return; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 57557a6a950c..3219a2932463 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -187,16 +187,16 @@ static void poll_napi(struct net_device *dev) } } -static void netpoll_poll_dev(struct net_device *dev) +void netpoll_poll_dev(struct net_device *dev) { - const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ - if (down_trylock(&ni->dev_lock)) + if (!ni || down_trylock(&ni->dev_lock)) return; if (!netif_running(dev)) { @@ -205,13 +205,8 @@ static void netpoll_poll_dev(struct net_device *dev) } ops = dev->netdev_ops; - if (!ops->ndo_poll_controller) { - up(&ni->dev_lock); - return; - } - - /* Process pending work on NIC */ - ops->ndo_poll_controller(dev); + if (ops->ndo_poll_controller) + ops->ndo_poll_controller(dev); poll_napi(dev); @@ -219,6 +214,7 @@ static void netpoll_poll_dev(struct net_device *dev) zap_completion_queue(); } +EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { @@ -613,8 +609,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) 
strlcpy(np->dev_name, ndev->name, IFNAMSIZ); INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); - if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || - !ndev->netdev_ops->ndo_poll_controller) { + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", np->dev_name); err = -ENOTSUPP; From 93f62ad5e83a13e0c224dfca5ef40f90c09aad51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:39 -0700 Subject: [PATCH 183/499] bonding: use netpoll_poll_dev() helper We want to allow NAPI drivers to no longer provide ndo_poll_controller() method, as it has been proven problematic. team driver must not look at its presence, but instead call netpoll_poll_dev() which factorize the needed actions. Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a764a83f99da..0d87e11e7f1d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -971,16 +971,13 @@ static void bond_poll_controller(struct net_device *bond_dev) struct slave *slave = NULL; struct list_head *iter; struct ad_info ad_info; - struct netpoll_info *ni; - const struct net_device_ops *ops; if (BOND_MODE(bond) == BOND_MODE_8023AD) if (bond_3ad_get_active_agg_info(bond, &ad_info)) return; bond_for_each_slave_rcu(bond, slave, iter) { - ops = slave->dev->netdev_ops; - if (!bond_slave_is_up(slave) || !ops->ndo_poll_controller) + if (!bond_slave_is_up(slave)) continue; if (BOND_MODE(bond) == BOND_MODE_8023AD) { @@ -992,11 +989,7 @@ static void bond_poll_controller(struct net_device *bond_dev) continue; } - ni = rcu_dereference_bh(slave->dev->npinfo); - if (down_trylock(&ni->dev_lock)) - continue; - ops->ndo_poll_controller(slave->dev); - up(&ni->dev_lock); + netpoll_poll_dev(slave->dev); } } From b80e71a986c2ab5677dc6b84923cd7030b690800 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:40 -0700 Subject: [PATCH 184/499] ixgbe: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ixgbe uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Reported-by: Song Liu Signed-off-by: Eric Dumazet Tested-by: Song Liu Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9a23d33a47ed..f27d73a7bf16 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8768,28 +8768,6 @@ static int ixgbe_del_sanmac_netdev(struct net_device *dev) return err; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* - * Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. 
- */ -static void ixgbe_netpoll(struct net_device *netdev) -{ - struct ixgbe_adapter *adapter = netdev_priv(netdev); - int i; - - /* if interface is down do nothing */ - if (test_bit(__IXGBE_DOWN, &adapter->state)) - return; - - /* loop through and schedule all active queues */ - for (i = 0; i < adapter->num_q_vectors; i++) - ixgbe_msix_clean_rings(0, adapter->q_vector[i]); -} - -#endif - static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, struct ixgbe_ring *ring) { @@ -10251,9 +10229,6 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_get_vf_config = ixgbe_ndo_get_vf_config, .ndo_get_stats64 = ixgbe_get_stats64, .ndo_setup_tc = __ixgbe_setup_tc, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ixgbe_netpoll, -#endif #ifdef IXGBE_FCOE .ndo_select_queue = ixgbe_select_queue, .ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get, From 6f5d941ebadeb1150530f205af0eb5cf5bfeb219 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:41 -0700 Subject: [PATCH 185/499] ixgbevf: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ixgbevf uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index d86446d202d5..5a228582423b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4233,24 +4233,6 @@ static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ -static void ixgbevf_netpoll(struct net_device *netdev) -{ - struct ixgbevf_adapter *adapter = netdev_priv(netdev); - int i; - - /* if interface is down do nothing */ - if (test_bit(__IXGBEVF_DOWN, &adapter->state)) - return; - for (i = 0; i < adapter->num_rx_queues; i++) - ixgbevf_msix_clean_rings(0, adapter->q_vector[i]); -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - static int ixgbevf_suspend(struct pci_dev *pdev, pm_message_t state) { struct net_device *netdev = pci_get_drvdata(pdev); @@ -4482,9 +4464,6 @@ static const struct net_device_ops ixgbevf_netdev_ops = { .ndo_tx_timeout = ixgbevf_tx_timeout, .ndo_vlan_rx_add_vid = ixgbevf_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ixgbevf_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ixgbevf_netpoll, -#endif .ndo_features_check = ixgbevf_features_check, .ndo_bpf = ixgbevf_xdp, }; From dda9d57e2d4245c0d8e309ee7399bf2ebfca64ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:42 -0700 Subject: [PATCH 186/499] fm10k: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). 
This capture lasts for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. fm10k uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/fm10k/fm10k.h | 3 --- .../net/ethernet/intel/fm10k/fm10k_netdev.c | 3 --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 22 ------------------- 3 files changed, 28 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index a903a0ba45e1..7d42582ed48d 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -504,9 +504,6 @@ void fm10k_update_stats(struct fm10k_intfc *interface); void fm10k_service_event_schedule(struct fm10k_intfc *interface); void fm10k_macvlan_schedule(struct fm10k_intfc *interface); void fm10k_update_rx_drop_en(struct fm10k_intfc *interface); -#ifdef CONFIG_NET_POLL_CONTROLLER -void fm10k_netpoll(struct net_device *netdev); -#endif /* Netdev */ struct net_device *fm10k_alloc_netdev(const struct fm10k_info *info); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 929f538d28bc..538a8467f434 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1648,9 +1648,6 @@ static const struct net_device_ops fm10k_netdev_ops = { .ndo_udp_tunnel_del = fm10k_udp_tunnel_del, .ndo_dfwd_add_station = fm10k_dfwd_add_station, .ndo_dfwd_del_station = fm10k_dfwd_del_station, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = fm10k_netpoll, -#endif .ndo_features_check = fm10k_features_check, }; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 15071e4adb98..c859ababeed5 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1210,28 +1210,6 @@ static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data) return IRQ_HANDLED; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * fm10k_netpoll - A Polling 'interrupt' handler - * @netdev: network interface device structure - * - * This is used by netconsole to send skbs without having to re-enable - * interrupts. It's not called while the normal interrupt routine is executing. - **/ -void fm10k_netpoll(struct net_device *netdev) -{ - struct fm10k_intfc *interface = netdev_priv(netdev); - int i; - - /* if interface is down do nothing */ - if (test_bit(__FM10K_DOWN, interface->state)) - return; - - for (i = 0; i < interface->num_q_vectors; i++) - fm10k_msix_clean_rings(0, interface->q_vector[i]); -} - -#endif #define FM10K_ERR_MSG(type) case (type): error = #type; break static void fm10k_handle_fault(struct fm10k_intfc *interface, int type, struct fm10k_fault *fault) From 2753166e4be9df4677745a7f7d9d8daa94938a08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:43 -0700 Subject: [PATCH 187/499] ixgb: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. 
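The core-side behaviour that these conversions rely on can be summarised with a simplified sketch (not the verbatim net/core/netpoll.c code): once a driver stops providing ndo_poll_controller(), netpoll_poll_dev() simply walks the NAPI contexts the driver has registered and runs their poll handlers.

static void poll_napi(struct net_device *dev)
{
	struct napi_struct *napi;

	/* Service every NAPI context registered by the driver, so TX
	 * completions and RX are drained without a driver-specific
	 * ndo_poll_controller() implementation. */
	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list)
		poll_one_napi(napi);
}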
ixgb uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. This also removes a problematic use of disable_irq() in a context it is forbidden, as explained in commit af3e0fcf7887 ("8139too: Use disable_irq_nosync() in rtl8139_poll_controller()") Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 25 --------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index d3e72d0f66ef..7722153c4ac2 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -81,11 +81,6 @@ static int ixgb_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid); static void ixgb_restore_vlan(struct ixgb_adapter *adapter); -#ifdef CONFIG_NET_POLL_CONTROLLER -/* for netdump / net console */ -static void ixgb_netpoll(struct net_device *dev); -#endif - static pci_ers_result_t ixgb_io_error_detected (struct pci_dev *pdev, enum pci_channel_state state); static pci_ers_result_t ixgb_io_slot_reset (struct pci_dev *pdev); @@ -348,9 +343,6 @@ static const struct net_device_ops ixgb_netdev_ops = { .ndo_tx_timeout = ixgb_tx_timeout, .ndo_vlan_rx_add_vid = ixgb_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ixgb_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ixgb_netpoll, -#endif .ndo_fix_features = ixgb_fix_features, .ndo_set_features = ixgb_set_features, }; @@ -2195,23 +2187,6 @@ ixgb_restore_vlan(struct ixgb_adapter *adapter) ixgb_vlan_rx_add_vid(adapter->netdev, htons(ETH_P_8021Q), vid); } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* - * Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ - -static void ixgb_netpoll(struct net_device *dev) -{ - struct ixgb_adapter *adapter = netdev_priv(dev); - - disable_irq(adapter->pdev->irq); - ixgb_intr(adapter->pdev->irq, dev); - enable_irq(adapter->pdev->irq); -} -#endif - /** * ixgb_io_error_detected - called when PCI error is detected * @pdev: pointer to pci device with error From 0542997edece2d75e0bf3e173dfddf1b2b3ef756 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:44 -0700 Subject: [PATCH 188/499] igb: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. igb uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 30 ----------------------- 1 file changed, 30 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a32c576c1e65..0796cef96fa3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -205,10 +205,6 @@ static struct notifier_block dca_notifier = { .priority = 0 }; #endif -#ifdef CONFIG_NET_POLL_CONTROLLER -/* for netdump / net console */ -static void igb_netpoll(struct net_device *); -#endif #ifdef CONFIG_PCI_IOV static unsigned int max_vfs; module_param(max_vfs, uint, 0); @@ -2881,9 +2877,6 @@ static const struct net_device_ops igb_netdev_ops = { .ndo_set_vf_spoofchk = igb_ndo_set_vf_spoofchk, .ndo_set_vf_trust = igb_ndo_set_vf_trust, .ndo_get_vf_config = igb_ndo_get_vf_config, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = igb_netpoll, -#endif .ndo_fix_features = igb_fix_features, .ndo_set_features = igb_set_features, .ndo_fdb_add = igb_ndo_fdb_add, @@ -9053,29 +9046,6 @@ static int igb_pci_sriov_configure(struct pci_dev *dev, int num_vfs) return 0; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ -static void igb_netpoll(struct net_device *netdev) -{ - struct igb_adapter *adapter = netdev_priv(netdev); - struct e1000_hw *hw = &adapter->hw; - struct igb_q_vector *q_vector; - int i; - - for (i = 0; i < adapter->num_q_vectors; i++) { - q_vector = adapter->q_vector[i]; - if (adapter->flags & IGB_FLAG_HAS_MSIX) - wr32(E1000_EIMC, q_vector->eims_value); - else - igb_irq_disable(adapter); - napi_schedule(&q_vector->napi); - } -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - /** * igb_io_error_detected - called when PCI error is detected * @pdev: Pointer to PCI device From 158a08a694c4ecf9574fe452db24a6b4943853aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:45 -0700 Subject: [PATCH 189/499] ice: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ice uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_main.c | 27 ----------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f1e80eed2fd6..3f047bb43348 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4806,30 +4806,6 @@ void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) stats->rx_length_errors = vsi_stats->rx_length_errors; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * ice_netpoll - polling "interrupt" handler - * @netdev: network interface device structure - * - * Used by netconsole to send skbs without having to re-enable interrupts. - * This is not called in the normal interrupt path. 
- */ -static void ice_netpoll(struct net_device *netdev) -{ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int i; - - if (test_bit(__ICE_DOWN, vsi->state) || - !test_bit(ICE_FLAG_MSIX_ENA, pf->flags)) - return; - - for (i = 0; i < vsi->num_q_vectors; i++) - ice_msix_clean_rings(0, vsi->q_vectors[i]); -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - /** * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI * @vsi: VSI having NAPI disabled @@ -5497,9 +5473,6 @@ static const struct net_device_ops ice_netdev_ops = { .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ice_change_mtu, .ndo_get_stats64 = ice_get_stats64, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ice_netpoll, -#endif /* CONFIG_NET_POLL_CONTROLLER */ .ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid, .ndo_set_features = ice_set_features, From 1aa28fb98368078bcaf527bf46c0e001db934414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:46 -0700 Subject: [PATCH 190/499] i40evf: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. i40evf uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- .../net/ethernet/intel/i40evf/i40evf_main.c | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 5906c1c1d19d..fef6d892ed4c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -396,29 +396,6 @@ static void i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter) adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * i40evf_netpoll - A Polling 'interrupt' handler - * @netdev: network interface device structure - * - * This is used by netconsole to send skbs without having to re-enable - * interrupts. It's not called while the normal interrupt routine is executing. 
- **/ -static void i40evf_netpoll(struct net_device *netdev) -{ - struct i40evf_adapter *adapter = netdev_priv(netdev); - int q_vectors = adapter->num_msix_vectors - NONQ_VECS; - int i; - - /* if interface is down do nothing */ - if (test_bit(__I40E_VSI_DOWN, adapter->vsi.state)) - return; - - for (i = 0; i < q_vectors; i++) - i40evf_msix_clean_rings(0, &adapter->q_vectors[i]); -} - -#endif /** * i40evf_irq_affinity_notify - Callback for affinity changes * @notify: context as to what irq was changed @@ -3229,9 +3206,6 @@ static const struct net_device_ops i40evf_netdev_ops = { .ndo_features_check = i40evf_features_check, .ndo_fix_features = i40evf_fix_features, .ndo_set_features = i40evf_set_features, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = i40evf_netpoll, -#endif .ndo_setup_tc = i40evf_setup_tc, }; From a24b66c249f71dde11da5ba3b4c397a42ece3fc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:47 -0700 Subject: [PATCH 191/499] mlx4: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. mlx4 uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Tariq Toukan Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx4/en_netdev.c | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 6785661d1a72..fe49384eba48 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1286,20 +1286,6 @@ static void mlx4_en_do_set_rx_mode(struct work_struct *work) mutex_unlock(&mdev->state_lock); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void mlx4_en_netpoll(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_cq *cq; - int i; - - for (i = 0; i < priv->tx_ring_num[TX]; i++) { - cq = priv->tx_cq[TX][i]; - napi_schedule(&cq->napi); - } -} -#endif - static int mlx4_en_set_rss_steer_rules(struct mlx4_en_priv *priv) { u64 reg_id; @@ -2946,9 +2932,6 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_tx_timeout = mlx4_en_tx_timeout, .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx4_en_netpoll, -#endif .ndo_set_features = mlx4_en_set_features, .ndo_fix_features = mlx4_en_fix_features, .ndo_setup_tc = __mlx4_en_setup_tc, @@ -2983,9 +2966,6 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_set_vf_link_state = mlx4_en_set_vf_link_state, .ndo_get_vf_stats = mlx4_en_get_vf_stats, .ndo_get_vf_config = mlx4_en_get_vf_config, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx4_en_netpoll, -#endif .ndo_set_features = mlx4_en_set_features, .ndo_fix_features = mlx4_en_fix_features, .ndo_setup_tc = __mlx4_en_setup_tc, From 9c29bcd189f4ab1644b7125713602532d0aefdb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:48 -0700 Subject: [PATCH 192/499] mlx5: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might 
steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. mlx5 uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5a7939e70190..54118b77dc1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4315,22 +4315,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* Fake "interrupt" called by netpoll (eg netconsole) to send skbs without - * reenabling interrupts. - */ -static void mlx5e_netpoll(struct net_device *dev) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - struct mlx5e_channels *chs = &priv->channels; - - int i; - - for (i = 0; i < chs->num; i++) - napi_schedule(&chs->c[i]->napi); -} -#endif - static const struct net_device_ops mlx5e_netdev_ops = { .ndo_open = mlx5e_open, .ndo_stop = mlx5e_close, @@ -4356,9 +4340,6 @@ static const struct net_device_ops mlx5e_netdev_ops = { #ifdef CONFIG_MLX5_EN_ARFS .ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx5e_netpoll, -#endif #ifdef CONFIG_MLX5_ESWITCH /* SRIOV E-Switch NDOs */ .ndo_set_vf_mac = mlx5e_set_vf_mac, From d8ea6a91ad70ae0881d76446df27d3f58fd478c3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:49 -0700 Subject: [PATCH 193/499] bnx2x: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. bnx2x uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Ariel Elior Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 71362b7f6040..fcc2328bb0d9 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12894,19 +12894,6 @@ static int bnx2x_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void poll_bnx2x(struct net_device *dev) -{ - struct bnx2x *bp = netdev_priv(dev); - int i; - - for_each_eth_queue(bp, i) { - struct bnx2x_fastpath *fp = &bp->fp[i]; - napi_schedule(&bnx2x_fp(bp, fp->index, napi)); - } -} -#endif - static int bnx2x_validate_addr(struct net_device *dev) { struct bnx2x *bp = netdev_priv(dev); @@ -13113,9 +13100,6 @@ static const struct net_device_ops bnx2x_netdev_ops = { .ndo_tx_timeout = bnx2x_tx_timeout, .ndo_vlan_rx_add_vid = bnx2x_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = bnx2x_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = poll_bnx2x, -#endif .ndo_setup_tc = __bnx2x_setup_tc, #ifdef CONFIG_BNX2X_SRIOV .ndo_set_vf_mac = bnx2x_set_vf_mac, From 58e0e22bff638055278ea73e34d0d07a95260790 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:50 -0700 Subject: [PATCH 194/499] bnxt: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. bnxt uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 177587f9c3f1..61957b0bbd8c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7672,21 +7672,6 @@ static void bnxt_tx_timeout(struct net_device *dev) bnxt_queue_sp_work(bp); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void bnxt_poll_controller(struct net_device *dev) -{ - struct bnxt *bp = netdev_priv(dev); - int i; - - /* Only process tx rings/combined rings in netpoll mode. 
*/ - for (i = 0; i < bp->tx_nr_rings; i++) { - struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; - - napi_schedule(&txr->bnapi->napi); - } -} -#endif - static void bnxt_timer(struct timer_list *t) { struct bnxt *bp = from_timer(bp, t, timer); @@ -8519,9 +8504,6 @@ static const struct net_device_ops bnxt_netdev_ops = { .ndo_set_vf_link_state = bnxt_set_vf_link_state, .ndo_set_vf_spoofchk = bnxt_set_vf_spoofchk, .ndo_set_vf_trust = bnxt_set_vf_trust, -#endif -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = bnxt_poll_controller, #endif .ndo_setup_tc = bnxt_setup_tc, #ifdef CONFIG_RFS_ACCEL From 0825ce70318e9c576acbdd5ceb4a8d563263cf8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:51 -0700 Subject: [PATCH 195/499] nfp: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. nfp uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jakub Kicinski Acked-by: Jakub Kicinski Tested-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/nfp_net_common.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 253bdaef1505..8ed38fd5a852 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3146,21 +3146,6 @@ nfp_net_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) return nfp_net_reconfig_mbox(nn, NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_KILL); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void nfp_net_netpoll(struct net_device *netdev) -{ - struct nfp_net *nn = netdev_priv(netdev); - int i; - - /* nfp_net's NAPIs are statically allocated so even if there is a race - * with reconfig path this will simply try to schedule some disabled - * NAPI instances. - */ - for (i = 0; i < nn->dp.num_stack_tx_rings; i++) - napi_schedule_irqoff(&nn->r_vecs[i].napi); -} -#endif - static void nfp_net_stat64(struct net_device *netdev, struct rtnl_link_stats64 *stats) { @@ -3519,9 +3504,6 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_stats64 = nfp_net_stat64, .ndo_vlan_rx_add_vid = nfp_net_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = nfp_net_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = nfp_net_netpoll, -#endif .ndo_set_vf_mac = nfp_app_set_vf_mac, .ndo_set_vf_vlan = nfp_app_set_vf_vlan, .ndo_set_vf_spoofchk = nfp_app_set_vf_spoofchk, From 765cdc209cb89fb81215310a066cd0a3018ffef7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:52 -0700 Subject: [PATCH 196/499] tun: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. tun uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ebd07ad82431..e2648b5a3861 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1153,43 +1153,6 @@ static netdev_features_t tun_net_fix_features(struct net_device *dev, return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void tun_poll_controller(struct net_device *dev) -{ - /* - * Tun only receives frames when: - * 1) the char device endpoint gets data from user space - * 2) the tun socket gets a sendmsg call from user space - * If NAPI is not enabled, since both of those are synchronous - * operations, we are guaranteed never to have pending data when we poll - * for it so there is nothing to do here but return. - * We need this though so netpoll recognizes us as an interface that - * supports polling, which enables bridge devices in virt setups to - * still use netconsole - * If NAPI is enabled, however, we need to schedule polling for all - * queues unless we are using napi_gro_frags(), which we call in - * process context and not in NAPI context. - */ - struct tun_struct *tun = netdev_priv(dev); - - if (tun->flags & IFF_NAPI) { - struct tun_file *tfile; - int i; - - if (tun_napi_frags_enabled(tun)) - return; - - rcu_read_lock(); - for (i = 0; i < tun->numqueues; i++) { - tfile = rcu_dereference(tun->tfiles[i]); - if (tfile->napi_enabled) - napi_schedule(&tfile->napi); - } - rcu_read_unlock(); - } - return; -} -#endif static void tun_set_headroom(struct net_device *dev, int new_hr) { @@ -1283,9 +1246,6 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = tun_poll_controller, -#endif .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, }; @@ -1365,9 +1325,6 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = tun_poll_controller, -#endif .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, From d26ed6b0e5e23190d43ab34bc69cbecdc464a2cf Mon Sep 17 00:00:00 2001 From: Friedemann Gerold Date: Sat, 15 Sep 2018 18:03:39 +0300 Subject: [PATCH 197/499] net: aquantia: memory corruption on jumbo frames This patch fixes skb_shared area, which will be corrupted upon reception of 4K jumbo packets. Originally build_skb usage purpose was to reuse page for skb to eliminate needs of extra fragments. But that logic does not take into account that skb_shared_info should be reserved at the end of skb data area. In case packet data consumes all the page (4K), skb_shinfo location overflows the page. As a consequence, __build_skb zeroed shinfo data above the allocated page, corrupting next page. The issue is rarely seen in real life because jumbo are normally larger than 4K and that causes another code path to trigger. 
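To make the constraint concrete, here is a minimal sketch of the sizing rule involved (illustrative only: the helper name is made up, and the check actually added by this patch is written in terms of the driver's AQ_CFG_RX_FRAME_MAX and AQ_SKB_ALIGN constants):

#include <linux/skbuff.h>

/*
 * build_skb() places struct skb_shared_info at the tail of the buffer it
 * is handed, so reusing the receive page directly is only safe when the
 * frame leaves room for that trailer.
 */
static bool rx_frame_fits_build_skb(unsigned int frame_len,
                                    unsigned int buf_size)
{
        return frame_len +
               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) <= buf_size;
}

With a 4K page and a frame that fills it completely this check fails, which is exactly the case the patch below diverts away from build_skb() and onto the skb_add_rx_frag() path.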
But it 100% reproducible with simple scapy packet, like: sendp(IP(dst="192.168.100.3") / TCP(dport=443) \ / Raw(RandString(size=(4096-40))), iface="enp1s0") Fixes: 018423e90bee ("net: ethernet: aquantia: Add ring support code") Reported-by: Friedemann Gerold Reported-by: Michael Rauch Signed-off-by: Friedemann Gerold Tested-by: Nikita Danilov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_ring.c | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index b5f1f62e8e25..d1e1a0ba8615 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -225,9 +225,10 @@ int aq_ring_rx_clean(struct aq_ring_s *self, } /* for single fragment packets use build_skb() */ - if (buff->is_eop) { + if (buff->is_eop && + buff->len <= AQ_CFG_RX_FRAME_MAX - AQ_SKB_ALIGN) { skb = build_skb(page_address(buff->page), - buff->len + AQ_SKB_ALIGN); + AQ_CFG_RX_FRAME_MAX); if (unlikely(!skb)) { err = -ENOMEM; goto err_exit; @@ -247,18 +248,21 @@ int aq_ring_rx_clean(struct aq_ring_s *self, buff->len - ETH_HLEN, SKB_TRUESIZE(buff->len - ETH_HLEN)); - for (i = 1U, next_ = buff->next, - buff_ = &self->buff_ring[next_]; true; - next_ = buff_->next, - buff_ = &self->buff_ring[next_], ++i) { - skb_add_rx_frag(skb, i, buff_->page, 0, - buff_->len, - SKB_TRUESIZE(buff->len - - ETH_HLEN)); - buff_->is_cleaned = 1; + if (!buff->is_eop) { + for (i = 1U, next_ = buff->next, + buff_ = &self->buff_ring[next_]; + true; next_ = buff_->next, + buff_ = &self->buff_ring[next_], ++i) { + skb_add_rx_frag(skb, i, + buff_->page, 0, + buff_->len, + SKB_TRUESIZE(buff->len - + ETH_HLEN)); + buff_->is_cleaned = 1; - if (buff_->is_eop) - break; + if (buff_->is_eop) + break; + } } } From 63e132528032ce937126aba591a7b37ec593a6bb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Sep 2018 16:34:31 +0300 Subject: [PATCH 198/499] ovl: fix memory leak on unlink of indexed file The memory leak was detected by kmemleak when running xfstests overlay/051,053 Fixes: caf70cb2ba5d ("ovl: cleanup orphan index entries") Cc: # v4.13 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 8cfb62cc8672..ace4fe4c39a9 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -683,7 +683,7 @@ static void ovl_cleanup_index(struct dentry *dentry) struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *index = NULL; struct inode *inode; - struct qstr name; + struct qstr name = { }; int err; err = ovl_get_index_name(lowerdentry, &name); @@ -726,6 +726,7 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; out: + kfree(name.name); dput(index); return; From 898cc19d8af2d6a80a9f2804b0db2e83c5dd8863 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Sep 2018 16:34:32 +0300 Subject: [PATCH 199/499] ovl: fix freeze protection bypass in ovl_write_iter() Tested by re-writing to an open overlayfs file while upper ext4 is frozen: xfs_io -f /ovl/x xfs_io> pwrite 0 4096 fsfreeze -f /ext4 xfs_io> pwrite 0 4096 WARNING: CPU: 0 PID: 1492 at fs/ext4/ext4_jbd2.c:53 \ ext4_journal_check_start+0x48/0x82 After the fix, the second write blocks in ovl_write_iter() and avoids hitting WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE) in 
ext4_journal_check_start(). Fixes: 2a92e07edc5e ("ovl: add ovl_write_iter()") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index aeaefd2a551b..986313da0c88 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -240,8 +240,10 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); + file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); revert_creds(old_cred); /* Update size */ From d9d150ae50675e3c1d68047aafb2e981be685d90 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Sep 2018 16:34:33 +0300 Subject: [PATCH 200/499] ovl: fix freeze protection bypass in ovl_clone_file_range() Tested by doing clone on overlayfs while upper xfs+reflink is frozen: xfs_io -f /ovl/y fsfreeze -f /xfs xfs_io> reflink /ovl/x Before the fix xfs_io enters xfs_reflink_remap_range() and blocks in xfs_trans_alloc(). After the fix, xfs_io blocks outside xfs code in ovl_clone_file_range(). Fixes: 8ede205541ff ("ovl: add reflink/copyfile/dedup support") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 986313da0c88..5d1b4b38f743 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -461,7 +461,7 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, break; case OVL_CLONE: - ret = vfs_clone_file_range(real_in.file, pos_in, + ret = do_clone_file_range(real_in.file, pos_in, real_out.file, pos_out, len); break; From a725356b6659469d182d662f22d770d83d3bc7b5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Sep 2018 16:34:34 +0300 Subject: [PATCH 201/499] vfs: swap names of {do,vfs}_clone_file_range() Commit 031a072a0b8a ("vfs: call vfs_clone_file_range() under freeze protection") created a wrapper do_clone_file_range() around vfs_clone_file_range(), moving the freeze protection to the former so that overlayfs could call the latter. The more common vfs practice is to call do_xxx helpers from vfs_xxx helpers, where freeze protection is taken in the vfs_xxx helper, so this anomaly could be a source of confusion. It seems that commit 8ede205541ff ("ovl: add reflink/copyfile/dedup support") may have fallen victim to this confusion - ovl_clone_file_range() calls the vfs_clone_file_range() helper in the hope of getting freeze protection on the upper fs, but in fact this lets overlayfs bypass upper fs freeze protection. Swap the names of the two helpers to conform to common vfs practice and call the correct helpers from overlayfs and nfsd.
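For reference, the shape of the convention being restored, reduced from the fs/read_write.c hunk below (error handling omitted): the vfs_*() helper takes freeze protection on the destination and delegates the work to the do_*() helper.

int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                         struct file *file_out, loff_t pos_out, u64 len)
{
        int ret;

        /* freeze protection lives in the vfs_ wrapper ... */
        file_start_write(file_out);
        /* ... so internal callers that already hold it use do_ directly */
        ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
        file_end_write(file_out);

        return ret;
}

With the swap, the ioctl, nfsd and ovl_clone_file_range() call sites end up on vfs_clone_file_range() and get the protection taken for them, while the overlayfs copy-up path, which is expected to already hold write access to the upper fs, calls do_clone_file_range() directly.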
Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/ioctl.c | 2 +- fs/nfsd/vfs.c | 3 ++- fs/overlayfs/copy_up.c | 2 +- fs/overlayfs/file.c | 2 +- fs/read_write.c | 17 +++++++++++++++-- include/linux/fs.h | 17 +++-------------- 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 3212c29235ce..2005529af560 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -230,7 +230,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; - ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen); + ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); fdput: fdput(src_file); return ret; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 55a099e47ba2..b53e76391e52 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -541,7 +541,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count)); + return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, + count)); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 296037afecdb..1cc797a08a5b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -141,7 +141,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - error = vfs_clone_file_range(old_file, 0, new_file, 0, len); + error = do_clone_file_range(old_file, 0, new_file, 0, len); if (!error) goto out; /* Couldn't clone, so now we try to copy the data */ diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 5d1b4b38f743..986313da0c88 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -461,7 +461,7 @@ static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, break; case OVL_CLONE: - ret = do_clone_file_range(real_in.file, pos_in, + ret = vfs_clone_file_range(real_in.file, pos_in, real_out.file, pos_out, len); break; diff --git a/fs/read_write.c b/fs/read_write.c index 39b4a21dd933..8a2737f0d61d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1818,8 +1818,8 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_prep_inodes); -int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len) +int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1866,6 +1866,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, return ret; } +EXPORT_SYMBOL(do_clone_file_range); + +int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + int ret; + + file_start_write(file_out); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + file_end_write(file_out); + + return ret; +} EXPORT_SYMBOL(vfs_clone_file_range); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c0b4a1c22ff..897eae8faee1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1828,8 +1828,10 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern int 
vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, struct inode *inode_out, loff_t pos_out, u64 *len, bool is_dedupe); +extern int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); + struct file *file_out, loff_t pos_out, u64 len); extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same); @@ -2773,19 +2775,6 @@ static inline void file_end_write(struct file *file) __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE); } -static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len) -{ - int ret; - - file_start_write(file_out); - ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); - file_end_write(file_out); - - return ret; -} - /* * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. From afbb1169ed5b58cfca017e368b53e019cf285853 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 19 Sep 2018 20:52:45 +0200 Subject: [PATCH 202/499] HID: i2c-hid: Remove RESEND_REPORT_DESCR quirk and its handling Commit 52cf93e63ee6 ("HID: i2c-hid: Don't reset device upon system resume") removes the need for the RESEND_REPORT_DESCR quirk for Raydium devices, but kept it for the SIS device id 10FB touchscreens, as the author of that commit could not determine if the quirk is still necessary there. I've tested suspend/resume on a Toshiba Click Mini L9W-B which is the device for which this quirk was added in the first place and with the "Don't reset device upon system resume" fix the quirk is no longer necessary, so this commit removes it. Note even better I also had some other devices with SIS touchscreens which suspend/resume issues, where the RESEND_REPORT_DESCR quirk did not help. 
I've also tested these devices with the "Don't reset device upon system resume" fix and I'm happy to report that that fix also fixes touchscreen resume on the following devices: Asus T100HA Asus T200TA Peaq C1010 Cc: Kai-Heng Feng Acked-by: Benjamin Tissoires Signed-off-by: Hans de Goede Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 - drivers/hid/i2c-hid/i2c-hid.c | 18 +++--------------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 5146ee029db4..bc49909aba8e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -976,7 +976,6 @@ #define USB_DEVICE_ID_SIS817_TOUCH 0x0817 #define USB_DEVICE_ID_SIS_TS 0x1013 #define USB_DEVICE_ID_SIS1030_TOUCH 0x1030 -#define USB_DEVICE_ID_SIS10FB_TOUCH 0x10fb #define USB_VENDOR_ID_SKYCABLE 0x1223 #define USB_DEVICE_ID_SKYCABLE_WIRELESS_PRESENTER 0x3F07 diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index a8610f5bf6f5..4e3592e7a3f7 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -47,8 +47,7 @@ /* quirks to control the device */ #define I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV BIT(0) #define I2C_HID_QUIRK_NO_IRQ_AFTER_RESET BIT(1) -#define I2C_HID_QUIRK_RESEND_REPORT_DESCR BIT(2) -#define I2C_HID_QUIRK_NO_RUNTIME_PM BIT(3) +#define I2C_HID_QUIRK_NO_RUNTIME_PM BIT(2) /* flags */ #define I2C_HID_STARTED 0 @@ -172,8 +171,6 @@ static const struct i2c_hid_quirks { { I2C_VENDOR_ID_HANTICK, I2C_PRODUCT_ID_HANTICK_5288, I2C_HID_QUIRK_NO_IRQ_AFTER_RESET | I2C_HID_QUIRK_NO_RUNTIME_PM }, - { USB_VENDOR_ID_SIS_TOUCH, USB_DEVICE_ID_SIS10FB_TOUCH, - I2C_HID_QUIRK_RESEND_REPORT_DESCR }, { 0, 0 } }; @@ -1241,22 +1238,13 @@ static int i2c_hid_resume(struct device *dev) /* Instead of resetting device, simply powers the device on. This * solves "incomplete reports" on Raydium devices 2386:3118 and - * 2386:4B33 + * 2386:4B33 and fixes various SIS touchscreens no longer sending + * data after a suspend/resume. */ ret = i2c_hid_set_power(client, I2C_HID_PWR_ON); if (ret) return ret; - /* Some devices need to re-send report descr cmd - * after resume, after this it will be back normal. - * otherwise it issues too many incomplete reports. - */ - if (ihid->quirks & I2C_HID_QUIRK_RESEND_REPORT_DESCR) { - ret = i2c_hid_command(client, &hid_report_descr_cmd, NULL, 0); - if (ret) - return ret; - } - if (hid->driver && hid->driver->reset_resume) { ret = hid->driver->reset_resume(hid); return ret; From 9ff3541e3ddf96800ce8fcd225c9e7956da49418 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 11 Sep 2018 16:45:43 -0700 Subject: [PATCH 203/499] HID: intel-ish-hid: Enable Ice Lake mobile Added PCI ID for Ice Lake mobile platform. 
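For context, a sketch of the mechanism such a change relies on (table and macro names here are made up for illustration, and it assumes the driver exports its ID table with MODULE_DEVICE_TABLE() as PCI drivers normally do): a single new pci_device_id entry is all the PCI core needs to match and probe the driver on the new platform.

#include <linux/module.h>
#include <linux/pci.h>

#define EXAMPLE_ICL_MOBILE_DEVICE_ID	0x34FC	/* ID added by this patch */

static const struct pci_device_id example_ish_pci_tbl[] = {
	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, EXAMPLE_ICL_MOBILE_DEVICE_ID)},
	{0, }
};
MODULE_DEVICE_TABLE(pci, example_ish_pci_tbl);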
Signed-off-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/hw-ish.h | 1 + drivers/hid/intel-ish-hid/ipc/pci-ish.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index da133716bed0..08a8327dfd22 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -29,6 +29,7 @@ #define CNL_Ax_DEVICE_ID 0x9DFC #define GLK_Ax_DEVICE_ID 0x31A2 #define CNL_H_DEVICE_ID 0xA37C +#define ICL_MOBILE_DEVICE_ID 0x34FC #define SPT_H_DEVICE_ID 0xA135 #define REVISION_ID_CHT_A0 0x6 diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index a1125a5c7965..256b3016116c 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -38,6 +38,7 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, CNL_Ax_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, GLK_Ax_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, CNL_H_DEVICE_ID)}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, ICL_MOBILE_DEVICE_ID)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, SPT_H_DEVICE_ID)}, {0, } }; From 8604895a34d92f5e186ceb931b0d1b384030ea3d Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Thu, 20 Sep 2018 11:45:13 -0500 Subject: [PATCH 204/499] powerpc/pseries: Fix uninitialized timer reset on migration After migration of a powerpc LPAR, the kernel executes code to update the system state to reflect new platform characteristics. Such changes include modifications to device tree properties provided to the system by PHYP. Property notifications received by the post_mobility_fixup() code are passed along to the kernel in general through a call to of_update_property() which in turn passes such events back to all modules through entries like the '.notifier_call' function within the NUMA module. When the NUMA module updates its state, it resets its event timer. If this occurs after a previous call to stop_topology_update() or on a system without VPHN enabled, the code runs into an uninitialized timer structure and crashes. This patch adds a safety check along this path toward the problem code. An example crash log is as follows. ibmvscsi 30000081: Re-enabling adapter! ------------[ cut here ]------------ kernel BUG at kernel/time/timer.c:958! Oops: Exception in kernel mode, sig: 5 [#1] LE SMP NR_CPUS=2048 NUMA pSeries Modules linked in: nfsv3 nfs_acl nfs tcp_diag udp_diag inet_diag lockd unix_diag af_packet_diag netlink_diag grace fscache sunrpc xts vmx_crypto pseries_rng sg binfmt_misc ip_tables xfs libcrc32c sd_mod ibmvscsi ibmveth scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod CPU: 11 PID: 3067 Comm: drmgr Not tainted 4.17.0+ #179 ...
NIP mod_timer+0x4c/0x400 LR reset_topology_timer+0x40/0x60 Call Trace: 0xc0000003f9407830 (unreliable) reset_topology_timer+0x40/0x60 dt_update_callback+0x100/0x120 notifier_call_chain+0x90/0x100 __blocking_notifier_call_chain+0x60/0x90 of_property_notify+0x90/0xd0 of_update_property+0x104/0x150 update_dt_property+0xdc/0x1f0 pseries_devicetree_update+0x2d0/0x510 post_mobility_fixup+0x7c/0xf0 migration_store+0xa4/0xc0 kobj_attr_store+0x30/0x60 sysfs_kf_write+0x64/0xa0 kernfs_fop_write+0x16c/0x240 __vfs_write+0x40/0x200 vfs_write+0xc8/0x240 ksys_write+0x5c/0x100 system_call+0x58/0x6c Fixes: 5d88aa85c00b ("powerpc/pseries: Update CPU maps when device tree is updated") Cc: stable@vger.kernel.org # v3.10+ Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 35ac5422903a..b5a71baedbc2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1452,7 +1452,8 @@ static struct timer_list topology_timer; static void reset_topology_timer(void) { - mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ); + if (vphn_enabled) + mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ); } #ifdef CONFIG_SMP From f5fad711c06e652f90f581fc7c2caee327c33d31 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 24 Sep 2018 15:28:10 +0200 Subject: [PATCH 205/499] USB: serial: simple: add Motorola Tetra MTP6550 id Add device-id for the Motorola Tetra radio MTP6550. Bus 001 Device 004: ID 0cad:9012 Motorola CGISS Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 0 (Defined at Interface level) bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 idVendor 0x0cad Motorola CGISS idProduct 0x9012 bcdDevice 24.16 iManufacturer 1 Motorola Solutions, Inc. 
iProduct 2 TETRA PEI interface iSerial 0 bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 55 bNumInterfaces 2 bConfigurationValue 1 iConfiguration 3 Generic Serial config bmAttributes 0x80 (Bus Powered) MaxPower 500mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x81 EP 1 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes bInterval 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x01 EP 1 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x82 EP 2 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes bInterval 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x02 EP 2 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0200 1x 512 bytes bInterval 0 Device Qualifier (for other device speed): bLength 10 bDescriptorType 6 bcdUSB 2.00 bDeviceClass 0 (Defined at Interface level) bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 bNumConfigurations 1 Device Status: 0x0000 (Bus Powered) Reported-by: Hans Hult Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/usb-serial-simple.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index 40864c2bd9dc..4d0273508043 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -84,7 +84,8 @@ DEVICE(moto_modem, MOTO_IDS); /* Motorola Tetra driver */ #define MOTOROLA_TETRA_IDS() \ - { USB_DEVICE(0x0cad, 0x9011) } /* Motorola Solutions TETRA PEI */ + { USB_DEVICE(0x0cad, 0x9011) }, /* Motorola Solutions TETRA PEI */ \ + { USB_DEVICE(0x0cad, 0x9012) } /* MTP6550 */ DEVICE(motorola_tetra, MOTOROLA_TETRA_IDS); /* Novatel Wireless GPS driver */ From de5c95d0f518537f59ee5aef762abc46f868c377 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 20 Sep 2018 22:33:00 -0700 Subject: [PATCH 206/499] RDMA/bnxt_re: Fix system crash during RDMA resource initialization bnxt_re_ib_reg acquires and releases the rtnl lock whenever it accesses the L2 driver. The following sequence can trigger a crash Acquires the rtnl_lock -> Registers roce driver callback with L2 driver -> release the rtnl lock bnxt_re acquires the rtnl_lock -> Request for MSIx vectors -> release the rtnl_lock Issue happens when bnxt_re proceeds with remaining part of initialization and L2 driver invokes bnxt_ulp_irq_stop as a part of bnxt_open_nic. The crash is in bnxt_qplib_nq_stop_irq as the NQ structures are not initialized yet, [ 3551.726647] BUG: unable to handle kernel NULL pointer dereference at (null) [ 3551.726656] IP: [] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re] [ 3551.726674] PGD 0 [ 3551.726679] Oops: 0002 1 SMP ... [ 3551.726822] Hardware name: Dell Inc. 
PowerEdge R720/08RW36, BIOS 2.4.3 07/09/2014 [ 3551.726826] task: ffff97e30eec5ee0 ti: ffff97e3173bc000 task.ti: ffff97e3173bc000 [ 3551.726829] RIP: 0010:[] [] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re] ... [ 3551.726872] Call Trace: [ 3551.726886] [] bnxt_re_stop_irq+0x4e/0x70 [bnxt_re] [ 3551.726899] [] bnxt_ulp_irq_stop+0x43/0x70 [bnxt_en] [ 3551.726908] [] bnxt_reserve_rings+0x174/0x1e0 [bnxt_en] [ 3551.726917] [] __bnxt_open_nic+0x368/0x9a0 [bnxt_en] [ 3551.726925] [] bnxt_open_nic+0x1b/0x50 [bnxt_en] [ 3551.726934] [] bnxt_setup_mq_tc+0x11f/0x260 [bnxt_en] [ 3551.726943] [] bnxt_dcbnl_ieee_setets+0xb8/0x1f0 [bnxt_en] [ 3551.726954] [] dcbnl_ieee_set+0x9a/0x250 [ 3551.726966] [] ? __alloc_skb+0xa1/0x2d0 [ 3551.726972] [] dcb_doit+0x13a/0x210 [ 3551.726981] [] rtnetlink_rcv_msg+0xa7/0x260 [ 3551.726989] [] ? rtnl_unicast+0x20/0x30 [ 3551.726996] [] ? __kmalloc_node_track_caller+0x58/0x290 [ 3551.727002] [] ? dcb_doit+0x166/0x210 [ 3551.727007] [] ? __alloc_skb+0x8d/0x2d0 [ 3551.727012] [] ? rtnl_newlink+0x880/0x880 ... [ 3551.727104] [] system_call_fastpath+0x1c/0x21 ... [ 3551.727164] RIP [] bnxt_qplib_nq_stop_irq+0x59/0xb0 [bnxt_re] [ 3551.727175] RSP [ 3551.727177] CR2: 0000000000000000 Avoid this inconsistent state and system crash by acquiring the rtnl lock for the entire duration of device initialization. Re-factor the code to remove the rtnl lock from the individual function and acquire and release it from the caller. Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver") Fixes: 6e04b1035689 ("RDMA/bnxt_re: Fix broken RoCE driver due to recent L2 driver changes") Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 93 ++++++++++++---------------- 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 20b9f31052bf..85cd1a3593d6 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -78,7 +78,7 @@ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list); /* Mutex to protect the list of bnxt_re devices added */ static DEFINE_MUTEX(bnxt_re_dev_lock); static struct workqueue_struct *bnxt_re_wq; -static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait); +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev); /* SR-IOV helper functions */ @@ -182,7 +182,7 @@ static void bnxt_re_shutdown(void *p) if (!rdev) return; - bnxt_re_ib_unreg(rdev, false); + bnxt_re_ib_unreg(rdev); } static void bnxt_re_stop_irq(void *handle) @@ -251,7 +251,7 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = { /* Driver registration routines used to let the networking driver (bnxt_en) * to know that the RoCE driver is now installed */ -static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait) +static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; int rc; @@ -260,14 +260,9 @@ static int bnxt_re_unregister_netdev(struct bnxt_re_dev *rdev, bool lock_wait) return -EINVAL; en_dev = rdev->en_dev; - /* Acquire rtnl lock if it is not invokded from netdev event */ - if (lock_wait) - rtnl_lock(); rc = en_dev->en_ops->bnxt_unregister_device(rdev->en_dev, BNXT_ROCE_ULP); - if (lock_wait) - rtnl_unlock(); return rc; } @@ -281,14 +276,12 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) en_dev = rdev->en_dev; - rtnl_lock(); rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP, &bnxt_re_ulp_ops, rdev); - rtnl_unlock(); return 
rc; } -static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) +static int bnxt_re_free_msix(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; int rc; @@ -298,13 +291,9 @@ static int bnxt_re_free_msix(struct bnxt_re_dev *rdev, bool lock_wait) en_dev = rdev->en_dev; - if (lock_wait) - rtnl_lock(); rc = en_dev->en_ops->bnxt_free_msix(rdev->en_dev, BNXT_ROCE_ULP); - if (lock_wait) - rtnl_unlock(); return rc; } @@ -320,7 +309,6 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) num_msix_want = min_t(u32, BNXT_RE_MAX_MSIX, num_online_cpus()); - rtnl_lock(); num_msix_got = en_dev->en_ops->bnxt_request_msix(en_dev, BNXT_ROCE_ULP, rdev->msix_entries, num_msix_want); @@ -335,7 +323,6 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev) } rdev->num_msix = num_msix_got; done: - rtnl_unlock(); return rc; } @@ -358,24 +345,18 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } -static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, - bool lock_wait) +static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ring_free_input req = {0}; struct hwrm_ring_free_output resp; struct bnxt_fw_msg fw_msg; - bool do_unlock = false; int rc = -EINVAL; if (!en_dev) return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - if (lock_wait) { - rtnl_lock(); - do_unlock = true; - } bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; @@ -386,8 +367,6 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, if (rc) dev_err(rdev_to_dev(rdev), "Failed to free HW ring:%d :%#x", req.ring_id, rc); - if (do_unlock) - rtnl_unlock(); return rc; } @@ -405,7 +384,6 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - rtnl_lock(); bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1); req.enables = 0; req.page_tbl_addr = cpu_to_le64(dma_arr[0]); @@ -426,27 +404,21 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, if (!rc) *fw_ring_id = le16_to_cpu(resp.ring_id); - rtnl_unlock(); return rc; } static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, - u32 fw_stats_ctx_id, bool lock_wait) + u32 fw_stats_ctx_id) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_stat_ctx_free_input req = {0}; struct bnxt_fw_msg fw_msg; - bool do_unlock = false; int rc = -EINVAL; if (!en_dev) return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - if (lock_wait) { - rtnl_lock(); - do_unlock = true; - } bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1); req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); @@ -457,8 +429,6 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, dev_err(rdev_to_dev(rdev), "Failed to free HW stats context %#x", rc); - if (do_unlock) - rtnl_unlock(); return rc; } @@ -478,7 +448,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, return rc; memset(&fw_msg, 0, sizeof(fw_msg)); - rtnl_lock(); bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1); req.update_period_ms = cpu_to_le32(1000); @@ -490,7 +459,6 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, if (!rc) *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); - rtnl_unlock(); return rc; } @@ -929,19 +897,19 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_free_nq_res(struct 
bnxt_re_dev *rdev, bool lock_wait) +static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev) { int i; for (i = 0; i < rdev->num_msix - 1; i++) { - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, lock_wait); + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id); bnxt_qplib_free_nq(&rdev->nq[i]); } } -static void bnxt_re_free_res(struct bnxt_re_dev *rdev, bool lock_wait) +static void bnxt_re_free_res(struct bnxt_re_dev *rdev) { - bnxt_re_free_nq_res(rdev, lock_wait); + bnxt_re_free_nq_res(rdev); if (rdev->qplib_res.dpi_tbl.max) { bnxt_qplib_dealloc_dpi(&rdev->qplib_res, @@ -1219,7 +1187,7 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) return 0; } -static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) +static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) { int i, rc; @@ -1234,28 +1202,27 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) cancel_delayed_work(&rdev->worker); bnxt_re_cleanup_res(rdev); - bnxt_re_free_res(rdev, lock_wait); + bnxt_re_free_res(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw); if (rc) dev_warn(rdev_to_dev(rdev), "Failed to deinitialize RCFW: %#x", rc); - bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, - lock_wait); + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); - bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, lock_wait); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id); bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) { - rc = bnxt_re_free_msix(rdev, lock_wait); + rc = bnxt_re_free_msix(rdev); if (rc) dev_warn(rdev_to_dev(rdev), "Failed to free MSI-X vectors: %#x", rc); } if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) { - rc = bnxt_re_unregister_netdev(rdev, lock_wait); + rc = bnxt_re_unregister_netdev(rdev); if (rc) dev_warn(rdev_to_dev(rdev), "Failed to unregister with netdev: %#x", rc); @@ -1276,6 +1243,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) { int i, j, rc; + bool locked; + + /* Acquire rtnl lock through out this function */ + rtnl_lock(); + locked = true; + /* Registered a new RoCE device instance to netdev */ rc = bnxt_re_register_netdev(rdev); if (rc) { @@ -1374,12 +1347,16 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } + rtnl_unlock(); + locked = false; + /* Register ib dev */ rc = bnxt_re_register_ib(rdev); if (rc) { pr_err("Failed to register with IB: %#x\n", rc); goto fail; } + set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); dev_info(rdev_to_dev(rdev), "Device registered successfully"); for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) { rc = device_create_file(&rdev->ibdev.dev, @@ -1395,7 +1372,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) goto fail; } } - set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); @@ -1404,17 +1380,21 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) return 0; free_sctx: - bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id, true); + bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); free_ctx: bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); disable_rcfw: bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); 
free_ring: - bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, true); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id); free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: - bnxt_re_ib_unreg(rdev, true); + if (!locked) + rtnl_lock(); + bnxt_re_ib_unreg(rdev); + rtnl_unlock(); + return rc; } @@ -1567,7 +1547,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, */ if (atomic_read(&rdev->sched_count) > 0) goto exit; - bnxt_re_ib_unreg(rdev, false); + bnxt_re_ib_unreg(rdev); bnxt_re_remove_one(rdev); bnxt_re_dev_unreg(rdev); break; @@ -1646,7 +1626,10 @@ static void __exit bnxt_re_mod_exit(void) */ flush_workqueue(bnxt_re_wq); bnxt_re_dev_stop(rdev); - bnxt_re_ib_unreg(rdev, true); + /* Acquire the rtnl_lock as the L2 resources are freed here */ + rtnl_lock(); + bnxt_re_ib_unreg(rdev); + rtnl_unlock(); bnxt_re_remove_one(rdev); bnxt_re_dev_unreg(rdev); } From 4679b61f2640e07724507fcd9dc070517bc6e6cb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Sep 2018 17:23:01 +0200 Subject: [PATCH 207/499] KVM: x86: never trap MSR_KERNEL_GS_BASE KVM has an old optimization whereby accesses to the kernel GS base MSR are trapped when the guest is in 32-bit and not when it is in 64-bit mode. The idea is that swapgs is not available in 32-bit mode, thus the guest has no reason to access the MSR unless in 64-bit mode and 32-bit applications need not pay the price of switching the kernel GS base between the host and the guest values. However, this optimization adds complexity to the code for little benefit (these days most guests are going to be 64-bit anyway) and in fact broke after commit 678e315e78a7 ("KVM: vmx: add dedicated utility to access guest's kernel_gs_base", 2018-08-06); the guest kernel GS base can be corrupted across SMIs and UEFI Secure Boot is therefore broken (a secure boot Linux guest, for example, fails to reach the login prompt about half the time). This patch just removes the optimization; the kernel GS base MSR is now never trapped by KVM, similarly to the FS and GS base MSRs. 
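To make the resulting model concrete, a simplified sketch of the switching that now happens unconditionally on 64-bit hosts (the helper names below are made up; the real logic sits in vmx_prepare_switch_to_guest() and vmx_prepare_switch_to_host() shown in the hunks that follow, and the host-side restore is implied rather than quoted there):

/* Entering guest context: load the guest's kernel GS base. */
static void kernel_gs_base_to_guest(struct vcpu_vmx *vmx)
{
	vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
}

/* Back in host context: save the guest value, restore the host value. */
static void kernel_gs_base_to_host(struct vcpu_vmx *vmx)
{
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
}

Because this pair now runs regardless of guest mode, there is no cached value that can go stale when the guest moves between 32-bit and 64-bit mode or across SMIs.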
Fixes: 678e315e78a780dbef384b92339c8414309dbc11 Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 47 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06412ba46aa3..8b066480224b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 -#define MSR_BITMAP_MODE_LM 4 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL @@ -2899,8 +2898,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -2951,8 +2949,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) vmx->loaded_cpu_state = NULL; #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -2980,24 +2977,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - rdmsrl(MSR_KERNEL_GS_BASE, - vmx->msr_guest_kernel_gs_base); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + preempt_enable(); return vmx->msr_guest_kernel_gs_base; } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - wrmsrl(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } #endif @@ -5073,19 +5065,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!msr) return; - /* - * MSR_KERNEL_GS_BASE is not intercepted when the guest is in - * 64-bit mode as a 64-bit kernel may frequently access the - * MSR. This means we need to manually save/restore the MSR - * when switching between guest and host state, but only if - * the guest is in 64-bit mode. Sync our cached value if the - * guest is transitioning to 32-bit mode and the CPU contains - * guest state, i.e. the cache is stale. 
- */ -#ifdef CONFIG_X86_64 - if (!(efer & EFER_LMA)) - (void)vmx_read_guest_kernel_gs_base(vmx); -#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -6078,9 +6057,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) mode |= MSR_BITMAP_MODE_X2APIC_APICV; } - if (is_long_mode(vcpu)) - mode |= MSR_BITMAP_MODE_LM; - return mode; } @@ -6121,9 +6097,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) if (!changed) return; - vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, - !(mode & MSR_BITMAP_MODE_LM)); - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); From 469ed9d823b7d240d6b9574f061ded7c3834c167 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Fri, 21 Sep 2018 16:34:04 +0100 Subject: [PATCH 208/499] arm64: hugetlb: Fix handling of young ptes In the contiguous bit hugetlb break-before-make code we assume that all hugetlb pages are young. In fact, remove_migration_pte is able to place an old hugetlb pte so this assumption is not valid. This patch fixes the contiguous hugetlb scanning code to preserve young ptes. Fixes: d8bdcff28764 ("arm64: hugetlb: Add break-before-make logic for contiguous entries") Signed-off-by: Steve Capper Signed-off-by: Will Deacon --- arch/arm64/mm/hugetlbpage.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 192b3ba07075..f85be2f8b140 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -117,11 +117,14 @@ static pte_t get_clear_flush(struct mm_struct *mm, /* * If HW_AFDBM is enabled, then the HW could turn on - * the dirty bit for any page in the set, so check - * them all. All hugetlb entries are already young. + * the dirty or accessed bit for any page in the set, + * so check them all. */ if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); } if (valid) { @@ -340,10 +343,13 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, if (!pte_same(orig_pte, pte)) changed = 1; - /* Make sure we don't lose the dirty state */ + /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) pte = pte_mkdirty(pte); + if (pte_young(orig_pte)) + pte = pte_mkyoung(pte); + hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot)); From 031e6e6b4e1277e76e73a6ab209095ad9bf3ce52 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Fri, 21 Sep 2018 16:34:05 +0100 Subject: [PATCH 209/499] arm64: hugetlb: Avoid unnecessary clearing in huge_ptep_set_access_flags For contiguous hugetlb, huge_ptep_set_access_flags performs a get_clear_flush (which then flushes the TLBs) even when no change of ptes is necessary. Unfortunately, this behaviour can lead to back-to-back page faults being generated when running with multiple threads that access the same contiguous huge page. Thread 1 | Thread 2 -----------------------------+------------------------------ hugetlb_fault | huge_ptep_set_access_flags | -> invalidate pte range | hugetlb_fault continue processing | wait for hugetlb_fault_mutex release mutex and return | huge_ptep_set_access_flags | -> invalidate pte range hugetlb_fault ... This patch changes huge_ptep_set_access_flags s.t. 
we first read the contiguous range of ptes (whilst preserving dirty information); the pte range is only then invalidated where necessary and this prevents further spurious page faults. Fixes: d8bdcff28764 ("arm64: hugetlb: Add break-before-make logic for contiguous entries") Reported-by: Lei Zhang Signed-off-by: Steve Capper Signed-off-by: Will Deacon --- arch/arm64/mm/hugetlbpage.c | 38 +++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f85be2f8b140..f58ea503ad01 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -323,11 +323,40 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return get_clear_flush(mm, addr, ptep, pgsize, ncontig); } +/* + * huge_ptep_set_access_flags will update access flags (dirty, accesssed) + * and write permission. + * + * For a contiguous huge pte range we need to check whether or not write + * permission has to change only on the first pte in the set. Then for + * all the contiguous ptes we need to check whether or not there is a + * discrepancy between dirty or young. + */ +static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) +{ + int i; + + if (pte_write(pte) != pte_write(huge_ptep_get(ptep))) + return 1; + + for (i = 0; i < ncontig; i++) { + pte_t orig_pte = huge_ptep_get(ptep + i); + + if (pte_dirty(pte) != pte_dirty(orig_pte)) + return 1; + + if (pte_young(pte) != pte_young(orig_pte)) + return 1; + } + + return 0; +} + int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int ncontig, i, changed = 0; + int ncontig, i; size_t pgsize = 0; unsigned long pfn = pte_pfn(pte), dpfn; pgprot_t hugeprot; @@ -339,9 +368,10 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; + if (!__cont_access_flags_changed(ptep, pte, ncontig)) + return 0; + orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig); - if (!pte_same(orig_pte, pte)) - changed = 1; /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) @@ -354,7 +384,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot)); - return changed; + return 1; } void huge_ptep_set_wrprotect(struct mm_struct *mm, From 3ab97942d0213b6583a5408630a8cbbfbf54730f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 19 Sep 2018 17:14:01 -0700 Subject: [PATCH 210/499] ARM: dts: BCM63xx: Fix incorrect interrupt specifiers A number of our interrupts were incorrectly specified, fix both the PPI and SPI interrupts to be correct. 
Fixes: b5762cacc411 ("ARM: bcm63138: add NAND DT support") Fixes: 46d4bca0445a ("ARM: BCM63XX: add BCM63138 minimal Device Tree") Signed-off-by: Florian Fainelli --- arch/arm/boot/dts/bcm63138.dtsi | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/bcm63138.dtsi b/arch/arm/boot/dts/bcm63138.dtsi index 43ee992ccdcf..6df61518776f 100644 --- a/arch/arm/boot/dts/bcm63138.dtsi +++ b/arch/arm/boot/dts/bcm63138.dtsi @@ -106,21 +106,23 @@ gic: interrupt-controller@1e100 { global_timer: timer@1e200 { compatible = "arm,cortex-a9-global-timer"; reg = <0x1e200 0x20>; - interrupts = ; + interrupts = ; clocks = <&axi_clk>; }; local_timer: local-timer@1e600 { compatible = "arm,cortex-a9-twd-timer"; reg = <0x1e600 0x20>; - interrupts = ; + interrupts = ; clocks = <&axi_clk>; }; twd_watchdog: watchdog@1e620 { compatible = "arm,cortex-a9-twd-wdt"; reg = <0x1e620 0x20>; - interrupts = ; + interrupts = ; }; armpll: armpll { @@ -158,7 +160,7 @@ timer: timer@80 { serial0: serial@600 { compatible = "brcm,bcm6345-uart"; reg = <0x600 0x1b>; - interrupts = ; + interrupts = ; clocks = <&periph_clk>; clock-names = "periph"; status = "disabled"; @@ -167,7 +169,7 @@ serial0: serial@600 { serial1: serial@620 { compatible = "brcm,bcm6345-uart"; reg = <0x620 0x1b>; - interrupts = ; + interrupts = ; clocks = <&periph_clk>; clock-names = "periph"; status = "disabled"; @@ -180,7 +182,7 @@ nand: nand@2000 { reg = <0x2000 0x600>, <0xf0 0x10>; reg-names = "nand", "nand-int-base"; status = "disabled"; - interrupts = ; + interrupts = ; interrupt-names = "nand"; }; From d8e2262a5044c1a05b4da51e5d0976f10c487a9f Mon Sep 17 00:00:00 2001 From: Saif Hasan Date: Fri, 21 Sep 2018 14:30:05 -0700 Subject: [PATCH 211/499] mpls: allow routes on ip6gre devices Summary: This appears to be necessary and sufficient change to enable `MPLS` on `ip6gre` tunnels (RFC4023). This diff allows IP6GRE devices to be recognized by MPLS kernel module and hence user can configure interface to accept packets with mpls headers as well setup mpls routes on them. Test Plan: Test plan consists of multiple containers connected via GRE-V6 tunnel. Then carrying out testing steps as below. 
- Carry out necessary sysctl settings on all containers ``` sysctl -w net.mpls.platform_labels=65536 sysctl -w net.mpls.ip_ttl_propagate=1 sysctl -w net.mpls.conf.lo.input=1 ``` - Establish IP6GRE tunnels ``` ip -6 tunnel add name if_1_2_1 mode ip6gre \ local 2401:db00:21:6048:feed:0::1 \ remote 2401:db00:21:6048:feed:0::2 key 1 ip link set dev if_1_2_1 up sysctl -w net.mpls.conf.if_1_2_1.input=1 ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link ip -6 tunnel add name if_1_3_1 mode ip6gre \ local 2401:db00:21:6048:feed:0::1 \ remote 2401:db00:21:6048:feed:0::3 key 1 ip link set dev if_1_3_1 up sysctl -w net.mpls.conf.if_1_3_1.input=1 ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link ``` - Install MPLS encap rules on node-1 towards node-2 ``` ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \ via inet 169.254.0.3 dev if_1_2_1 ``` - Install MPLS forwarding rules on node-2 and node-3 ``` // node2 ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1 // node3 ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1 ``` - Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing towards 192.168.0.1 is via IP route directly towards node1 from node4) ``` ping 192.168.0.11 ``` - tcpdump on interface to capture ping packets wrapped within MPLS header which inturn wrapped within IP6GRE header ``` 16:43:41.121073 IP6 2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2: DSTOPT GREv0, key=0x1, length 100: MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255) IP 192.168.0.1 > 192.168.0.11: ICMP echo request, id 1208, seq 45, length 64 0x0000: 6000 2cdb 006c 3c3f 2401 db00 0021 6048 `.,..l Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7a4de6d618b1..8fbe6cdbe255 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1533,10 +1533,14 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { - /* For now just support Ethernet, IPGRE, SIT and IPIP devices */ + + /* For now just support Ethernet, IPGRE, IP6GRE, SIT and + * IPIP devices + */ if (dev->type == ARPHRD_ETHER || dev->type == ARPHRD_LOOPBACK || dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || dev->type == ARPHRD_TUNNEL) { mdev = mpls_add_dev(dev); From ccfec9e5cb2d48df5a955b7bf47f7782157d3bc2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 24 Sep 2018 15:48:19 +0200 Subject: [PATCH 212/499] ip_tunnel: be careful when accessing the inner header Cong noted that we need the same checks introduced by commit 76c0ddd8c3a6 ("ip6_tunnel: be careful when accessing the inner header") even for ipv4 tunnels. Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Suggested-by: Cong Wang Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/ipv4/ip_tunnel.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..284a22154b4e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,6 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); + unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -636,6 +637,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; + /* ensure we can access the inner net header, for several users below */ + if (skb->protocol == htons(ETH_P_IP)) + inner_nhdr_len = sizeof(struct iphdr); + else if (skb->protocol == htons(ETH_P_IPV6)) + inner_nhdr_len = sizeof(struct ipv6hdr); + if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) + goto tx_error; + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); From f4a518797b40e956b6e6220b42657e045dc24470 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 24 Sep 2018 16:56:13 +0200 Subject: [PATCH 213/499] net: mvneta: fix the remaining Rx descriptor unmapping issues With CONFIG_DMA_API_DEBUG enabled we get DMA unmapping warning in various places of the mvneta driver, for example when putting down an interface while traffic is passing through. The issue is when using s/w buffer management, the Rx buffers are mapped using dma_map_page but unmapped with dma_unmap_single. This patch fixes this by using the right unmapping function. Fixes: 562e2f467e71 ("net: mvneta: Improve the buffer allocation method for SWBM") Signed-off-by: Antoine Tenart Reviewed-by: Gregory CLEMENT Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 2db9708f2e24..b4ed7d394d07 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1890,8 +1890,8 @@ static void mvneta_rxq_drop_pkts(struct mvneta_port *pp, if (!data || !(rx_desc->buf_phys_addr)) continue; - dma_unmap_single(pp->dev->dev.parent, rx_desc->buf_phys_addr, - MVNETA_RX_BUF_SIZE(pp->pkt_size), DMA_FROM_DEVICE); + dma_unmap_page(pp->dev->dev.parent, rx_desc->buf_phys_addr, + PAGE_SIZE, DMA_FROM_DEVICE); __free_page(data); } } @@ -2039,9 +2039,8 @@ static int mvneta_rx_swbm(struct napi_struct *napi, frag_offset, frag_size, PAGE_SIZE); - dma_unmap_single(dev->dev.parent, phys_addr, - PAGE_SIZE, - DMA_FROM_DEVICE); + dma_unmap_page(dev->dev.parent, phys_addr, + PAGE_SIZE, DMA_FROM_DEVICE); rxq->left_size -= frag_size; } From 57a489786de9ec37d6e25ef1305dc337047f0236 Mon Sep 17 00:00:00 2001 From: James Cowgill Date: Thu, 6 Sep 2018 22:57:56 +0100 Subject: [PATCH 214/499] RISC-V: include linux/ftrace.h in asm-prototypes.h Building a riscv kernel with CONFIG_FUNCTION_TRACER and CONFIG_MODVERSIONS enabled results in these two warnings: MODPOST vmlinux.o WARNING: EXPORT symbol "return_to_handler" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "_mcount" [vmlinux] version generation failed, symbol will not be versioned. When exporting symbols from an assembly file, the MODVERSIONS code requires their prototypes to be defined in asm-prototypes.h (see scripts/Makefile.build). 
Since both of these symbols have prototypes defined in linux/ftrace.h, include this header from RISC-V's asm-prototypes.h. Reported-by: Karsten Merker Signed-off-by: James Cowgill Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/asm-prototypes.h | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 arch/riscv/include/asm/asm-prototypes.h diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..c9fecd120d18 --- /dev/null +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_PROTOTYPES_H + +#include +#include + +#endif /* _ASM_RISCV_PROTOTYPES_H */ From 6697576788816985f9c79da190abfaad7c9e1738 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 20 Sep 2018 11:03:22 -0700 Subject: [PATCH 215/499] i2c: i2c-qcom-geni: Properly handle DMA safe buffers We shouldn't attempt to DMA map the message buffers passed into this driver from the i2c core unless the message we're mapping have been properly setup for DMA. The i2c core indicates such a situation by setting the I2C_M_DMA_SAFE flag, so check for that flag before using DMA mode. We can also bounce the buffer if it isn't already mapped properly by using the i2c_get_dma_safe_msg_buf() APIs, so do that when we want to use DMA for a message. This fixes a problem where the kernel oopses cleaning pages for a buffer that's mapped into the vmalloc space. The pages are returned from request_firmware() and passed down directly to the i2c master to write to the i2c touchscreen device. Mapping vmalloc buffers with dma_map_single() won't work reliably, causing an oops like below: Unable to handle kernel paging request at virtual address ffffffc01391d000 ... Reported-by: Philip Chen Signed-off-by: Stephen Boyd Reviewed-by: Douglas Anderson Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-qcom-geni.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index 36732eb688a4..9f2eb02481d3 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -367,20 +367,26 @@ static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, dma_addr_t rx_dma; enum geni_se_xfer_mode mode; unsigned long time_left = XFER_TIMEOUT; + void *dma_buf; gi2c->cur = msg; - mode = msg->len > 32 ? GENI_SE_DMA : GENI_SE_FIFO; + mode = GENI_SE_FIFO; + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) + mode = GENI_SE_DMA; + geni_se_select_mode(&gi2c->se, mode); writel_relaxed(msg->len, gi2c->se.base + SE_I2C_RX_TRANS_LEN); geni_se_setup_m_cmd(&gi2c->se, I2C_READ, m_param); if (mode == GENI_SE_DMA) { int ret; - ret = geni_se_rx_dma_prep(&gi2c->se, msg->buf, msg->len, + ret = geni_se_rx_dma_prep(&gi2c->se, dma_buf, msg->len, &rx_dma); if (ret) { mode = GENI_SE_FIFO; geni_se_select_mode(&gi2c->se, mode); + i2c_put_dma_safe_msg_buf(dma_buf, msg, false); } } @@ -393,6 +399,7 @@ static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, if (gi2c->err) geni_i2c_rx_fsm_rst(gi2c); geni_se_rx_dma_unprep(&gi2c->se, rx_dma, msg->len); + i2c_put_dma_safe_msg_buf(dma_buf, msg, !gi2c->err); } return gi2c->err; } @@ -403,20 +410,26 @@ static int geni_i2c_tx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, dma_addr_t tx_dma; enum geni_se_xfer_mode mode; unsigned long time_left; + void *dma_buf; gi2c->cur = msg; - mode = msg->len > 32 ? 
GENI_SE_DMA : GENI_SE_FIFO; + mode = GENI_SE_FIFO; + dma_buf = i2c_get_dma_safe_msg_buf(msg, 32); + if (dma_buf) + mode = GENI_SE_DMA; + geni_se_select_mode(&gi2c->se, mode); writel_relaxed(msg->len, gi2c->se.base + SE_I2C_TX_TRANS_LEN); geni_se_setup_m_cmd(&gi2c->se, I2C_WRITE, m_param); if (mode == GENI_SE_DMA) { int ret; - ret = geni_se_tx_dma_prep(&gi2c->se, msg->buf, msg->len, + ret = geni_se_tx_dma_prep(&gi2c->se, dma_buf, msg->len, &tx_dma); if (ret) { mode = GENI_SE_FIFO; geni_se_select_mode(&gi2c->se, mode); + i2c_put_dma_safe_msg_buf(dma_buf, msg, false); } } @@ -432,6 +445,7 @@ static int geni_i2c_tx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, if (gi2c->err) geni_i2c_tx_fsm_rst(gi2c); geni_se_tx_dma_unprep(&gi2c->se, tx_dma, msg->len); + i2c_put_dma_safe_msg_buf(dma_buf, msg, !gi2c->err); } return gi2c->err; } From ea51e17b953f391c0a7d6684141e76a5ab209fef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Sep 2018 18:14:37 +0100 Subject: [PATCH 216/499] i2c: i2c-isch: fix spelling mistake "unitialized" -> "uninitialized" Trivial fix to spelling mistake in dev_notice message. Signed-off-by: Colin Ian King Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-isch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-isch.c b/drivers/i2c/busses/i2c-isch.c index 0cf1379f4e80..5c754bf659e2 100644 --- a/drivers/i2c/busses/i2c-isch.c +++ b/drivers/i2c/busses/i2c-isch.c @@ -164,7 +164,7 @@ static s32 sch_access(struct i2c_adapter *adap, u16 addr, * run ~75 kHz instead which should do no harm. */ dev_notice(&sch_adapter.dev, - "Clock divider unitialized. Setting defaults\n"); + "Clock divider uninitialized. Setting defaults\n"); outw(backbone_speed / (4 * 100), SMBHSTCLK); } From 74bc2abca7603c956d1e331e8b9bee7b874c1eec Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 27 Aug 2018 12:56:24 +0200 Subject: [PATCH 217/499] iommu/rockchip: Free irqs in shutdown handler In the iommu's shutdown handler we disable runtime-pm which could result in the irq-handler running unclocked and since commit 3fc7c5c0cff3 ("iommu/rockchip: Handle errors returned from PM framework") we warn about that fact. This can cause warnings on shutdown on some Rockchip machines, so free the irqs in the shutdown handler before we disable runtime-pm. 
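As an editorial aside, the ordering this fix relies on can be sketched in a few lines (illustration only, not part of the patch; "my_iommu" is a hypothetical stand-in for struct rk_iommu, and the real change below ends the IRQ walk on -ENXIO rather than on any error):

```
/* Editorial sketch: free the IRQs while the device is still powered and
 * clocked, and only then force runtime suspend, so no handler can run
 * unclocked. */
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>

struct my_iommu {
	struct device *dev;
};

static void my_iommu_shutdown(struct platform_device *pdev)
{
	struct my_iommu *iommu = platform_get_drvdata(pdev);
	int i = 0, irq;

	/* 1) Unhook every IRQ first. */
	while ((irq = platform_get_irq(pdev, i++)) >= 0)
		devm_free_irq(iommu->dev, irq, iommu);

	/* 2) Only now power the device down. */
	pm_runtime_force_suspend(&pdev->dev);
}
```

The actual patch that follows uses the same two steps in the same order.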
Reported-by: Enric Balletbo i Serra Fixes: 3fc7c5c0cff3 ("iommu/rockchip: Handle errors returned from PM framework") Signed-off-by: Heiko Stuebner Tested-by: Enric Balletbo i Serra Acked-by: Marc Zyngier Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 258115b10fa9..ad3e2b97469e 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -1241,6 +1241,12 @@ static int rk_iommu_probe(struct platform_device *pdev) static void rk_iommu_shutdown(struct platform_device *pdev) { + struct rk_iommu *iommu = platform_get_drvdata(pdev); + int i = 0, irq; + + while ((irq = platform_get_irq(pdev, i++)) != -ENXIO) + devm_free_irq(iommu->dev, irq, iommu); + pm_runtime_force_suspend(&pdev->dev); } From b85bfa246efd24ea3fdb5ee949c28e3110c6d299 Mon Sep 17 00:00:00 2001 From: Daniel Kurtz Date: Sat, 22 Sep 2018 13:58:26 -0600 Subject: [PATCH 218/499] pinctrl/amd: poll InterruptEnable bits in amd_gpio_irq_set_type From the AMD BKDG, if WAKE_INT_MASTER_REG.MaskStsEn is set, a software write to the debounce registers of *any* gpio will block wake/interrupt status generation for *all* gpios for a length of time that depends on WAKE_INT_MASTER_REG.MaskStsLength[11:0]. During this period the Interrupt Delivery bit (INTERRUPT_ENABLE) will read as 0. In commit 4c1de0414a1340 ("pinctrl/amd: poll InterruptEnable bits in enable_irq") we tried to fix this same "gpio Interrupts are blocked immediately after writing debounce registers" problem, but incorrectly assumed it only affected the gpio whose debounce was being configured and not ALL gpios. To solve this for all gpios, we move the polling loop from amd_gpio_irq_enable() to amd_gpio_irq_set_type(), while holding the gpio spinlock. This ensures that another gpio operation (e.g. amd_gpio_irq_unmask()) cannot read a temporarily disabled IRQ and incorrectly disable it while trying to modify some other register bits. Fixes: 4c1de0414a1340 ("pinctrl/amd: poll InterruptEnable bits in enable_irq") Signed-off-by: Daniel Kurtz Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 41ccc759b8b8..1425c2874d40 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -348,21 +348,12 @@ static void amd_gpio_irq_enable(struct irq_data *d) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); - u32 mask = BIT(INTERRUPT_ENABLE_OFF) | BIT(INTERRUPT_MASK_OFF); raw_spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + (d->hwirq)*4); pin_reg |= BIT(INTERRUPT_ENABLE_OFF); pin_reg |= BIT(INTERRUPT_MASK_OFF); writel(pin_reg, gpio_dev->base + (d->hwirq)*4); - /* - * When debounce logic is enabled it takes ~900 us before interrupts - * can be enabled. During this "debounce warm up" period the - * "INTERRUPT_ENABLE" bit will read as 0. Poll the bit here until it - * reads back as 1, signaling that interrupts are now enabled.
- */ - while ((readl(gpio_dev->base + (d->hwirq)*4) & mask) != mask) - continue; raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } @@ -426,7 +417,7 @@ static void amd_gpio_irq_eoi(struct irq_data *d) static int amd_gpio_irq_set_type(struct irq_data *d, unsigned int type) { int ret = 0; - u32 pin_reg; + u32 pin_reg, pin_reg_irq_en, mask; unsigned long flags, irq_flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); @@ -495,6 +486,28 @@ static int amd_gpio_irq_set_type(struct irq_data *d, unsigned int type) } pin_reg |= CLR_INTR_STAT << INTERRUPT_STS_OFF; + /* + * If WAKE_INT_MASTER_REG.MaskStsEn is set, a software write to the + * debounce registers of any GPIO will block wake/interrupt status + * generation for *all* GPIOs for a lenght of time that depends on + * WAKE_INT_MASTER_REG.MaskStsLength[11:0]. During this period the + * INTERRUPT_ENABLE bit will read as 0. + * + * We temporarily enable irq for the GPIO whose configuration is + * changing, and then wait for it to read back as 1 to know when + * debounce has settled and then disable the irq again. + * We do this polling with the spinlock held to ensure other GPIO + * access routines do not read an incorrect value for the irq enable + * bit of other GPIOs. We keep the GPIO masked while polling to avoid + * spurious irqs, and disable the irq again after polling. + */ + mask = BIT(INTERRUPT_ENABLE_OFF); + pin_reg_irq_en = pin_reg; + pin_reg_irq_en |= mask; + pin_reg_irq_en &= ~BIT(INTERRUPT_MASK_OFF); + writel(pin_reg_irq_en, gpio_dev->base + (d->hwirq)*4); + while ((readl(gpio_dev->base + (d->hwirq)*4) & mask) != mask) + continue; writel(pin_reg, gpio_dev->base + (d->hwirq)*4); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); From e50d95e2ad1266f8d3fcdf0724f03dbdffd400aa Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 24 Sep 2018 17:32:11 +0300 Subject: [PATCH 219/499] pinctrl: cannonlake: Fix HOSTSW_OWN register offset of H variant It turns out the HOSTSW_OWN register offset is different between LP and H variants. The latter should use 0xc0 instead so fix that. 
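As an aside for readers, the shape of this fix is to turn the one register offset that differs between the two PCH variants into a macro parameter instead of a shared constant. A simplified sketch of that pattern (hypothetical struct and field names, not the driver's real intel_community descriptor):

```
/* Editorial sketch: per-variant register offsets as macro parameters. */
#define LP_HOSTSW_OWN	0x0b0	/* Cannon Lake-LP */
#define H_HOSTSW_OWN	0x0c0	/* Cannon Lake-H */

struct community {
	unsigned int barno;
	unsigned int hostown_offset;
};

#define COMMUNITY(b, o)	{ .barno = (b), .hostown_offset = (o) }
#define LP_COMMUNITY(b)	COMMUNITY(b, LP_HOSTSW_OWN)
#define H_COMMUNITY(b)	COMMUNITY(b, H_HOSTSW_OWN)

static const struct community h_communities[] = {
	H_COMMUNITY(0),
	H_COMMUNITY(1),
};
```

The patch below does the same thing for the real CNL_COMMUNITY() macro, so the H tables pick up 0xc0 while the LP tables keep 0x0b0.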
Link: https://bugzilla.kernel.org/show_bug.cgi?id=199911 Fixes: a663ccf0fea1 ("pinctrl: intel: Add Intel Cannon Lake PCH-H pin controller support") Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-cannonlake.c | 33 +++++++++++++--------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-cannonlake.c b/drivers/pinctrl/intel/pinctrl-cannonlake.c index 8d48371caaa2..e7f45d96b0cb 100644 --- a/drivers/pinctrl/intel/pinctrl-cannonlake.c +++ b/drivers/pinctrl/intel/pinctrl-cannonlake.c @@ -15,10 +15,11 @@ #include "pinctrl-intel.h" -#define CNL_PAD_OWN 0x020 -#define CNL_PADCFGLOCK 0x080 -#define CNL_HOSTSW_OWN 0x0b0 -#define CNL_GPI_IE 0x120 +#define CNL_PAD_OWN 0x020 +#define CNL_PADCFGLOCK 0x080 +#define CNL_LP_HOSTSW_OWN 0x0b0 +#define CNL_H_HOSTSW_OWN 0x0c0 +#define CNL_GPI_IE 0x120 #define CNL_GPP(r, s, e, g) \ { \ @@ -30,12 +31,12 @@ #define CNL_NO_GPIO -1 -#define CNL_COMMUNITY(b, s, e, g) \ +#define CNL_COMMUNITY(b, s, e, o, g) \ { \ .barno = (b), \ .padown_offset = CNL_PAD_OWN, \ .padcfglock_offset = CNL_PADCFGLOCK, \ - .hostown_offset = CNL_HOSTSW_OWN, \ + .hostown_offset = (o), \ .ie_offset = CNL_GPI_IE, \ .pin_base = (s), \ .npins = ((e) - (s) + 1), \ @@ -43,6 +44,12 @@ .ngpps = ARRAY_SIZE(g), \ } +#define CNLLP_COMMUNITY(b, s, e, g) \ + CNL_COMMUNITY(b, s, e, CNL_LP_HOSTSW_OWN, g) + +#define CNLH_COMMUNITY(b, s, e, g) \ + CNL_COMMUNITY(b, s, e, CNL_H_HOSTSW_OWN, g) + /* Cannon Lake-H */ static const struct pinctrl_pin_desc cnlh_pins[] = { /* GPP_A */ @@ -442,10 +449,10 @@ static const struct intel_function cnlh_functions[] = { }; static const struct intel_community cnlh_communities[] = { - CNL_COMMUNITY(0, 0, 50, cnlh_community0_gpps), - CNL_COMMUNITY(1, 51, 154, cnlh_community1_gpps), - CNL_COMMUNITY(2, 155, 248, cnlh_community3_gpps), - CNL_COMMUNITY(3, 249, 298, cnlh_community4_gpps), + CNLH_COMMUNITY(0, 0, 50, cnlh_community0_gpps), + CNLH_COMMUNITY(1, 51, 154, cnlh_community1_gpps), + CNLH_COMMUNITY(2, 155, 248, cnlh_community3_gpps), + CNLH_COMMUNITY(3, 249, 298, cnlh_community4_gpps), }; static const struct intel_pinctrl_soc_data cnlh_soc_data = { @@ -803,9 +810,9 @@ static const struct intel_padgroup cnllp_community4_gpps[] = { }; static const struct intel_community cnllp_communities[] = { - CNL_COMMUNITY(0, 0, 67, cnllp_community0_gpps), - CNL_COMMUNITY(1, 68, 180, cnllp_community1_gpps), - CNL_COMMUNITY(2, 181, 243, cnllp_community4_gpps), + CNLLP_COMMUNITY(0, 0, 67, cnllp_community0_gpps), + CNLLP_COMMUNITY(1, 68, 180, cnllp_community1_gpps), + CNLLP_COMMUNITY(2, 181, 243, cnllp_community4_gpps), }; static const struct intel_pinctrl_soc_data cnllp_soc_data = { From 72923e5488f0604fac8ef2c7e683fabd3b4c203b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 24 Sep 2018 17:32:12 +0300 Subject: [PATCH 220/499] Revert "pinctrl: intel: Do pin translation when lock IRQ" This reverts commit 55aedef50d4d810670916d9fce4a40d5da2079e7. Commit 55aedef50d4d ("pinctrl: intel: Do pin translation when lock IRQ") added special translation from GPIO number to hardware pin number to irq_reqres/relres hooks to avoid failure when IRQs are requested. The actual failure happened inside gpiochip_lock_as_irq() because it calls gpiod_get_direction() and pinctrl-intel.c::intel_gpio_get_direction() implementation originally missed the translation so the two hooks made it work by skipping the ->get_direction() call entirely (it overwrote the default GPIOLIB provided functions). 
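For readers unfamiliar with the distinction being discussed here, a rough sketch of what "translation" means: GPIOLIB hands callbacks a 0-based GPIO offset, while the hardware registers are indexed by pin number, so every register access has to map one to the other first. This is an editorial illustration with a hypothetical range table, not the driver's actual data structures:

```
/* Editorial sketch: map a GPIO offset to a hardware pin number. */
struct range {
	unsigned int gpio_base;	/* first GPIO offset in the range */
	unsigned int pin_base;	/* first hardware pin in the range */
	unsigned int npins;
};

static int gpio_to_pin(const struct range *ranges, int nranges,
		       unsigned int offset)
{
	int i;

	for (i = 0; i < nranges; i++) {
		const struct range *r = &ranges[i];

		if (offset >= r->gpio_base &&
		    offset < r->gpio_base + r->npins)
			return r->pin_base + (offset - r->gpio_base);
	}
	return -1;	/* no such GPIO */
}
```

Skipping this translation in ->get_direction() was the original bug; the two IRQ resource hooks worked around it rather than fixing it.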
The proper fix that adds translation to GPIO callbacks was merged with commit 96147db1e1df ("pinctrl: intel: Do pin translation in other GPIO operations as well"). This allows us to use the default GPIOLIB provided functions again. In addition as find out by Benjamin Tissoires the two functions (intel_gpio_irq_reqres()/intel_gpio_irq_relres()) now cause problems of their own because they operate on pin numbers and pass that pin number to gpiochip_lock_as_irq() which actually expects a GPIO number. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199911 Fixes: 55aedef50d4d ("pinctrl: intel: Do pin translation when lock IRQ") Reported-and-tested-by: Benjamin Tissoires Signed-off-by: Mika Westerberg Acked-by: Andy Shevchenko Signed-off-by: Linus Walleij --- drivers/pinctrl/intel/pinctrl-intel.c | 32 --------------------------- 1 file changed, 32 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index ec8dafc94694..1ea3438ea67e 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -887,36 +887,6 @@ static const struct gpio_chip intel_gpio_chip = { .set_config = gpiochip_generic_config, }; -static int intel_gpio_irq_reqres(struct irq_data *d) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct intel_pinctrl *pctrl = gpiochip_get_data(gc); - int pin; - int ret; - - pin = intel_gpio_to_pin(pctrl, irqd_to_hwirq(d), NULL, NULL); - if (pin >= 0) { - ret = gpiochip_lock_as_irq(gc, pin); - if (ret) { - dev_err(pctrl->dev, "unable to lock HW IRQ %d for IRQ\n", - pin); - return ret; - } - } - return 0; -} - -static void intel_gpio_irq_relres(struct irq_data *d) -{ - struct gpio_chip *gc = irq_data_get_irq_chip_data(d); - struct intel_pinctrl *pctrl = gpiochip_get_data(gc); - int pin; - - pin = intel_gpio_to_pin(pctrl, irqd_to_hwirq(d), NULL, NULL); - if (pin >= 0) - gpiochip_unlock_as_irq(gc, pin); -} - static void intel_gpio_irq_ack(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); @@ -1132,8 +1102,6 @@ static irqreturn_t intel_gpio_irq(int irq, void *data) static struct irq_chip intel_gpio_irqchip = { .name = "intel-gpio", - .irq_request_resources = intel_gpio_irq_reqres, - .irq_release_resources = intel_gpio_irq_relres, .irq_enable = intel_gpio_irq_enable, .irq_ack = intel_gpio_irq_ack, .irq_mask = intel_gpio_irq_mask, From 8c0f9f5b309d627182d5da72a69246f58bde1026 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 24 Sep 2018 13:18:34 +0100 Subject: [PATCH 221/499] Revert "uapi/linux/keyctl.h: don't use C++ reserved keyword as a struct member name" This changes UAPI, breaking iwd and libell: ell/key.c: In function 'kernel_dh_compute': ell/key.c:205:38: error: 'struct keyctl_dh_params' has no member named 'private'; did you mean 'dh_private'? struct keyctl_dh_params params = { .private = private, ^~~~~~~ dh_private This reverts commit 8a2336e549d385bb0b46880435b411df8d8200e8. Fixes: 8a2336e549d3 ("uapi/linux/keyctl.h: don't use C++ reserved keyword as a struct member name") Signed-off-by: Lubomir Rintel Signed-off-by: David Howells cc: Randy Dunlap cc: Mat Martineau cc: Stephan Mueller cc: James Morris cc: "Serge E. 
Hallyn" cc: Mat Martineau cc: Andrew Morton cc: Linus Torvalds cc: Signed-off-by: James Morris Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/keyctl.h | 2 +- security/keys/dh.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 910cc4334b21..7b8c9e19bad1 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -65,7 +65,7 @@ /* keyctl structures */ struct keyctl_dh_params { - __s32 dh_private; + __s32 private; __s32 prime; __s32 base; }; diff --git a/security/keys/dh.c b/security/keys/dh.c index 3b602a1e27fa..711e89d8c415 100644 --- a/security/keys/dh.c +++ b/security/keys/dh.c @@ -300,7 +300,7 @@ long __keyctl_dh_compute(struct keyctl_dh_params __user *params, } dh_inputs.g_size = dlen; - dlen = dh_data_from_key(pcopy.dh_private, &dh_inputs.key); + dlen = dh_data_from_key(pcopy.private, &dh_inputs.key); if (dlen < 0) { ret = dlen; goto out2; From be9e6598aeb0db70a7927d6b3bb4d3d6fb1c3e18 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 8 Sep 2018 09:42:53 +0800 Subject: [PATCH 222/499] iommu/vt-d: Handle memory shortage on pasid table allocation Pasid table memory allocation could return failure due to memory shortage. Limit the pasid table size to 1MiB because current 8MiB contiguous physical memory allocation can be hard to come by. W/o a PASID table, the device could continue to work with only shared virtual memory impacted. So, let's go ahead with context mapping even the memory allocation for pasid table failed. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107783 Fixes: cc580e41260d ("iommu/vt-d: Per PCI device pasid table interfaces") Cc: Ashok Raj Cc: Jacob Pan Cc: Mika Westerberg Reported-and-tested-by: Pelton Kyle D Tested-by: Mika Westerberg Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 6 +++--- drivers/iommu/intel-pasid.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 5f3f10cf9d9d..bedc801b06a0 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2540,9 +2540,9 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, if (dev && dev_is_pci(dev) && info->pasid_supported) { ret = intel_pasid_alloc_table(dev); if (ret) { - __dmar_remove_one_dev_info(info); - spin_unlock_irqrestore(&device_domain_lock, flags); - return NULL; + pr_warn("No pasid table for %s, pasid disabled\n", + dev_name(dev)); + info->pasid_supported = 0; } } spin_unlock_irqrestore(&device_domain_lock, flags); diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h index 1c05ed6fc5a5..1fb5e12b029a 100644 --- a/drivers/iommu/intel-pasid.h +++ b/drivers/iommu/intel-pasid.h @@ -11,7 +11,7 @@ #define __INTEL_PASID_H #define PASID_MIN 0x1 -#define PASID_MAX 0x100000 +#define PASID_MAX 0x20000 struct pasid_entry { u64 val; From cf13435b730a502e814c63c84d93db131e563f5f Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 24 Sep 2018 17:27:04 +1000 Subject: [PATCH 223/499] powerpc/tm: Fix userspace r13 corruption When we treclaim we store the userspace checkpointed r13 to a scratch SPR and then later save the scratch SPR to the user thread struct. Unfortunately, this doesn't work as accessing the user thread struct can take an SLB fault and the SLB fault handler will write the same scratch SPRG that now contains the userspace r13. 
To fix this, we store r13 to the kernel stack (which can't fault) before we access the user thread struct. Found by running P8 guest + powervm + disable_1tb_segments + TM. Seen as a random userspace segfault with r13 looking like a kernel address. Signed-off-by: Michael Neuling Reviewed-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tm.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 6bffbc5affe7..183e8d75936f 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -176,13 +176,20 @@ _GLOBAL(tm_reclaim) std r1, PACATMSCRATCH(r13) ld r1, PACAR1(r13) - /* Store the PPR in r11 and reset to decent value */ std r11, GPR11(r1) /* Temporary stash */ + /* + * Store r13 away so we can free up the scratch SPR for the SLB fault + * handler (needed once we start accessing the thread_struct). + */ + GET_SCRATCH0(r11) + std r11, GPR13(r1) + /* Reset MSR RI so we can take SLB faults again */ li r11, MSR_RI mtmsrd r11, 1 + /* Store the PPR in r11 and reset to decent value */ mfspr r11, SPRN_PPR HMT_MEDIUM @@ -211,7 +218,7 @@ _GLOBAL(tm_reclaim) ld r4, GPR7(r1) /* user r7 */ ld r5, GPR11(r1) /* user r11 */ ld r6, GPR12(r1) /* user r12 */ - GET_SCRATCH0(8) /* user r13 */ + ld r8, GPR13(r1) /* user r13 */ std r3, GPR1(r7) std r4, GPR7(r7) std r5, GPR11(r7) From 96dc89d526ef77604376f06220e3d2931a0bfd58 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 25 Sep 2018 19:36:47 +1000 Subject: [PATCH 224/499] powerpc/tm: Avoid possible userspace r1 corruption on reclaim Current we store the userspace r1 to PACATMSCRATCH before finally saving it to the thread struct. In theory an exception could be taken here (like a machine check or SLB miss) that could write PACATMSCRATCH and hence corrupt the userspace r1. The SLB fault currently doesn't touch PACATMSCRATCH, but others do. We've never actually seen this happen but it's theoretically possible. Either way, the code is fragile as it is. This patch saves r1 to the kernel stack (which can't fault) before we turn MSR[RI] back on. PACATMSCRATCH is still used but only with MSR[RI] off. We then copy r1 from the kernel stack to the thread struct once we have MSR[RI] back on. Suggested-by: Breno Leitao Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tm.S | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 183e8d75936f..7716374786bd 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -178,6 +178,13 @@ _GLOBAL(tm_reclaim) std r11, GPR11(r1) /* Temporary stash */ + /* + * Move the saved user r1 to the kernel stack in case PACATMSCRATCH is + * clobbered by an exception once we turn on MSR_RI below. + */ + ld r11, PACATMSCRATCH(r13) + std r11, GPR1(r1) + /* * Store r13 away so we can free up the scratch SPR for the SLB fault * handler (needed once we start accessing the thread_struct). 
@@ -214,7 +221,7 @@ _GLOBAL(tm_reclaim) SAVE_GPR(8, r7) /* user r8 */ SAVE_GPR(9, r7) /* user r9 */ SAVE_GPR(10, r7) /* user r10 */ - ld r3, PACATMSCRATCH(r13) /* user r1 */ + ld r3, GPR1(r1) /* user r1 */ ld r4, GPR7(r1) /* user r7 */ ld r5, GPR11(r1) /* user r11 */ ld r6, GPR12(r1) /* user r12 */ From 2483ef056f6e42f61cd266452e2841165dfe1b5c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 25 Sep 2018 17:55:15 +0530 Subject: [PATCH 225/499] powerpc/numa: Use associativity if VPHN hcall is successful Currently associativity is used to lookup node-id even if the preceding VPHN hcall failed. However this can cause CPU to be made part of the wrong node, (most likely to be node 0). This is because VPHN is not enabled on KVM guests. With 2ea6263 ("powerpc/topology: Get topology for shared processors at boot"), associativity is used to set to the wrong node. Hence KVM guest topology is broken. For example : A 4 node KVM guest before would have reported. [root@localhost ~]# numactl -H available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 node 0 size: 1746 MB node 0 free: 1604 MB node 1 cpus: 4 5 6 7 node 1 size: 2044 MB node 1 free: 1765 MB node 2 cpus: 8 9 10 11 node 2 size: 2044 MB node 2 free: 1837 MB node 3 cpus: 12 13 14 15 node 3 size: 2044 MB node 3 free: 1903 MB node distances: node 0 1 2 3 0: 10 40 40 40 1: 40 10 40 40 2: 40 40 10 40 3: 40 40 40 10 Would now report: [root@localhost ~]# numactl -H available: 4 nodes (0-3) node 0 cpus: 0 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 0 size: 1746 MB node 0 free: 1244 MB node 1 cpus: node 1 size: 2044 MB node 1 free: 2032 MB node 2 cpus: 1 node 2 size: 2044 MB node 2 free: 2028 MB node 3 cpus: node 3 size: 2044 MB node 3 free: 2032 MB node distances: node 0 1 2 3 0: 10 40 40 40 1: 40 10 40 40 2: 40 40 10 40 3: 40 40 40 10 Fix this by skipping associativity lookup if the VPHN hcall failed. Fixes: 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot") Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b5a71baedbc2..59d07bd5374a 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1204,7 +1204,9 @@ int find_and_online_cpu_nid(int cpu) int new_nid; /* Use associativity from first thread for all siblings */ - vphn_get_associativity(cpu, associativity); + if (vphn_get_associativity(cpu, associativity)) + return cpu_to_node(cpu); + new_nid = associativity_to_nid(associativity); if (new_nid < 0 || !node_possible(new_nid)) new_nid = first_online_node; From e1e5d8a9fe737d94ccc0ccbaf0c97f69a8f3e000 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 25 Sep 2018 08:32:50 +0200 Subject: [PATCH 226/499] net: macb: Clean 64b dma addresses if they are not detected Clear ADDR64 dma bit in DMACFG register in case that HW_DMA_CAP_64B is not detected on 64bit system. The issue was observed when bootloader(u-boot) does not check macb feature at DCFG6 register (DAW64_OFFSET) and enabling 64bit dma support by default. Then macb driver is reading DMACFG register back and only adding 64bit dma configuration but not cleaning it out. Signed-off-by: Michal Simek Acked-by: Nicolas Ferre Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cadence/macb_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index f1a86b422617..58b9744c4058 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2160,6 +2160,7 @@ static void macb_configure_dma(struct macb *bp) else dmacfg &= ~GEM_BIT(TXCOEN); + dmacfg &= ~GEM_BIT(ADDR64); #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT if (bp->hw_dma_cap & HW_DMA_CAP_64B) dmacfg |= GEM_BIT(ADDR64); From 780e83c259fc33e8959fed8dfdad17e378d72b62 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 25 Sep 2018 02:12:30 -0600 Subject: [PATCH 227/499] xen-netback: fix input validation in xenvif_set_hash_mapping() Both len and off are frontend specified values, so we need to make sure there's no overflow when adding the two for the bounds check. We also want to avoid undefined behavior and hence use off to index into ->hash.mapping[] only after bounds checking. This at the same time allows to take care of not applying off twice for the bounds checking against vif->num_queues. It is also insufficient to bounds check copy_op.len, as this is len truncated to 16 bits. This is XSA-270 / CVE-2018-15471. Reported-by: Felix Wilhelm Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Tested-by: Paul Durrant Cc: stable@vger.kernel.org [4.7 onwards] Signed-off-by: David S. Miller --- drivers/net/xen-netback/hash.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c index 3c4c58b9fe76..3b6fb5b3bdb2 100644 --- a/drivers/net/xen-netback/hash.c +++ b/drivers/net/xen-netback/hash.c @@ -332,20 +332,22 @@ u32 xenvif_set_hash_mapping_size(struct xenvif *vif, u32 size) u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len, u32 off) { - u32 *mapping = &vif->hash.mapping[off]; + u32 *mapping = vif->hash.mapping; struct gnttab_copy copy_op = { .source.u.ref = gref, .source.domid = vif->domid, - .dest.u.gmfn = virt_to_gfn(mapping), .dest.domid = DOMID_SELF, - .dest.offset = xen_offset_in_page(mapping), - .len = len * sizeof(u32), + .len = len * sizeof(*mapping), .flags = GNTCOPY_source_gref }; - if ((off + len > vif->hash.size) || copy_op.len > XEN_PAGE_SIZE) + if ((off + len < off) || (off + len > vif->hash.size) || + len > XEN_PAGE_SIZE / sizeof(*mapping)) return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; + copy_op.dest.u.gmfn = virt_to_gfn(mapping + off); + copy_op.dest.offset = xen_offset_in_page(mapping + off); + while (len-- != 0) if (mapping[off++] >= vif->num_queues) return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; From 22f9cde3401077ea450b69bf9b0bba373e12e454 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 25 Sep 2018 02:13:01 -0600 Subject: [PATCH 228/499] xen-netback: validate queue numbers in xenvif_set_hash_mapping() Checking them before the grant copy means nothing as to the validity of the incoming request. As we shouldn't make the new data live before having validated it, introduce a second instance of the mapping array. Signed-off-by: Jan Beulich Reviewed-by: Paul Durrant Signed-off-by: David S. 
Miller --- drivers/net/xen-netback/common.h | 3 ++- drivers/net/xen-netback/hash.c | 20 ++++++++++++++------ drivers/net/xen-netback/interface.c | 3 ++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index a46a1e94505d..936c0b3e0ba2 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -241,8 +241,9 @@ struct xenvif_hash_cache { struct xenvif_hash { unsigned int alg; u32 flags; + bool mapping_sel; u8 key[XEN_NETBK_MAX_HASH_KEY_SIZE]; - u32 mapping[XEN_NETBK_MAX_HASH_MAPPING_SIZE]; + u32 mapping[2][XEN_NETBK_MAX_HASH_MAPPING_SIZE]; unsigned int size; struct xenvif_hash_cache cache; }; diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c index 3b6fb5b3bdb2..dc9841ea2fff 100644 --- a/drivers/net/xen-netback/hash.c +++ b/drivers/net/xen-netback/hash.c @@ -324,7 +324,8 @@ u32 xenvif_set_hash_mapping_size(struct xenvif *vif, u32 size) return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; vif->hash.size = size; - memset(vif->hash.mapping, 0, sizeof(u32) * size); + memset(vif->hash.mapping[vif->hash.mapping_sel], 0, + sizeof(u32) * size); return XEN_NETIF_CTRL_STATUS_SUCCESS; } @@ -332,7 +333,7 @@ u32 xenvif_set_hash_mapping_size(struct xenvif *vif, u32 size) u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len, u32 off) { - u32 *mapping = vif->hash.mapping; + u32 *mapping = vif->hash.mapping[!vif->hash.mapping_sel]; struct gnttab_copy copy_op = { .source.u.ref = gref, .source.domid = vif->domid, @@ -348,9 +349,8 @@ u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len, copy_op.dest.u.gmfn = virt_to_gfn(mapping + off); copy_op.dest.offset = xen_offset_in_page(mapping + off); - while (len-- != 0) - if (mapping[off++] >= vif->num_queues) - return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; + memcpy(mapping, vif->hash.mapping[vif->hash.mapping_sel], + vif->hash.size * sizeof(*mapping)); if (copy_op.len != 0) { gnttab_batch_copy(©_op, 1); @@ -359,6 +359,12 @@ u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len, return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; } + while (len-- != 0) + if (mapping[off++] >= vif->num_queues) + return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; + + vif->hash.mapping_sel = !vif->hash.mapping_sel; + return XEN_NETIF_CTRL_STATUS_SUCCESS; } @@ -410,6 +416,8 @@ void xenvif_dump_hash_info(struct xenvif *vif, struct seq_file *m) } if (vif->hash.size != 0) { + const u32 *mapping = vif->hash.mapping[vif->hash.mapping_sel]; + seq_puts(m, "\nHash Mapping:\n"); for (i = 0; i < vif->hash.size; ) { @@ -422,7 +430,7 @@ void xenvif_dump_hash_info(struct xenvif *vif, struct seq_file *m) seq_printf(m, "[%4u - %4u]: ", i, i + n - 1); for (j = 0; j < n; j++, i++) - seq_printf(m, "%4u ", vif->hash.mapping[i]); + seq_printf(m, "%4u ", mapping[i]); seq_puts(m, "\n"); } diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 92274c237200..f6ae23fc3f6b 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -162,7 +162,8 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, if (size == 0) return skb_get_hash_raw(skb) % dev->real_num_tx_queues; - return vif->hash.mapping[skb_get_hash_raw(skb) % size]; + return vif->hash.mapping[vif->hash.mapping_sel] + [skb_get_hash_raw(skb) % size]; } static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) From 871088bf92e11efb69bbdbd537e48c0ad4f63729 Mon Sep 17 00:00:00 2001 From: 
Jan Beulich Date: Tue, 25 Sep 2018 02:13:37 -0600 Subject: [PATCH 229/499] xen-netback: handle page straddling in xenvif_set_hash_mapping() There's no guarantee that the mapping array doesn't cross a page boundary. Use a second grant copy operation if necessary. Signed-off-by: Jan Beulich Acked-by: Wei Liu Reviewed-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/hash.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c index dc9841ea2fff..0ccb021f1e78 100644 --- a/drivers/net/xen-netback/hash.c +++ b/drivers/net/xen-netback/hash.c @@ -334,28 +334,39 @@ u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len, u32 off) { u32 *mapping = vif->hash.mapping[!vif->hash.mapping_sel]; - struct gnttab_copy copy_op = { + unsigned int nr = 1; + struct gnttab_copy copy_op[2] = {{ .source.u.ref = gref, .source.domid = vif->domid, .dest.domid = DOMID_SELF, .len = len * sizeof(*mapping), .flags = GNTCOPY_source_gref - }; + }}; if ((off + len < off) || (off + len > vif->hash.size) || len > XEN_PAGE_SIZE / sizeof(*mapping)) return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; - copy_op.dest.u.gmfn = virt_to_gfn(mapping + off); - copy_op.dest.offset = xen_offset_in_page(mapping + off); + copy_op[0].dest.u.gmfn = virt_to_gfn(mapping + off); + copy_op[0].dest.offset = xen_offset_in_page(mapping + off); + if (copy_op[0].dest.offset + copy_op[0].len > XEN_PAGE_SIZE) { + copy_op[1] = copy_op[0]; + copy_op[1].source.offset = XEN_PAGE_SIZE - copy_op[0].dest.offset; + copy_op[1].dest.u.gmfn = virt_to_gfn(mapping + off + len); + copy_op[1].dest.offset = 0; + copy_op[1].len = copy_op[0].len - copy_op[1].source.offset; + copy_op[0].len = copy_op[1].source.offset; + nr = 2; + } memcpy(mapping, vif->hash.mapping[vif->hash.mapping_sel], vif->hash.size * sizeof(*mapping)); - if (copy_op.len != 0) { - gnttab_batch_copy(&copy_op, 1); + if (copy_op[0].len != 0) { + gnttab_batch_copy(copy_op, nr); - if (copy_op.status != GNTST_okay) + if (copy_op[0].status != GNTST_okay || + copy_op[nr - 1].status != GNTST_okay) return XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER; } From 2e9361efa707e186d91b938e44f9e326725259f7 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Tue, 25 Sep 2018 10:21:55 +0100 Subject: [PATCH 230/499] net: hns: fix for unmapping problem when SMMU is on If SMMU is on, it is more likely that skb_shinfo(skb)->frags[i] cannot be sent in a single BD. When this happens, the hns_nic_net_xmit_hw function maps the whole data of a frag using skb_frag_dma_map, but unmaps each BD's data individually when tx is done, which causes problems when SMMU is on. This patch fixes the problem by unmapping the whole frag data when tx is done. Signed-off-by: Yunsheng Lin Signed-off-by: Peng Li Reviewed-by: Yisen Zhuang Signed-off-by: Salil Mehta Signed-off-by: David S.
Miller --- drivers/net/ethernet/hisilicon/hns/hnae.c | 2 +- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 30 ++++++++++++------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c b/drivers/net/ethernet/hisilicon/hns/hnae.c index a051e582d541..79d03f8ee7b1 100644 --- a/drivers/net/ethernet/hisilicon/hns/hnae.c +++ b/drivers/net/ethernet/hisilicon/hns/hnae.c @@ -84,7 +84,7 @@ static void hnae_unmap_buffer(struct hnae_ring *ring, struct hnae_desc_cb *cb) if (cb->type == DESC_TYPE_SKB) dma_unmap_single(ring_to_dev(ring), cb->dma, cb->length, ring_to_dma_dir(ring)); - else + else if (cb->length) dma_unmap_page(ring_to_dev(ring), cb->dma, cb->length, ring_to_dma_dir(ring)); } diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index f56855e63c96..5ce23d4b717e 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -40,9 +40,9 @@ #define SKB_TMP_LEN(SKB) \ (((SKB)->transport_header - (SKB)->mac_header) + tcp_hdrlen(SKB)) -static void fill_v2_desc(struct hnae_ring *ring, void *priv, - int size, dma_addr_t dma, int frag_end, - int buf_num, enum hns_desc_type type, int mtu) +static void fill_v2_desc_hw(struct hnae_ring *ring, void *priv, int size, + int send_sz, dma_addr_t dma, int frag_end, + int buf_num, enum hns_desc_type type, int mtu) { struct hnae_desc *desc = &ring->desc[ring->next_to_use]; struct hnae_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; @@ -64,7 +64,7 @@ static void fill_v2_desc(struct hnae_ring *ring, void *priv, desc_cb->type = type; desc->addr = cpu_to_le64(dma); - desc->tx.send_size = cpu_to_le16((u16)size); + desc->tx.send_size = cpu_to_le16((u16)send_sz); /* config bd buffer end */ hnae_set_bit(rrcfv, HNSV2_TXD_VLD_B, 1); @@ -133,6 +133,14 @@ static void fill_v2_desc(struct hnae_ring *ring, void *priv, ring_ptr_move_fw(ring, next_to_use); } +static void fill_v2_desc(struct hnae_ring *ring, void *priv, + int size, dma_addr_t dma, int frag_end, + int buf_num, enum hns_desc_type type, int mtu) +{ + fill_v2_desc_hw(ring, priv, size, size, dma, frag_end, + buf_num, type, mtu); +} + static const struct acpi_device_id hns_enet_acpi_match[] = { { "HISI00C1", 0 }, { "HISI00C2", 0 }, @@ -289,15 +297,15 @@ static void fill_tso_desc(struct hnae_ring *ring, void *priv, /* when the frag size is bigger than hardware, split this frag */ for (k = 0; k < frag_buf_num; k++) - fill_v2_desc(ring, priv, - (k == frag_buf_num - 1) ? + fill_v2_desc_hw(ring, priv, k == 0 ? size : 0, + (k == frag_buf_num - 1) ? sizeoflast : BD_MAX_SEND_SIZE, - dma + BD_MAX_SEND_SIZE * k, - frag_end && (k == frag_buf_num - 1) ? 1 : 0, - buf_num, - (type == DESC_TYPE_SKB && !k) ? + dma + BD_MAX_SEND_SIZE * k, + frag_end && (k == frag_buf_num - 1) ? 1 : 0, + buf_num, + (type == DESC_TYPE_SKB && !k) ? DESC_TYPE_SKB : DESC_TYPE_PAGE, - mtu); + mtu); } netdev_tx_t hns_nic_net_xmit_hw(struct net_device *ndev, From 92ef12b32feab8f277b69e9fb89ede2796777f4d Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 18:21:58 +0200 Subject: [PATCH 231/499] tipc: fix flow control accounting for implicit connect In the case of implicit connect message with data > 1K, the flow control accounting is incorrect. At this state, the socket does not know the peer nodes capability and falls back to legacy flow control by return 1, however the receiver of this message will perform the new block accounting. 
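As an aside, the accounting mismatch is easy to model: the sender charges one legacy unit per message while the receiver accounts in message blocks, so the two sides diverge for any message larger than a block. A standalone sketch follows (userspace C, not tipc code; the 1024-byte block size mirrors tipc's FLOWCTL_BLK_SZ, everything else is illustrative):

```
/* Editorial sketch of legacy vs. block flow-control accounting. */
#include <stdio.h>

#define BLK_SZ 1024

static int sender_units(int msglen, int peer_supports_blocks)
{
	return peer_supports_blocks ? (msglen / BLK_SZ) + 1 : 1;
}

static int receiver_units(int msglen)
{
	/* The receiver always uses block accounting here. */
	return (msglen / BLK_SZ) + 1;
}

int main(void)
{
	int len = 3000;	/* implicit-connect message with > 1K of data */

	printf("sender charges %d unit(s), receiver accounts %d\n",
	       sender_units(len, 0), receiver_units(len));
	return 0;
}
```

For a 3000-byte message the sender charges 1 unit while the receiver accounts 3, which is exactly the divergence described next.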
This mismatch leads to slack in the flow control window and eventually to traffic disturbance. In this commit, we perform tipc_node_get_capabilities() at implicit connect and perform accounting based on the peer's capability. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3f03ddd0e35b..b6f99b021d09 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1419,8 +1419,10 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) /* Handle implicit connection setup */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dlen); - if (dlen && (dlen == rc)) + if (dlen && dlen == rc) { + tsk->peer_caps = tipc_node_get_capabilities(net, dnode); tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); + } return rc; } From 69383c5913a5d74b4aa7e2b78a5cf14ebfbba538 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 25 Sep 2018 14:57:42 +0000 Subject: [PATCH 232/499] ovl: make symbol 'ovl_aops' static Fixes the following sparse warning: fs/overlayfs/inode.c:507:39: warning: symbol 'ovl_aops' was not declared. Should it be static? Fixes: 5b910bd615ba ("ovl: fix GPF in swapfile_activate of file from overlayfs over xfs") Signed-off-by: Wei Yongjun Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b6ac545b5a32..3b7ed5d2279c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -504,7 +504,7 @@ static const struct inode_operations ovl_special_inode_operations = { .update_time = ovl_update_time, }; -const struct address_space_operations ovl_aops = { +static const struct address_space_operations ovl_aops = { /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ .direct_IO = noop_direct_IO, }; From a9360abd3de0aad745d25d003923d56afb28a04b Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 25 Sep 2018 11:23:55 +0300 Subject: [PATCH 233/499] IB/uverbs: Free uapi on destroy Make sure we free struct uverbs_api once we clean the radix tree. It was allocated by uverbs_alloc_api(). Fixes: 9ed3e5f44772 ("IB/uverbs: Build the specs into a radix tree at runtime") Reported-by: Bart Van Assche Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_uapi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 73ea6f0db88f..be854628a7c6 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -248,6 +248,7 @@ void uverbs_destroy_api(struct uverbs_api *uapi) kfree(rcu_dereference_protected(*slot, true)); radix_tree_iter_delete(&uapi->radix, &iter, slot); } + kfree(uapi); } struct uverbs_api *uverbs_alloc_api( From e8ef090a614292db01b5956a6f5467afbe6c5cf7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 25 Sep 2018 12:11:12 +0300 Subject: [PATCH 234/499] IB/mlx5: Destroy the DEVX object upon error flow Upon DEVX object creation, the object must be destroyed if any subsequent step of the creation flow fails.
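The pattern of the fix, in a generic sketch (plain C with hypothetical helpers, not the mlx5 API): once the firmware-side object exists, a later failure must jump to a label that destroys it before the bookkeeping memory is freed, otherwise the hardware object leaks.

```
/* Editorial sketch of the unwind ordering the fix establishes. */
#include <stdlib.h>

int fw_create(void)          { return 0; }
void fw_destroy(void)        { }
int copy_out_result(void)    { return -1; /* pretend the copy fails */ }

int create_obj(void)
{
	void *obj = malloc(64);
	int err;

	if (!obj)
		return -1;

	err = fw_create();
	if (err)
		goto obj_free;		/* nothing exists in firmware yet */

	err = copy_out_result();
	if (err)
		goto obj_destroy;	/* firmware object exists: tear it down */

	return 0;

obj_destroy:
	fw_destroy();
obj_free:
	free(obj);
	return err;
}
```

The patch below adds exactly this obj_destroy step in front of the existing obj_free label.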
Fixes: 7efce3691d33 ("IB/mlx5: Add obj create and destroy functionality") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index ac116d63e466..f2f11e652dcd 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -723,6 +723,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; struct devx_obj *obj; int err; @@ -754,10 +755,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); if (err) - goto obj_free; + goto obj_destroy; return 0; +obj_destroy: + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); return err; From 64e9e22e68512da8df3c9a7430f07621e48db3c2 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 23 Aug 2018 23:36:00 +0200 Subject: [PATCH 235/499] soc: fsl: qbman: qman: avoid allocating from non existing gen_pool If the qman driver didn't probe, calling qman_alloc_fqid_range, qman_alloc_pool_range or qman_alloc_cgrid_range (as done in dpaa_eth) will pass a NULL pointer to gen_pool_alloc, leading to a NULL pointer dereference. Signed-off-by: Alexandre Belloni Reviewed-by: Roy Pledge Signed-off-by: Li Yang (cherry picked from commit f72487a2788aa70c3aee1d0ebd5470de9bac953a) Signed-off-by: Olof Johansson --- drivers/soc/fsl/qbman/qman.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c index ecb22749df0b..8cc015183043 100644 --- a/drivers/soc/fsl/qbman/qman.c +++ b/drivers/soc/fsl/qbman/qman.c @@ -2729,6 +2729,9 @@ static int qman_alloc_range(struct gen_pool *p, u32 *result, u32 cnt) { unsigned long addr; + if (!p) + return -ENODEV; + addr = gen_pool_alloc(p, cnt); if (!addr) return -ENOMEM; From 96fc74333f84cfdf8d434c6c07254e215e2aad00 Mon Sep 17 00:00:00 2001 From: Zhao Qiang Date: Thu, 1 Feb 2018 14:54:32 +0800 Subject: [PATCH 236/499] soc: fsl: qe: Fix copy/paste bug in ucc_get_tdm_sync_shift() There is a copy and paste bug so we accidentally use the RX_ shift when we're in TX_ mode. Fixes: bb8b2062aff3 ("fsl/qe: setup clock source for TDM mode") Signed-off-by: Dan Carpenter Signed-off-by: Zhao Qiang Signed-off-by: Li Yang (cherry picked from commit 3cb31b634052ed458922e0c8e2b4b093d7fb60b9) Signed-off-by: Olof Johansson --- drivers/soc/fsl/qe/ucc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/fsl/qe/ucc.c b/drivers/soc/fsl/qe/ucc.c index c646d8713861..681f7d4b7724 100644 --- a/drivers/soc/fsl/qe/ucc.c +++ b/drivers/soc/fsl/qe/ucc.c @@ -626,7 +626,7 @@ static u32 ucc_get_tdm_sync_shift(enum comm_dir mode, u32 tdm_num) { u32 shift; - shift = (mode == COMM_DIR_RX) ? RX_SYNC_SHIFT_BASE : RX_SYNC_SHIFT_BASE; + shift = (mode == COMM_DIR_RX) ? 
RX_SYNC_SHIFT_BASE : TX_SYNC_SHIFT_BASE; shift -= tdm_num * 2; return shift; From 5c5702e259dc66e6fceed5117effab79c186e87a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 25 Sep 2018 12:10:40 +0300 Subject: [PATCH 237/499] RDMA/core: Set right entry state before releasing reference Currently add_modify_gid() for the IB link layer has the following issue in the cache update path. When a GID update event occurs, the core releases the reference to the GID table without updating its state and/or entry pointer. CPU-0 CPU-1 ------ ----- ib_cache_update() IPoIB ULP add_modify_gid() [..] put_gid_entry() refcnt = 0, but state = valid, entry is valid. (work item is not yet executed). ipoib_create_ah() rdma_create_ah() rdma_get_gid_attr() <-- Tries to acquire gid_attr which has refcnt = 0. This is incorrect. The GID entry state and entry pointer provide the accurate GID entry state. Such fields must be updated under the rwlock to protect against readers, and they must be in a sane state before the refcount can drop to zero. Otherwise the above race condition can happen, leading to a use-after-free situation. The following backtrace has been observed when a cache update for an IB port is triggered while the IPoIB ULP is creating an AH. Therefore, when updating a GID entry, first mark a valid entry as invalid through its state and set the barrier so that no callers can acquire the GID entry, and only then release the reference to it. refcount_t: increment on 0; use-after-free. WARNING: CPU: 4 PID: 29106 at lib/refcount.c:153 refcount_inc_checked+0x30/0x50 Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core] RIP: 0010:refcount_inc_checked+0x30/0x50 RSP: 0018:ffff8802ad36f600 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 0000000000000008 RDI: ffffffff86710100 RBP: ffff8802d6e60a30 R08: ffffed005d67bf8b R09: ffffed005d67bf8b R10: 0000000000000001 R11: ffffed005d67bf8a R12: ffff88027620cee8 R13: ffff8802d6e60988 R14: ffff8802d6e60a78 R15: 0000000000000202 FS: 0000000000000000(0000) GS:ffff8802eb200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3ab35e5c88 CR3: 00000002ce84a000 CR4: 00000000000006e0 IPv6: ADDRCONF(NETDEV_CHANGE): ib1: link becomes ready Call Trace: rdma_get_gid_attr+0x220/0x310 [ib_core] ? lock_acquire+0x145/0x3a0 rdma_fill_sgid_attr+0x32c/0x470 [ib_core] rdma_create_ah+0x89/0x160 [ib_core] ? rdma_fill_sgid_attr+0x470/0x470 [ib_core] ? ipoib_create_ah+0x52/0x260 [ib_ipoib] ipoib_create_ah+0xf5/0x260 [ib_ipoib] ipoib_mcast_join_complete+0xbbe/0x2540 [ib_ipoib] Fixes: b150c3862d21 ("IB/core: Introduce GID entry reference counts") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 98 ++++++++++++++++----------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 0bee1f4b914e..3208ad6ad540 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -337,55 +337,6 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) return 0; } -/** - * add_modify_gid - Add or modify GID table entry - * - * @table: GID table in which GID to be added or modified - * @attr: Attributes of the GID - * - * Returns 0 on success or appropriate error code. It accepts zero - * GID addition for non RoCE ports for HCA's who report them as valid - * GID. However such zero GIDs are not added to the cache.
- */ -static int add_modify_gid(struct ib_gid_table *table, - const struct ib_gid_attr *attr) -{ - struct ib_gid_table_entry *entry; - int ret = 0; - - /* - * Invalidate any old entry in the table to make it safe to write to - * this index. - */ - if (is_gid_entry_valid(table->data_vec[attr->index])) - put_gid_entry(table->data_vec[attr->index]); - - /* - * Some HCA's report multiple GID entries with only one valid GID, and - * leave other unused entries as the zero GID. Convert zero GIDs to - * empty table entries instead of storing them. - */ - if (rdma_is_zero_gid(&attr->gid)) - return 0; - - entry = alloc_gid_entry(attr); - if (!entry) - return -ENOMEM; - - if (rdma_protocol_roce(attr->device, attr->port_num)) { - ret = add_roce_gid(entry); - if (ret) - goto done; - } - - store_gid_entry(table, entry); - return 0; - -done: - put_gid_entry(entry); - return ret; -} - /** * del_gid - Delete GID table entry * @@ -419,6 +370,55 @@ static void del_gid(struct ib_device *ib_dev, u8 port, put_gid_entry_locked(entry); } +/** + * add_modify_gid - Add or modify GID table entry + * + * @table: GID table in which GID to be added or modified + * @attr: Attributes of the GID + * + * Returns 0 on success or appropriate error code. It accepts zero + * GID addition for non RoCE ports for HCA's who report them as valid + * GID. However such zero GIDs are not added to the cache. + */ +static int add_modify_gid(struct ib_gid_table *table, + const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry; + int ret = 0; + + /* + * Invalidate any old entry in the table to make it safe to write to + * this index. + */ + if (is_gid_entry_valid(table->data_vec[attr->index])) + del_gid(attr->device, attr->port_num, table, attr->index); + + /* + * Some HCA's report multiple GID entries with only one valid GID, and + * leave other unused entries as the zero GID. Convert zero GIDs to + * empty table entries instead of storing them. + */ + if (rdma_is_zero_gid(&attr->gid)) + return 0; + + entry = alloc_gid_entry(attr); + if (!entry) + return -ENOMEM; + + if (rdma_protocol_roce(attr->device, attr->port_num)) { + ret = add_roce_gid(entry); + if (ret) + goto done; + } + + store_gid_entry(table, entry); + return 0; + +done: + put_gid_entry(entry); + return ret; +} + /* rwlock should be read locked, or lock should be held */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, From 974c24c5bed75b53e229a6f68a0533b6d5f48feb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Sep 2018 11:00:49 +0200 Subject: [PATCH 238/499] dma-mapping: add the missing ARCH_HAS_SYNC_DMA_FOR_CPU_ALL declaration The patch adding the infrastructure failed to actually add the symbol declaration, oops.. 
Fixes: faef87723a ("dma-noncoherent: add a arch_sync_dma_for_cpu_all hook") Signed-off-by: Christoph Hellwig Reviewed-by: Paul Burton Acked-by: Florian Fainelli Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9bd54304446f..1b1d63b3634b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -23,6 +23,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU bool select NEED_DMA_MAP_STATE +config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL + bool + config DMA_DIRECT_OPS bool depends on HAS_DMA From bb830add192e9d8338082c0fc2c209e23b43d865 Mon Sep 17 00:00:00 2001 From: Susobhan Dey Date: Tue, 25 Sep 2018 12:29:15 -0700 Subject: [PATCH 239/499] nvme: properly propagate errors in nvme_mpath_init Signed-off-by: Susobhan Dey Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a9562881d4e..9fe3fff818b8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -537,8 +537,10 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) INIT_WORK(&ctrl->ana_work, nvme_ana_work); ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); - if (!ctrl->ana_log_buf) + if (!ctrl->ana_log_buf) { + error = -ENOMEM; goto out; + } error = nvme_read_ana_log(ctrl, true); if (error) @@ -547,7 +549,7 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) out_free_ana_log_buf: kfree(ctrl->ana_log_buf); out: - return -ENOMEM; + return error; } void nvme_mpath_uninit(struct nvme_ctrl *ctrl) From 530ca2c9bd6949c72c9b5cfc330cb3dbccaa3f5b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 25 Sep 2018 10:36:20 -0600 Subject: [PATCH 240/499] blk-mq: Allow blocking queue tag iter callbacks A recent commit runs tag iterator callbacks under the rcu read lock, but existing callbacks do not satisfy the non-blocking requirement. The commit intended to prevent an iterator from accessing a queue that's being modified. This patch fixes the original issue by taking a queue reference instead of reading it, which allows callbacks to make blocking calls. Fixes: f5bbbbe4d6357 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter") Acked-by: Jianchao Wang Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 94e1ed667b6e..41317c50a446 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -322,16 +322,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, /* * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and - * queue_hw_ctx after freeze the queue. So we could use q_usage_counter - * to avoid race with it. __blk_mq_update_nr_hw_queues will users - * synchronize_rcu to ensure all of the users go out of the critical - * section below and see zeroed q_usage_counter. + * queue_hw_ctx after freeze the queue, so we use q_usage_counter + * to avoid race with it. 
*/ - rcu_read_lock(); - if (percpu_ref_is_zero(&q->q_usage_counter)) { - rcu_read_unlock(); + if (!percpu_ref_tryget(&q->q_usage_counter)) return; - } queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; @@ -347,7 +342,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - rcu_read_unlock(); + blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, From 082dc55e3c5e559b0e6855c0de23ebd674bcc91e Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Thu, 20 Sep 2018 18:34:16 +0200 Subject: [PATCH 241/499] ARM: dts: stm32: update SPI6 dmas property on stm32mp157c Remove unused parameter from SPI6 dmas property on stm32mp157c SoC. Fixes: dc3f8c86c10d ("ARM: dts: stm32: add SPI support on stm32mp157c") Signed-off-by: Amelie Delaunay Signed-off-by: Alexandre Torgue [olof: Without this patch, SPI6 will fall back to interrupt mode with lower performance] Signed-off-by: Olof Johansson --- arch/arm/boot/dts/stm32mp157c.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/stm32mp157c.dtsi b/arch/arm/boot/dts/stm32mp157c.dtsi index 661be948ab74..185541a5b69f 100644 --- a/arch/arm/boot/dts/stm32mp157c.dtsi +++ b/arch/arm/boot/dts/stm32mp157c.dtsi @@ -1078,8 +1078,8 @@ spi6: spi@5c001000 { interrupts = ; clocks = <&rcc SPI6_K>; resets = <&rcc SPI6_R>; - dmas = <&mdma1 34 0x0 0x40008 0x0 0x0 0>, - <&mdma1 35 0x0 0x40002 0x0 0x0 0>; + dmas = <&mdma1 34 0x0 0x40008 0x0 0x0>, + <&mdma1 35 0x0 0x40002 0x0 0x0>; dma-names = "rx", "tx"; status = "disabled"; }; From 94b6ddce71780575fbbf9d2c36afc8440e61a281 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 21:56:57 +0200 Subject: [PATCH 242/499] tipc: reset bearer if device carrier not ok If we detect that the underlying carrier detects errors and goes down, we reset the bearer. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 418f03d0be90..645c16052052 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -609,16 +609,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, switch (evt) { case NETDEV_CHANGE: - if (netif_carrier_ok(dev)) + if (netif_carrier_ok(dev) && netif_oper_up(dev)) { + test_and_set_bit_lock(0, &b->up); break; - /* else: fall through */ - case NETDEV_UP: - test_and_set_bit_lock(0, &b->up); - break; + } + /* fall through */ case NETDEV_GOING_DOWN: clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; case NETDEV_UP: test_and_set_bit_lock(0, &b->up); break; case NETDEV_CHANGEMTU: if (tipc_mtu_bad(dev, 0)) { bearer_disable(net, b); From 3f32d0be6c16b902b687453c962d17eea5b8ea19 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 25 Sep 2018 22:09:10 +0200 Subject: [PATCH 243/499] tipc: lock wakeup & inputq at tipc_link_reset() In tipc_link_reset() we copy the wakeup queue to input queue using skb_queue_splice_init(link->wakeupq, link->inputq). This is performed without holding any locks. The lists might be simultaneously accessed by other cpu threads in tipc_sk_rcv(), something leading to random missing packets. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: Jon Maloy Signed-off-by: David S.
Miller --- net/tipc/link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index b1f0bee54eac..26cc033ee167 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -841,9 +841,14 @@ void tipc_link_reset(struct tipc_link *l) l->in_session = false; l->session++; l->mtu = l->advertised_mtu; + spin_lock_bh(&l->wakeupq.lock); + spin_lock_bh(&l->inputq->lock); + skb_queue_splice_init(&l->wakeupq, l->inputq); + spin_unlock_bh(&l->inputq->lock); + spin_unlock_bh(&l->wakeupq.lock); + __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); - skb_queue_splice_init(&l->wakeupq, l->inputq); __skb_queue_purge(&l->backlogq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; From 5ebb1bc2d63d90dd204169e21fd7a0b4bb8c776e Mon Sep 17 00:00:00 2001 From: Arindam Nath Date: Tue, 18 Sep 2018 15:40:58 +0530 Subject: [PATCH 244/499] iommu/amd: Return devid as alias for ACPI HID devices ACPI HID devices do not actually have an alias for them in the IVRS. But dev_data->alias is still used for indexing into the IOMMU device table for devices being handled by the IOMMU. So for ACPI HID devices, we simply return the corresponding devid as an alias, as parsed from IVRS table. Signed-off-by: Arindam Nath Fixes: 2bf9a0a12749 ('iommu/amd: Add iommu support for ACPI HID devices') Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4e04fff23977..73e47d93e7a0 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -246,7 +246,13 @@ static u16 get_alias(struct device *dev) /* The callers make sure that get_device_id() does not fail here */ devid = get_device_id(dev); + + /* For ACPI HID devices, we simply return the devid as such */ + if (!dev_is_pci(dev)) + return devid; + ivrs_alias = amd_iommu_alias_table[devid]; + pci_for_each_dma_alias(pdev, __last_alias, &pci_alias); if (ivrs_alias == pci_alias) From 8105f9b8a8879bff7f1d43d0720c993a99c9d135 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 22 Sep 2018 18:35:31 +0200 Subject: [PATCH 245/499] mac80211: allocate TXQs for active monitor interfaces Monitor mode interfaces with the active flag are passed down to the driver. Drivers using TXQ expect that all interfaces have allocated TXQs before they get added. Fixes: 79af1f866193d ("mac80211: avoid allocating TXQs that won't be used") Cc: stable@vger.kernel.org Reported-by: Catrinel Catrinescu Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5e6cf2cee965..5836ddeac9e3 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1756,7 +1756,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, if (local->ops->wake_tx_queue && type != NL80211_IFTYPE_AP_VLAN && - type != NL80211_IFTYPE_MONITOR) + (type != NL80211_IFTYPE_MONITOR || + (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; From 0bcbf6518456f63038a290bd359237d31f6f8ac3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 23 Sep 2018 11:59:13 -0700 Subject: [PATCH 246/499] cfg80211: fix reg_query_regdb_wmm kernel-doc Drop @ptr from kernel-doc for function reg_query_regdb_wmm(). This function parameter was recently removed so update the kernel-doc to match that and remove the kernel-doc warnings. 
Removes 109 occurrences of this warning message: ../include/net/cfg80211.h:4869: warning: Excess function parameter 'ptr' description in 'reg_query_regdb_wmm' Fixes: 38cb87ee47fb ("cfg80211: make wmm_rule part of the reg_rule structure") Signed-off-by: Randy Dunlap Cc: Stanislaw Gruszka Cc: Johannes Berg Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8ebabc9873d1..4de121e24ce5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4852,8 +4852,6 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried. * @freq: the freqency(in MHz) to be queried. - * @ptr: pointer where the regdb wmm data is to be stored (or %NULL if - * irrelevant). This can be used later for deduplication. * @rule: pointer to store the wmm rule from the regulatory db. * * Self-managed wireless drivers can use this function to query From 30fe6d50eb088783c8729c7d930f65296b2b3fa7 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 25 Sep 2018 11:15:00 +0900 Subject: [PATCH 247/499] nl80211: Fix possible Spectre-v1 for NL80211_TXRATE_HT Use array_index_nospec() to sanitize ridx with respect to speculation. Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4b8ec659e797..bd26230de63e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3756,6 +3756,7 @@ static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, return false; /* check availability */ + ridx = array_index_nospec(ridx, IEEE80211_HT_MCS_MASK_LEN); if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) mcs[ridx] |= rbit; else From 628980e5c8f038f730582c6ee50b7410741cd96e Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 25 Sep 2018 09:41:13 +0200 Subject: [PATCH 248/499] mac80211_hwsim: fix locking when iterating radios during ns exit The cleanup of radios during namespace exit has recently been reworked to directly delete a radio while temporarily releasing the spinlock, fixing a race condition between the work-queue execution and namespace exits. However, the temporary unlock allows unsafe modifications on the iterated list, resulting in a potential crash when continuing the iteration of additional radios. Move radios about to destroy to a temporary list, and clean that up after releasing the spinlock once iteration is complete. 
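For readers unfamiliar with the idiom this fix relies on, the sketch below shows the same move-to-a-private-list pattern in isolation. The widget/widget_pool structures and the widget_destroy() helper are invented for illustration and are not part of the hwsim driver; only the locking shape matches the fix.

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct widget {				/* hypothetical example object */
	struct list_head node;
	int group;
};

struct widget_pool {			/* hypothetical shared container */
	spinlock_t lock;
	struct list_head widgets;
};

static void widget_destroy(struct widget *w)
{
	/* stand-in for the expensive, possibly sleeping teardown */
	kfree(w);
}

static void destroy_matching_widgets(struct widget_pool *pool, int group)
{
	struct widget *w, *tmp;
	LIST_HEAD(doomed);		/* private, on-stack list head */

	spin_lock_bh(&pool->lock);
	list_for_each_entry_safe(w, tmp, &pool->widgets, node) {
		if (w->group != group)
			continue;
		list_move(&w->node, &doomed);	/* unlink from the shared list */
	}
	spin_unlock_bh(&pool->lock);

	/* The shared lock is dropped; only this thread can see &doomed. */
	list_for_each_entry_safe(w, tmp, &doomed, node) {
		list_del(&w->node);
		widget_destroy(w);
	}
}

Because the doomed entries are unlinked while the lock is held, the later teardown loop cannot race with concurrent modification of the shared list, and it is free to block.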
Fixes: 8cfd36a0b53a ("mac80211_hwsim: fix use-after-free bug in hwsim_exit_net") Signed-off-by: Martin Willi Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 1068757ec42e..f1150d321875 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3646,6 +3646,7 @@ static __net_init int hwsim_init_net(struct net *net) static void __net_exit hwsim_exit_net(struct net *net) { struct mac80211_hwsim_data *data, *tmp; + LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(data, tmp, &hwsim_radios, list) { @@ -3656,17 +3657,19 @@ static void __net_exit hwsim_exit_net(struct net *net) if (data->netgroup == hwsim_net_get_netgroup(&init_net)) continue; - list_del(&data->list); + list_move(&data->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &data->rht, hwsim_rht_params); hwsim_radios_generation++; - spin_unlock_bh(&hwsim_radio_lock); + } + spin_unlock_bh(&hwsim_radio_lock); + + list_for_each_entry_safe(data, tmp, &list, list) { + list_del(&data->list); mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), NULL); - spin_lock_bh(&hwsim_radio_lock); } - spin_unlock_bh(&hwsim_radio_lock); ida_simple_remove(&hwsim_netgroup_ida, hwsim_net_get_netgroup(net)); } From f1c47eb61d52379de5747d02bb36be20d7a2d0d3 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 25 Sep 2018 09:41:14 +0200 Subject: [PATCH 249/499] mac80211_hwsim: fix race in radio destruction from netlink notifier The asynchronous destruction from a work-queue of radios tagged with destroy-on-close may race with the owning namespace about to exit, resulting in potential use-after-free of that namespace. Instead of using a work-queue, move radios about to destroy to a temporary list, which can be worked on synchronously after releasing the lock. This should be safe to do from the netlink socket notifier, as the namespace is guaranteed to not get released. 
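As context for why the synchronous approach is safe, the rough shape of a netlink socket notifier doing its cleanup inline is sketched below; the teardown helper is hypothetical, and only the NETLINK_URELEASE handling mirrors what the driver does.

#include <linux/netlink.h>
#include <linux/notifier.h>

/* Hypothetical synchronous teardown of everything owned by a portid. */
static void teardown_owned_objects(u32 portid)
{
	/* walk private lists, unlink and free objects, etc. */
}

static int example_netlink_notify(struct notifier_block *nb,
				  unsigned long state, void *_notify)
{
	struct netlink_notify *notify = _notify;

	if (state != NETLINK_URELEASE)
		return NOTIFY_DONE;

	/*
	 * Running the teardown here, instead of queueing a work item,
	 * keeps it ordered against namespace exit: the netns that owns
	 * the closing socket is still alive for the duration of this
	 * callback.
	 */
	teardown_owned_objects(notify->portid);

	return NOTIFY_DONE;
}

static struct notifier_block example_netlink_notifier = {
	.notifier_call = example_netlink_notify,
};

/* Registered once at init with netlink_register_notifier(&example_netlink_notifier). */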
Signed-off-by: Martin Willi Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index f1150d321875..6fccb4d717db 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -520,7 +520,6 @@ struct mac80211_hwsim_data { int channels, idx; bool use_chanctx; bool destroy_on_close; - struct work_struct destroy_work; u32 portid; char alpha2[2]; const struct ieee80211_regdomain *regd; @@ -3565,30 +3564,27 @@ static struct genl_family hwsim_genl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; -static void destroy_radio(struct work_struct *work) -{ - struct mac80211_hwsim_data *data = - container_of(work, struct mac80211_hwsim_data, destroy_work); - - hwsim_radios_generation++; - mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy), NULL); -} - static void remove_user_radios(u32 portid) { struct mac80211_hwsim_data *entry, *tmp; + LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(entry, tmp, &hwsim_radios, list) { if (entry->destroy_on_close && entry->portid == portid) { - list_del(&entry->list); + list_move(&entry->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &entry->rht, hwsim_rht_params); - INIT_WORK(&entry->destroy_work, destroy_radio); - queue_work(hwsim_wq, &entry->destroy_work); + hwsim_radios_generation++; } } spin_unlock_bh(&hwsim_radio_lock); + + list_for_each_entry_safe(entry, tmp, &list, list) { + list_del(&entry->list); + mac80211_hwsim_del_radio(entry, wiphy_name(entry->hw->wiphy), + NULL); + } } static int mac80211_hwsim_netlink_notify(struct notifier_block *nb, From 28ef8b49a338dc1844e86b7954cfffc7dfa2660a Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 25 Sep 2018 09:51:02 +0200 Subject: [PATCH 250/499] mac80211_hwsim: do not omit multicast announce of first added radio The allocation of hwsim radio identifiers uses a post-increment from 0, so the first radio has idx 0. This idx is explicitly excluded from multicast announcements ever since, but it is unclear why. Drop that idx check and announce the first radio as well. This makes userspace happy if it relies on these events. Signed-off-by: Martin Willi Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 6fccb4d717db..07442ada6dd0 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2934,8 +2934,7 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, hwsim_radios_generation++; spin_unlock_bh(&hwsim_radio_lock); - if (idx > 0) - hwsim_mcast_new_radio(idx, info, param); + hwsim_mcast_new_radio(idx, info, param); return idx; From cb28c306b93b71f2741ce1a5a66289db26715f4d Mon Sep 17 00:00:00 2001 From: Matias Karhumaa Date: Wed, 26 Sep 2018 09:13:46 +0300 Subject: [PATCH 251/499] Bluetooth: SMP: fix crash in unpairing In case unpair_device() was called through mgmt interface at the same time when pairing was in progress, Bluetooth kernel module crash was seen. [ 600.351225] general protection fault: 0000 [#1] SMP PTI [ 600.351235] CPU: 1 PID: 11096 Comm: btmgmt Tainted: G OE 4.19.0-rc1+ #1 [ 600.351238] Hardware name: Dell Inc. 
Latitude E5440/08RCYC, BIOS A18 05/14/2017 [ 600.351272] RIP: 0010:smp_chan_destroy.isra.10+0xce/0x2c0 [bluetooth] [ 600.351276] Code: c0 0f 84 b4 01 00 00 80 78 28 04 0f 84 53 01 00 00 4d 85 ed 0f 85 ab 00 00 00 48 8b 08 48 8b 50 08 be 10 00 00 00 48 89 51 08 <48> 89 0a 48 b9 00 02 00 00 00 00 ad de 48 89 48 08 48 8b 83 00 01 [ 600.351279] RSP: 0018:ffffa9be839b3b50 EFLAGS: 00010246 [ 600.351282] RAX: ffff9c999ac565a0 RBX: ffff9c9996e98c00 RCX: ffff9c999aa28b60 [ 600.351285] RDX: dead000000000200 RSI: 0000000000000010 RDI: ffff9c999e403500 [ 600.351287] RBP: ffffa9be839b3b70 R08: 0000000000000000 R09: ffffffff92a25c00 [ 600.351290] R10: ffffa9be839b3ae8 R11: 0000000000000001 R12: ffff9c995375b800 [ 600.351292] R13: 0000000000000000 R14: ffff9c99619a5000 R15: ffff9c9962a01c00 [ 600.351295] FS: 00007fb2be27c700(0000) GS:ffff9c999e880000(0000) knlGS:0000000000000000 [ 600.351298] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.351300] CR2: 00007fb2bdadbad0 CR3: 000000041c328001 CR4: 00000000001606e0 [ 600.351302] Call Trace: [ 600.351325] smp_failure+0x4f/0x70 [bluetooth] [ 600.351345] smp_cancel_pairing+0x74/0x80 [bluetooth] [ 600.351370] unpair_device+0x1c1/0x330 [bluetooth] [ 600.351399] hci_sock_sendmsg+0x960/0x9f0 [bluetooth] [ 600.351409] ? apparmor_socket_sendmsg+0x1e/0x20 [ 600.351417] sock_sendmsg+0x3e/0x50 [ 600.351422] sock_write_iter+0x85/0xf0 [ 600.351429] do_iter_readv_writev+0x12b/0x1b0 [ 600.351434] do_iter_write+0x87/0x1a0 [ 600.351439] vfs_writev+0x98/0x110 [ 600.351443] ? ep_poll+0x16d/0x3d0 [ 600.351447] ? ep_modify+0x73/0x170 [ 600.351451] do_writev+0x61/0xf0 [ 600.351455] ? do_writev+0x61/0xf0 [ 600.351460] __x64_sys_writev+0x1c/0x20 [ 600.351465] do_syscall_64+0x5a/0x110 [ 600.351471] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 600.351474] RIP: 0033:0x7fb2bdb62fe0 [ 600.351477] Code: 73 01 c3 48 8b 0d b8 6e 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 69 c7 2c 00 00 75 10 b8 14 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 de 80 01 00 48 89 04 24 [ 600.351479] RSP: 002b:00007ffe062cb8f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 [ 600.351484] RAX: ffffffffffffffda RBX: 000000000255b3d0 RCX: 00007fb2bdb62fe0 [ 600.351487] RDX: 0000000000000001 RSI: 00007ffe062cb920 RDI: 0000000000000004 [ 600.351490] RBP: 00007ffe062cb920 R08: 000000000255bd80 R09: 0000000000000000 [ 600.351494] R10: 0000000000000353 R11: 0000000000000246 R12: 0000000000000001 [ 600.351497] R13: 00007ffe062cbbe0 R14: 0000000000000000 R15: 0000000000000000 [ 600.351501] Modules linked in: algif_hash algif_skcipher af_alg cmac ipt_MASQUERADE nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo iptable_nat nf_nat_ipv4 xt_addrtype iptable_filter ip_tables xt_conntrack x_tables nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c br_netfilter bridge stp llc overlay arc4 nls_iso8859_1 dm_crypt intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp dell_laptop kvm_intel crct10dif_pclmul dell_smm_hwmon crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper intel_cstate intel_rapl_perf uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_common videodev media hid_multitouch input_leds joydev serio_raw dell_wmi snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic dell_smbios dcdbas sparse_keymap [ 600.351569] snd_hda_intel btusb snd_hda_codec btrtl btbcm btintel snd_hda_core bluetooth(OE) snd_hwdep snd_pcm iwlmvm ecdh_generic wmi_bmof dell_wmi_descriptor snd_seq_midi mac80211 snd_seq_midi_event lpc_ich 
iwlwifi snd_rawmidi snd_seq snd_seq_device snd_timer cfg80211 snd soundcore mei_me mei dell_rbtn dell_smo8800 mac_hid parport_pc ppdev lp parport autofs4 hid_generic usbhid hid i915 nouveau kvmgt vfio_mdev mdev vfio_iommu_type1 vfio kvm irqbypass i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect sysimgblt mxm_wmi psmouse ahci sdhci_pci cqhci libahci fb_sys_fops sdhci drm e1000e video wmi [ 600.351637] ---[ end trace e49e9f1df09c94fb ]--- [ 600.351664] RIP: 0010:smp_chan_destroy.isra.10+0xce/0x2c0 [bluetooth] [ 600.351666] Code: c0 0f 84 b4 01 00 00 80 78 28 04 0f 84 53 01 00 00 4d 85 ed 0f 85 ab 00 00 00 48 8b 08 48 8b 50 08 be 10 00 00 00 48 89 51 08 <48> 89 0a 48 b9 00 02 00 00 00 00 ad de 48 89 48 08 48 8b 83 00 01 [ 600.351669] RSP: 0018:ffffa9be839b3b50 EFLAGS: 00010246 [ 600.351672] RAX: ffff9c999ac565a0 RBX: ffff9c9996e98c00 RCX: ffff9c999aa28b60 [ 600.351674] RDX: dead000000000200 RSI: 0000000000000010 RDI: ffff9c999e403500 [ 600.351676] RBP: ffffa9be839b3b70 R08: 0000000000000000 R09: ffffffff92a25c00 [ 600.351679] R10: ffffa9be839b3ae8 R11: 0000000000000001 R12: ffff9c995375b800 [ 600.351681] R13: 0000000000000000 R14: ffff9c99619a5000 R15: ffff9c9962a01c00 [ 600.351684] FS: 00007fb2be27c700(0000) GS:ffff9c999e880000(0000) knlGS:0000000000000000 [ 600.351686] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.351689] CR2: 00007fb2bdadbad0 CR3: 000000041c328001 CR4: 00000000001606e0 Crash happened because list_del_rcu() was called twice for smp->ltk. This was possible if unpair_device was called right after ltk was generated but before keys were distributed. In this commit smp_cancel_pairing was refactored to cancel pairing if it is in progress and otherwise just removes keys. Once keys are removed from rcu list, pointers to smp context's keys are set to NULL to make sure removed list items are not accessed later. This commit also adjusts the functionality of mgmt unpair_device() little bit. Previously pairing was canceled only if pairing was in state that keys were already generated. With this commit unpair_device() cancels pairing already in earlier states. Bug was found by fuzzing kernel SMP implementation using Synopsys Defensics. Reported-by: Pekka Oikarainen Signed-off-by: Matias Karhumaa Signed-off-by: Johan Hedberg --- net/bluetooth/mgmt.c | 7 ++----- net/bluetooth/smp.c | 29 +++++++++++++++++++++++++---- net/bluetooth/smp.h | 3 ++- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3bdc8f3ca259..ccce954f8146 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2434,9 +2434,8 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, /* LE address type */ addr_type = le_addr_type(cp->addr.type); - hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type); - - err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type); + /* Abort any ongoing SMP pairing. Removes ltk and irk if they exist. */ + err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, MGMT_STATUS_NOT_PAIRED, &rp, @@ -2450,8 +2449,6 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, goto done; } - /* Abort any ongoing SMP pairing */ - smp_cancel_pairing(conn); /* Defer clearing up the connection parameters until closing to * give a chance of keeping them if a repairing happens. 
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 3a7b0773536b..73f7211d0431 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2422,30 +2422,51 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) return ret; } -void smp_cancel_pairing(struct hci_conn *hcon) +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct hci_conn *hcon; + struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; + int err; + err = hci_remove_ltk(hdev, bdaddr, addr_type); + hci_remove_irk(hdev, bdaddr, addr_type); + + hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); + if (!hcon) + goto done; + + conn = hcon->l2cap_data; if (!conn) - return; + goto done; chan = conn->smp; if (!chan) - return; + goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { + /* Set keys to NULL to make sure smp_failure() does not try to + * remove and free already invalidated rcu list entries. */ + smp->ltk = NULL; + smp->slave_ltk = NULL; + smp->remote_irk = NULL; + if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); + err = 0; } l2cap_chan_unlock(chan); + +done: + return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 0ff6247eaa6c..121edadd5f8d 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -181,7 +181,8 @@ enum smp_key_pref { }; /* SMP Commands */ -void smp_cancel_pairing(struct hci_conn *hcon); +int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type); bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref); int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); From 337fe9f5c1e7de1f391c6a692531379d2aa2ee11 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Wed, 26 Sep 2018 02:17:03 -0500 Subject: [PATCH 252/499] drm/syncobj: Don't leak fences when WAIT_FOR_SUBMIT is set We attempt to get fences earlier in the hopes that everything will already have fences and no callbacks will be needed. If we do succeed in getting a fence, getting one a second time will result in a duplicate ref with no unref. This is causing memory leaks in Vulkan applications that create a lot of fences; playing for a few hours can, apparently, bring down the system. 
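The underlying issue is generic reference-counting hygiene rather than anything DRM-specific: if a lookup that takes a reference runs twice against the same slot, the first reference is overwritten and can never be dropped. A minimal sketch with an invented kref-based object (not the DRM fence code) looks like this:

#include <linux/kref.h>
#include <linux/slab.h>

struct tracked {			/* hypothetical refcounted object */
	struct kref ref;
};

static void tracked_release(struct kref *ref)
{
	kfree(container_of(ref, struct tracked, ref));
}

/* Returns the object with an extra reference the caller must put. */
static struct tracked *tracked_get(struct tracked *t)
{
	kref_get(&t->ref);
	return t;
}

static void tracked_put(struct tracked *t)
{
	kref_put(&t->ref, tracked_release);
}

static void fill_slot(struct tracked *src, struct tracked **slot)
{
	/*
	 * Skip slots that already hold a reference; calling the "get"
	 * again would overwrite *slot and leak the first reference,
	 * which is the bug class the added check closes.
	 */
	if (*slot)
		return;
	*slot = tracked_get(src);
}

Each successful fill_slot() must eventually be balanced by exactly one tracked_put(); the guard makes a second invocation idempotent instead of leaky.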
Cc: stable@vger.kernel.org Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107899 Reviewed-by: Chris Wilson Signed-off-by: Jason Ekstrand Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20180926071703.15257-1-jason.ekstrand@intel.com --- drivers/gpu/drm/drm_syncobj.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index adb3cb27d31e..759278fef35a 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -97,6 +97,8 @@ static int drm_syncobj_fence_get_or_add_callback(struct drm_syncobj *syncobj, { int ret; + WARN_ON(*fence); + *fence = drm_syncobj_fence_get(syncobj); if (*fence) return 1; @@ -743,6 +745,9 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs, if (flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT) { for (i = 0; i < count; ++i) { + if (entries[i].fence) + continue; + drm_syncobj_fence_get_or_add_callback(syncobjs[i], &entries[i].fence, &entries[i].syncobj_cb, From f188b99f0b2d33794b4af8a225f95d1e968c0a3f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 26 Sep 2018 15:39:28 -0500 Subject: [PATCH 253/499] ACPI / hotplug / PCI: Don't scan for non-hotplug bridges if slot is not bridge HP 6730b laptop has an ethernet NIC connected to one of the PCIe root ports. The root ports themselves are native PCIe hotplug capable. Now, during boot after PCI devices are scanned the BIOS triggers ACPI bus check directly to the NIC: ACPI: \_SB_.PCI0.RP06.NIC_: Bus check in hotplug_event() It is not clear why it is sending bus check but regardless the ACPI hotplug notify handler calls enable_slot() directly (instead of going through acpiphp_check_bridge() as there is no bridge), which ends up handling special case for non-hotplug bridges with native PCIe hotplug. This results a crash of some kind but the reporter only sees black screen so it is hard to figure out the exact spot and what actually happens. Based on a few fix proposals it was tracked to crash somewhere inside pci_assign_unassigned_bridge_resources(). In any case we should not really be in that special branch at all because the ACPI notify happened to a slot that is not a PCI bridge (it is just a regular PCI device). Fix this so that we only go to that special branch if we are calling enable_slot() for a bridge (e.g., the ACPI notification was for the bridge). Link: https://bugzilla.kernel.org/show_bug.cgi?id=201127 Fixes: 84c8b58ed3ad ("ACPI / hotplug / PCI: Don't scan bridges managed by native hotplug") Reported-by: Peter Anemone Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki CC: stable@vger.kernel.org # v4.18+ --- drivers/pci/hotplug/acpiphp_glue.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index ef0b1b6ba86f..12afa7fdf77e 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -457,17 +457,18 @@ static void acpiphp_native_scan_bridge(struct pci_dev *bridge) /** * enable_slot - enable, configure a slot * @slot: slot to be enabled + * @bridge: true if enable is for the whole bridge (not a single slot) * * This function should be called per *physical slot*, * not per each slot object in ACPI namespace. 
*/ -static void enable_slot(struct acpiphp_slot *slot) +static void enable_slot(struct acpiphp_slot *slot, bool bridge) { struct pci_dev *dev; struct pci_bus *bus = slot->bus; struct acpiphp_func *func; - if (bus->self && hotplug_is_native(bus->self)) { + if (bridge && bus->self && hotplug_is_native(bus->self)) { /* * If native hotplug is used, it will take care of hotplug * slot management and resource allocation for hotplug @@ -701,7 +702,7 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge) trim_stale_devices(dev); /* configure all functions */ - enable_slot(slot); + enable_slot(slot, true); } else { disable_slot(slot); } @@ -785,7 +786,7 @@ static void hotplug_event(u32 type, struct acpiphp_context *context) if (bridge) acpiphp_check_bridge(bridge); else if (!(slot->flags & SLOT_IS_GOING_AWAY)) - enable_slot(slot); + enable_slot(slot, false); break; @@ -973,7 +974,7 @@ int acpiphp_enable_slot(struct acpiphp_slot *slot) /* configure all functions */ if (!(slot->flags & SLOT_ENABLED)) - enable_slot(slot); + enable_slot(slot, false); pci_unlock_rescan_remove(); return 0; From 1d71926bbd59facc4bdb6f13117d3a1aee8b83ba Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 18 Sep 2018 16:16:56 -0700 Subject: [PATCH 254/499] mmc: core: Fix debounce time to use microseconds The debounce value in device tree is in milliseconds but needs to be in microseconds for mmc_gpiod_request_cd(). Fixes: bfd694d5e21c ("mmc: core: Add tunable delay before detecting card after card is inserted") Cc: Shawn Lin Signed-off-by: Tony Lindgren Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index abf9e884386c..f57f5de54206 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -235,7 +235,7 @@ int mmc_of_parse(struct mmc_host *host) host->caps |= MMC_CAP_NEEDS_POLL; ret = mmc_gpiod_request_cd(host, "cd", 0, true, - cd_debounce_delay_ms, + cd_debounce_delay_ms * 1000, &cd_gpio_invert); if (!ret) dev_info(host->parent, "Got CD GPIO\n"); From 854f31ccdd7964c9c2e68da234a3a8aedb51cf6b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 27 Sep 2018 10:55:13 +0900 Subject: [PATCH 255/499] block: fix deadline elevator drain for zoned block devices When the deadline scheduler is used with a zoned block device, writes to a zone will be dispatched one at a time. This causes the warning message: deadline: forced dispatching is broken (nr_sorted=X), please report this to be displayed when switching to another elevator with the legacy I/O path while write requests to a zone are being retained in the scheduler queue. Prevent this message from being displayed when executing elv_drain_elevator() for a zoned block device. __blk_drain_queue() will loop until all writes are dispatched and completed, resulting in the desired elevator queue drain without extensive modifications to the deadline code itself to handle forced-dispatch calls. 
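To illustrate why a zoned device legitimately holds write requests back, here is a simplified, invented sketch of per-zone write locking (hypothetical names, not the mq-deadline implementation): sequential zones only accept writes at the write pointer, so a scheduler dispatches at most one write per zone at a time, and anything else stays queued, which is why nr_sorted can be non-zero during a forced drain.

#include <linux/bitmap.h>
#include <linux/types.h>

/* Hypothetical per-device scheduler state: one lock bit per zone. */
struct zoned_sched {
	unsigned long *zone_bitmap;	/* one bit per zone		*/
	unsigned int zone_shift;	/* log2(zone size in sectors)	*/
};

static unsigned int sector_to_zone(const struct zoned_sched *zs, sector_t sec)
{
	return sec >> zs->zone_shift;
}

/*
 * A write may be dispatched only if its target zone is not already
 * owned by another in-flight write; otherwise it stays in the queue.
 */
static bool try_dispatch_write(struct zoned_sched *zs, sector_t sec)
{
	return !test_and_set_bit(sector_to_zone(zs, sec), zs->zone_bitmap);
}

/* Called on completion so the next queued write to the zone can go. */
static void complete_write(struct zoned_sched *zs, sector_t sec)
{
	clear_bit(sector_to_zone(zs, sec), zs->zone_bitmap);
}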
Signed-off-by: Damien Le Moal Fixes: 8dc8146f9c92 ("deadline-iosched: Introduce zone locking support") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/elevator.c b/block/elevator.c index 6a06b5d040e5..fae58b2f906f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -609,7 +609,7 @@ void elv_drain_elevator(struct request_queue *q) while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) ; - if (q->nr_sorted && printed++ < 10) { + if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) { printk(KERN_ERR "%s: forced dispatching is broken " "(nr_sorted=%u), please report this\n", q->elevator->type->elevator_name, q->nr_sorted); From 36f19d5b4f99fa9fa8263877e5f8e669d7fddc14 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 26 Sep 2018 17:35:14 -0700 Subject: [PATCH 256/499] net/ipv6: Remove extra call to ip6_convert_metrics for multipath case The change to move metrics from the dst to rt6_info moved the call to ip6_convert_metrics from ip6_route_add to ip6_route_info_create. In doing so it makes the call in ip6_route_info_append redundant and actually leaks the metrics installed as part of the ip6_route_info_create. Remove the now unnecessary call. Fixes: d4ead6b34b67f ("net/ipv6: move metrics from dst to rt6_info") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 826b14de7dbb..a366c05a239d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4321,11 +4321,6 @@ static int ip6_route_info_append(struct net *net, if (!nh) return -ENOMEM; nh->fib6_info = rt; - err = ip6_convert_metrics(net, rt, r_cfg); - if (err) { - kfree(nh); - return err; - } memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->next, rt6_nh_list); From e0511f6c1ccdd153cf063764e93ac177a8553c5d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 23 Sep 2018 15:38:21 +0200 Subject: [PATCH 257/499] net: phy: fix WoL handling when suspending the PHY Actually there's nothing wrong with the two changes marked as "Fixes", they just revealed a problem which has been existing before. After having switched r8169 to phylib it was reported that WoL from shutdown doesn't work any longer (WoL from suspend isn't affected). Reason is that during shutdown phy_disconnect()->phy_detach()-> phy_suspend() is called. A similar issue occurs when the phylib state machine calls phy_suspend() when handling state PHY_HALTED. Core of the problem is that phy_suspend() suspends the PHY when it should not due to WoL. phy_suspend() checks for WoL already, but this works only if the PHY driver handles WoL (what is rarely the case). Typically WoL is handled by the MAC driver. phylib knows about this and handles it in mdio_bus_phy_may_suspend(), but that's used only when suspending the system, not in other cases like shutdown. Therefore factor out the relevant check from mdio_bus_phy_may_suspend() to a new function phy_may_suspend() and use it in phy_suspend(). Last but not least change phy_detach() to call phy_suspend() before attached_dev is set to NULL. phy_suspend() accesses attached_dev when checking whether the MAC driver activated WoL. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Fixes: e8cfd9d6c772 ("net: phy: call state machine synchronously in phy_stop") Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 42 ++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index db1172db1e7c..2c80d3c44cd6 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -75,6 +75,26 @@ extern struct phy_driver genphy_10g_driver; static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); +static bool phy_may_suspend(struct phy_device *phydev) +{ + struct net_device *netdev = phydev->attached_dev; + + if (!netdev) + return true; + + /* Don't suspend PHY if the attached netdev parent may wakeup. + * The parent may point to a PCI device, as in tg3 driver. + */ + if (netdev->dev.parent && device_may_wakeup(netdev->dev.parent)) + return false; + + /* Also don't suspend PHY if the netdev itself may wakeup. This + * is the case for devices w/o underlaying pwr. mgmt. aware bus, + * e.g. SoC devices. + */ + return !device_may_wakeup(&netdev->dev); +} + #ifdef CONFIG_PM static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { @@ -93,20 +113,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) if (!netdev) return !phydev->suspended; - /* Don't suspend PHY if the attached netdev parent may wakeup. - * The parent may point to a PCI device, as in tg3 driver. - */ - if (netdev->dev.parent && device_may_wakeup(netdev->dev.parent)) - return false; - - /* Also don't suspend PHY if the netdev itself may wakeup. This - * is the case for devices w/o underlaying pwr. mgmt. aware bus, - * e.g. SoC devices. - */ - if (device_may_wakeup(&netdev->dev)) - return false; - - return true; + return phy_may_suspend(phydev); } static int mdio_bus_phy_suspend(struct device *dev) @@ -1132,9 +1139,9 @@ void phy_detach(struct phy_device *phydev) sysfs_remove_link(&dev->dev.kobj, "phydev"); sysfs_remove_link(&phydev->mdio.dev.kobj, "attached_dev"); } + phy_suspend(phydev); phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; - phy_suspend(phydev); phydev->phylink = NULL; phy_led_triggers_unregister(phydev); @@ -1171,9 +1178,12 @@ int phy_suspend(struct phy_device *phydev) struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; int ret = 0; + if (phydev->suspended) + return 0; + /* If the device has WOL enabled, we cannot suspend the PHY */ phy_ethtool_get_wol(phydev, &wol); - if (wol.wolopts) + if (wol.wolopts || !phy_may_suspend(phydev)) return -EBUSY; if (phydev->drv && phydrv->suspend) From d31d1d03aa909aa6257d9d581eb0eb5d0ed366e2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 26 Sep 2018 20:01:36 -0700 Subject: [PATCH 258/499] Revert "net: phy: fix WoL handling when suspending the PHY" This reverts commit e0511f6c1ccdd153cf063764e93ac177a8553c5d. I commited the wrong version of these changes. Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 42 ++++++++++++++---------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 2c80d3c44cd6..db1172db1e7c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -75,26 +75,6 @@ extern struct phy_driver genphy_10g_driver; static LIST_HEAD(phy_fixup_list); static DEFINE_MUTEX(phy_fixup_lock); -static bool phy_may_suspend(struct phy_device *phydev) -{ - struct net_device *netdev = phydev->attached_dev; - - if (!netdev) - return true; - - /* Don't suspend PHY if the attached netdev parent may wakeup. 
- * The parent may point to a PCI device, as in tg3 driver. - */ - if (netdev->dev.parent && device_may_wakeup(netdev->dev.parent)) - return false; - - /* Also don't suspend PHY if the netdev itself may wakeup. This - * is the case for devices w/o underlaying pwr. mgmt. aware bus, - * e.g. SoC devices. - */ - return !device_may_wakeup(&netdev->dev); -} - #ifdef CONFIG_PM static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) { @@ -113,7 +93,20 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) if (!netdev) return !phydev->suspended; - return phy_may_suspend(phydev); + /* Don't suspend PHY if the attached netdev parent may wakeup. + * The parent may point to a PCI device, as in tg3 driver. + */ + if (netdev->dev.parent && device_may_wakeup(netdev->dev.parent)) + return false; + + /* Also don't suspend PHY if the netdev itself may wakeup. This + * is the case for devices w/o underlaying pwr. mgmt. aware bus, + * e.g. SoC devices. + */ + if (device_may_wakeup(&netdev->dev)) + return false; + + return true; } static int mdio_bus_phy_suspend(struct device *dev) @@ -1139,9 +1132,9 @@ void phy_detach(struct phy_device *phydev) sysfs_remove_link(&dev->dev.kobj, "phydev"); sysfs_remove_link(&phydev->mdio.dev.kobj, "attached_dev"); } - phy_suspend(phydev); phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; + phy_suspend(phydev); phydev->phylink = NULL; phy_led_triggers_unregister(phydev); @@ -1178,12 +1171,9 @@ int phy_suspend(struct phy_device *phydev) struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; int ret = 0; - if (phydev->suspended) - return 0; - /* If the device has WOL enabled, we cannot suspend the PHY */ phy_ethtool_get_wol(phydev, &wol); - if (wol.wolopts || !phy_may_suspend(phydev)) + if (wol.wolopts) return -EBUSY; if (phydev->drv && phydrv->suspend) From 6194114324139dc16f3251c67ed853bd6d4ae056 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 24 Sep 2018 21:58:59 +0200 Subject: [PATCH 259/499] net: core: add member wol_enabled to struct net_device Add flag wol_enabled to struct net_device indicating whether Wake-on-LAN is enabled. As first user phy_suspend() will use it to decide whether PHY can be suspended or not. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Fixes: e8cfd9d6c772 ("net: phy: call state machine synchronously in phy_stop") Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/ethtool.c | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca5ab98053c8..c7861e4b402c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1730,6 +1730,8 @@ enum netdev_priv_flags { * switch driver and used to set the phys state of the * switch port. * + * @wol_enabled: Wake-on-LAN is enabled + * * FIXME: cleanup struct net_device such that network protocol info * moves out. 
*/ @@ -2014,6 +2016,7 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; + unsigned wol_enabled:1; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 234a0ec2e932..0762aaf8e964 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1483,6 +1483,7 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; + int ret; if (!dev->ethtool_ops->set_wol) return -EOPNOTSUPP; @@ -1490,7 +1491,13 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; - return dev->ethtool_ops->set_wol(dev, &wol); + ret = dev->ethtool_ops->set_wol(dev, &wol); + if (ret) + return ret; + + dev->wol_enabled = !!wol.wolopts; + + return 0; } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) From 93f41e67dc8ff0fd987120a6ef2717f21462c534 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 24 Sep 2018 22:01:32 +0200 Subject: [PATCH 260/499] net: phy: fix WoL handling when suspending the PHY Core of the problem is that phy_suspend() suspends the PHY when it should not because of WoL. phy_suspend() checks for WoL already, but this works only if the PHY driver handles WoL (what is rarely the case). Typically WoL is handled by the MAC driver. This patch uses new member wol_enabled of struct net_device as additional criteria in the check when not to suspend the PHY because of WoL. Last but not least change phy_detach() to call phy_suspend() before attached_dev is set to NULL. phy_suspend() accesses attached_dev when checking whether the MAC driver activated WoL. Fixes: f1e911d5d0df ("r8169: add basic phylib support") Fixes: e8cfd9d6c772 ("net: phy: call state machine synchronously in phy_stop") Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index db1172db1e7c..19ab8a7d1e48 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -93,7 +93,12 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) if (!netdev) return !phydev->suspended; - /* Don't suspend PHY if the attached netdev parent may wakeup. + if (netdev->wol_enabled) + return false; + + /* As long as not all affected network drivers support the + * wol_enabled flag, let's check for hints that WoL is enabled. + * Don't suspend PHY if the attached netdev parent may wake up. * The parent may point to a PCI device, as in tg3 driver. 
*/ if (netdev->dev.parent && device_may_wakeup(netdev->dev.parent)) @@ -1132,9 +1137,9 @@ void phy_detach(struct phy_device *phydev) sysfs_remove_link(&dev->dev.kobj, "phydev"); sysfs_remove_link(&phydev->mdio.dev.kobj, "attached_dev"); } + phy_suspend(phydev); phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; - phy_suspend(phydev); phydev->phylink = NULL; phy_led_triggers_unregister(phydev); @@ -1168,12 +1173,13 @@ EXPORT_SYMBOL(phy_detach); int phy_suspend(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver); + struct net_device *netdev = phydev->attached_dev; struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; int ret = 0; /* If the device has WOL enabled, we cannot suspend the PHY */ phy_ethtool_get_wol(phydev, &wol); - if (wol.wolopts) + if (wol.wolopts || (netdev && netdev->wol_enabled)) return -EBUSY; if (phydev->drv && phydrv->suspend) From 5a94df70d3878ae597dd8331ec0add491bdfe851 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Sep 2018 16:51:41 +0100 Subject: [PATCH 261/499] qed: fix spelling mistake "toogle" -> "toggle" Trivial fix to spelling mistake in DP_VERBOSE message Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index be941cfaa2d4..c71391b9c757 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -228,7 +228,7 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, num_cons, "Toggle"); if (rc) { DP_VERBOSE(p_hwfn, QED_MSG_RDMA, - "Failed to allocate toogle bits, rc = %d\n", rc); + "Failed to allocate toggle bits, rc = %d\n", rc); goto free_cq_map; } From 079db3fd4ef164a59006425a988b1c73639fade8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Sep 2018 18:39:51 +0100 Subject: [PATCH 262/499] wimax/i2400m: fix spelling mistake "not unitialized" -> "uninitialized" Trivial fix to spelling mistake in ms_to_errno array of error messages and remove confusing "not" from the error text since the error code refers to an uninitialized error code. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/wimax/i2400m/control.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wimax/i2400m/control.c b/drivers/net/wimax/i2400m/control.c index 094cea775d0c..ef298d8525c5 100644 --- a/drivers/net/wimax/i2400m/control.c +++ b/drivers/net/wimax/i2400m/control.c @@ -257,7 +257,7 @@ static const struct [I2400M_MS_ACCESSIBILITY_ERROR] = { "accesibility error", -EIO }, [I2400M_MS_BUSY] = { "busy", -EBUSY }, [I2400M_MS_CORRUPTED_TLV] = { "corrupted TLV", -EILSEQ }, - [I2400M_MS_UNINITIALIZED] = { "not unitialized", -EILSEQ }, + [I2400M_MS_UNINITIALIZED] = { "uninitialized", -EILSEQ }, [I2400M_MS_UNKNOWN_ERROR] = { "unknown error", -EIO }, [I2400M_MS_PRODUCTION_ERROR] = { "production error", -EIO }, [I2400M_MS_NO_RF] = { "no RF", -EIO }, From a898fba32229efd5e6b6154f83fa86a7145156b9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 13:53:34 -0700 Subject: [PATCH 263/499] qed: Avoid implicit enum conversion in qed_set_tunn_cls_info Clang warns when one enumerated type is implicitly converted to another. 
drivers/net/ethernet/qlogic/qed/qed_sp_commands.c:163:25: warning: implicit conversion from enumeration type 'enum tunnel_clss' to different enumeration type 'enum qed_tunn_clss' [-Wenum-conversion] p_tun->vxlan.tun_cls = type; ~ ^~~~ drivers/net/ethernet/qlogic/qed/qed_sp_commands.c:165:26: warning: implicit conversion from enumeration type 'enum tunnel_clss' to different enumeration type 'enum qed_tunn_clss' [-Wenum-conversion] p_tun->l2_gre.tun_cls = type; ~ ^~~~ drivers/net/ethernet/qlogic/qed/qed_sp_commands.c:167:26: warning: implicit conversion from enumeration type 'enum tunnel_clss' to different enumeration type 'enum qed_tunn_clss' [-Wenum-conversion] p_tun->ip_gre.tun_cls = type; ~ ^~~~ drivers/net/ethernet/qlogic/qed/qed_sp_commands.c:169:29: warning: implicit conversion from enumeration type 'enum tunnel_clss' to different enumeration type 'enum qed_tunn_clss' [-Wenum-conversion] p_tun->l2_geneve.tun_cls = type; ~ ^~~~ drivers/net/ethernet/qlogic/qed/qed_sp_commands.c:171:29: warning: implicit conversion from enumeration type 'enum tunnel_clss' to different enumeration type 'enum qed_tunn_clss' [-Wenum-conversion] p_tun->ip_geneve.tun_cls = type; ~ ^~~~ 5 warnings generated. Avoid this by changing type to an int. Link: https://github.com/ClangBuiltLinux/linux/issues/125 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c index 8de644b4721e..77b6248ad3b9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c @@ -154,7 +154,7 @@ qed_set_pf_update_tunn_mode(struct qed_tunnel_info *p_tun, static void qed_set_tunn_cls_info(struct qed_tunnel_info *p_tun, struct qed_tunnel_info *p_src) { - enum tunnel_clss type; + int type; p_tun->b_update_rx_cls = p_src->b_update_rx_cls; p_tun->b_update_tx_cls = p_src->b_update_tx_cls; From db803f36e56f23b5a2266807e190d1dc11554d54 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 14:05:27 -0700 Subject: [PATCH 264/499] qed: Fix mask parameter in qed_vf_prep_tunn_req_tlv Clang complains when one enumerated type is implicitly converted to another. drivers/net/ethernet/qlogic/qed/qed_vf.c:686:6: warning: implicit conversion from enumeration type 'enum qed_tunn_mode' to different enumeration type 'enum qed_tunn_clss' [-Wenum-conversion] QED_MODE_L2GENEVE_TUNN, ^~~~~~~~~~~~~~~~~~~~~~ Update mask's parameter to expect qed_tunn_mode, which is what was intended. Link: https://github.com/ClangBuiltLinux/linux/issues/125 Signed-off-by: Nathan Chancellor Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_vf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 3d4269659820..fcd8da08274f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -572,7 +572,7 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn) static void __qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req, struct qed_tunn_update_type *p_src, - enum qed_tunn_clss mask, u8 *p_cls) + enum qed_tunn_mode mask, u8 *p_cls) { if (p_src->b_update_mode) { p_req->tun_mode_update_mask |= BIT(mask); @@ -587,7 +587,7 @@ __qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req, static void qed_vf_prep_tunn_req_tlv(struct vfpf_update_tunn_param_tlv *p_req, struct qed_tunn_update_type *p_src, - enum qed_tunn_clss mask, + enum qed_tunn_mode mask, u8 *p_cls, struct qed_tunn_update_udp_port *p_port, u8 *p_update_port, u16 *p_udp_port) { From d3a315795b4ce8b105a64a90699103121bde04a8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 14:34:53 -0700 Subject: [PATCH 265/499] qed: Avoid implicit enum conversion in qed_roce_mode_to_flavor Clang warns when one enumerated type is implicitly converted to another. drivers/net/ethernet/qlogic/qed/qed_roce.c:153:12: warning: implicit conversion from enumeration type 'enum roce_mode' to different enumeration type 'enum roce_flavor' [-Wenum-conversion] flavor = ROCE_V2_IPV6; ~ ^~~~~~~~~~~~ drivers/net/ethernet/qlogic/qed/qed_roce.c:156:12: warning: implicit conversion from enumeration type 'enum roce_mode' to different enumeration type 'enum roce_flavor' [-Wenum-conversion] flavor = MAX_ROCE_MODE; ~ ^~~~~~~~~~~~~ 2 warnings generated. Use the appropriate values from the expected type, roce_flavor: ROCE_V2_IPV6 = RROCE_IPV6 = 2 MAX_ROCE_MODE = MAX_ROCE_FLAVOR = 3 While we're at it, ditch the local variable flavor; we can just return the value directly from the switch statement. Link: https://github.com/ClangBuiltLinux/linux/issues/125 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_roce.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 7d7a64c55ff1..f9167d1354bb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -140,23 +140,16 @@ static void qed_rdma_copy_gids(struct qed_rdma_qp *qp, __le32 *src_gid, static enum roce_flavor qed_roce_mode_to_flavor(enum roce_mode roce_mode) { - enum roce_flavor flavor; - switch (roce_mode) { case ROCE_V1: - flavor = PLAIN_ROCE; - break; + return PLAIN_ROCE; case ROCE_V2_IPV4: - flavor = RROCE_IPV4; - break; + return RROCE_IPV4; case ROCE_V2_IPV6: - flavor = ROCE_V2_IPV6; - break; + return RROCE_IPV6; default: - flavor = MAX_ROCE_MODE; - break; + return MAX_ROCE_FLAVOR; } - return flavor; } static void qed_roce_free_cid_pair(struct qed_hwfn *p_hwfn, u16 cid) From 6a9e461f6fe4434e6172304b69774daff9a3ac4c Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Mon, 24 Sep 2018 14:39:42 -0700 Subject: [PATCH 266/499] bonding: pass link-local packets to bonding master also. Commit b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") changed the behavior of how link-local-multicast packets are processed.
The change in the behavior broke some legacy use cases where these packets are expected to arrive on bonding master device also. This patch passes the packet to the stack with the link it arrived on as well as passes to the bonding-master device to preserve the legacy use case. Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") Reported-by: Michal Soltys Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 0d87e11e7f1d..8c0a0908875d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1170,9 +1170,26 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) } } - /* don't change skb->dev for link-local packets */ - if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + /* Link-local multicast packets should be passed to the + * stack on the link they arrive as well as pass them to the + * bond-master device. These packets are mostly usable when + * stack receives it with the link on which they arrive + * (e.g. LLDP) they also must be available on master. Some of + * the use cases include (but are not limited to): LLDP agents + * that must be able to operate both on enslaved interfaces as + * well as on bonds themselves; linux bridges that must be able + * to process/pass BPDUs from attached bonds when any kind of + * STP version is enabled on the network. + */ + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + nskb->dev = bond->dev; + netif_rx(nskb); + } return RX_HANDLER_PASS; + } if (bond_should_deliver_exact_match(skb, slave, bond)) return RX_HANDLER_EXACT; From d4859d749aa7090ffb743d15648adb962a1baeae Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Mon, 24 Sep 2018 14:40:11 -0700 Subject: [PATCH 267/499] bonding: avoid possible dead-lock Syzkaller reported this on a slightly older kernel but it's still applicable to the current kernel - ====================================================== WARNING: possible circular locking dependency detected 4.18.0-next-20180823+ #46 Not tainted ------------------------------------------------------ syz-executor4/26841 is trying to acquire lock: 00000000dd41ef48 ((wq_completion)bond_dev->name){+.+.}, at: flush_workqueue+0x2db/0x1e10 kernel/workqueue.c:2652 but task is already holding lock: 00000000768ab431 (rtnl_mutex){+.+.}, at: rtnl_lock net/core/rtnetlink.c:77 [inline] 00000000768ab431 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x412/0xc30 net/core/rtnetlink.c:4708 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (rtnl_mutex){+.+.}: __mutex_lock_common kernel/locking/mutex.c:925 [inline] __mutex_lock+0x171/0x1700 kernel/locking/mutex.c:1073 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1088 rtnl_lock+0x17/0x20 net/core/rtnetlink.c:77 bond_netdev_notify drivers/net/bonding/bond_main.c:1310 [inline] bond_netdev_notify_work+0x44/0xd0 drivers/net/bonding/bond_main.c:1320 process_one_work+0xc73/0x1aa0 kernel/workqueue.c:2153 worker_thread+0x189/0x13c0 kernel/workqueue.c:2296 kthread+0x35a/0x420 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 -> #1 ((work_completion)(&(&nnw->work)->work)){+.+.}: process_one_work+0xc0b/0x1aa0 kernel/workqueue.c:2129 worker_thread+0x189/0x13c0 kernel/workqueue.c:2296 kthread+0x35a/0x420 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 -> #0 ((wq_completion)bond_dev->name){+.+.}: lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901 flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655 drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820 destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155 __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138 bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734 register_netdevice+0x337/0x1100 net/core/dev.c:8410 bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453 rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4729 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343 netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:622 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:632 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2115 __sys_sendmsg+0x11d/0x290 net/socket.c:2153 __do_sys_sendmsg net/socket.c:2162 [inline] __se_sys_sendmsg net/socket.c:2160 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2160 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Chain exists of: (wq_completion)bond_dev->name --> (work_completion)(&(&nnw->work)->work) --> rtnl_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(rtnl_mutex); lock((work_completion)(&(&nnw->work)->work)); lock(rtnl_mutex); lock((wq_completion)bond_dev->name); *** DEADLOCK *** 1 lock held by syz-executor4/26841: stack backtrace: CPU: 1 PID: 26841 Comm: syz-executor4 Not tainted 4.18.0-next-20180823+ #46 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_circular_bug.isra.34.cold.55+0x1bd/0x27d kernel/locking/lockdep.c:1222 check_prev_add kernel/locking/lockdep.c:1862 [inline] check_prevs_add kernel/locking/lockdep.c:1975 [inline] validate_chain kernel/locking/lockdep.c:2416 [inline] __lock_acquire+0x3449/0x5020 kernel/locking/lockdep.c:3412 lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901 flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655 drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820 destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155 __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138 bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734 register_netdevice+0x337/0x1100 net/core/dev.c:8410 bond_newlink+0x49/0xa0 
drivers/net/bonding/bond_netlink.c:453 rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4729 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343 netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:622 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:632 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2115 __sys_sendmsg+0x11d/0x290 net/socket.c:2153 __do_sys_sendmsg net/socket.c:2162 [inline] __se_sys_sendmsg net/socket.c:2160 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2160 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457089 Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f2df20a5c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f2df20a66d4 RCX: 0000000000457089 RDX: 0000000000000000 RSI: 0000000020000180 RDI: 0000000000000003 RBP: 0000000000930140 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000004d40b8 R14: 00000000004c8ad8 R15: 0000000000000001 Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 43 +++++++++++++-------------------- include/net/bonding.h | 7 +----- 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8c0a0908875d..c05c01a00755 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -210,6 +210,7 @@ static void bond_get_stats(struct net_device *bond_dev, static void bond_slave_arr_handler(struct work_struct *work); static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod); +static void bond_netdev_notify_work(struct work_struct *work); /*---------------------------- General routines -----------------------------*/ @@ -1286,6 +1287,8 @@ static struct slave *bond_alloc_slave(struct bonding *bond) return NULL; } } + INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work); + return slave; } @@ -1293,6 +1296,7 @@ static void bond_free_slave(struct slave *slave) { struct bonding *bond = bond_get_bond_by_slave(slave); + cancel_delayed_work_sync(&slave->notify_work); if (BOND_MODE(bond) == BOND_MODE_8023AD) kfree(SLAVE_AD_INFO(slave)); @@ -1314,39 +1318,26 @@ static void bond_fill_ifslave(struct slave *slave, struct ifslave *info) info->link_failure_count = slave->link_failure_count; } -static void bond_netdev_notify(struct net_device *dev, - struct netdev_bonding_info *info) -{ - rtnl_lock(); - netdev_bonding_info_change(dev, info); - rtnl_unlock(); -} - static void bond_netdev_notify_work(struct work_struct *_work) { - struct netdev_notify_work *w = - container_of(_work, struct netdev_notify_work, work.work); + struct slave *slave = container_of(_work, struct slave, + notify_work.work); - bond_netdev_notify(w->dev, &w->bonding_info); - dev_put(w->dev); - kfree(w); + if (rtnl_trylock()) { + struct netdev_bonding_info binfo; + + bond_fill_ifslave(slave, &binfo.slave); + bond_fill_ifbond(slave->bond, &binfo.master); + netdev_bonding_info_change(slave->dev, &binfo); + rtnl_unlock(); + } else { + 
queue_delayed_work(slave->bond->wq, &slave->notify_work, 1); + } } void bond_queue_slave_event(struct slave *slave) { - struct bonding *bond = slave->bond; - struct netdev_notify_work *nnw = kzalloc(sizeof(*nnw), GFP_ATOMIC); - - if (!nnw) - return; - - dev_hold(slave->dev); - nnw->dev = slave->dev; - bond_fill_ifslave(slave, &nnw->bonding_info.slave); - bond_fill_ifbond(bond, &nnw->bonding_info.master); - INIT_DELAYED_WORK(&nnw->work, bond_netdev_notify_work); - - queue_delayed_work(slave->bond->wq, &nnw->work, 0); + queue_delayed_work(slave->bond->wq, &slave->notify_work, 0); } void bond_lower_state_changed(struct slave *slave) diff --git a/include/net/bonding.h b/include/net/bonding.h index a2d058170ea3..b46d68acf701 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -139,12 +139,6 @@ struct bond_parm_tbl { int mode; }; -struct netdev_notify_work { - struct delayed_work work; - struct net_device *dev; - struct netdev_bonding_info bonding_info; -}; - struct slave { struct net_device *dev; /* first - useful for panic debug */ struct bonding *bond; /* our master */ @@ -172,6 +166,7 @@ struct slave { #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif + struct delayed_work notify_work; struct kobject kobj; struct rtnl_link_stats64 slave_stats; }; From 1c492a9d55ba99079210ed901dd8a5423f980487 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 15:17:03 -0700 Subject: [PATCH 268/499] qed: Avoid constant logical operation warning in qed_vf_pf_acquire Clang warns when a constant is used in a boolean context as it thinks a bitwise operation may have been intended. drivers/net/ethernet/qlogic/qed/qed_vf.c:415:27: warning: use of logical '&&' with constant operand [-Wconstant-logical-operand] if (!p_iov->b_pre_fp_hsi && ^ drivers/net/ethernet/qlogic/qed/qed_vf.c:415:27: note: use '&' for a bitwise operation if (!p_iov->b_pre_fp_hsi && ^~ & drivers/net/ethernet/qlogic/qed/qed_vf.c:415:27: note: remove constant to silence this warning if (!p_iov->b_pre_fp_hsi && ~^~ 1 warning generated. This has been here since commit 1fe614d10f45 ("qed: Relax VF firmware requirements") and I am not entirely sure why since 0 isn't a special case. Just remove the statement causing Clang to warn since it isn't required. Link: https://github.com/ClangBuiltLinux/linux/issues/126 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_vf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index fcd8da08274f..be118d057b92 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -413,7 +413,6 @@ static int qed_vf_pf_acquire(struct qed_hwfn *p_hwfn) } if (!p_iov->b_pre_fp_hsi && - ETH_HSI_VER_MINOR && (resp->pfdev_info.minor_fp_hsi < ETH_HSI_VER_MINOR)) { DP_INFO(p_hwfn, "PF is using older fastpath HSI; %02x.%02x is configured\n", From 77f2d753819b7d50c16abfb778caf1fe075faed0 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 14:42:12 -0700 Subject: [PATCH 269/499] qed: Avoid implicit enum conversion in qed_iwarp_parse_rx_pkt Clang warns when one enumerated type is implicitly converted to another. 
drivers/net/ethernet/qlogic/qed/qed_iwarp.c:1713:25: warning: implicit conversion from enumeration type 'enum tcp_ip_version' to different enumeration type 'enum qed_tcp_ip_version' [-Wenum-conversion] cm_info->ip_version = TCP_IPV4; ~ ^~~~~~~~ drivers/net/ethernet/qlogic/qed/qed_iwarp.c:1733:25: warning: implicit conversion from enumeration type 'enum tcp_ip_version' to different enumeration type 'enum qed_tcp_ip_version' [-Wenum-conversion] cm_info->ip_version = TCP_IPV6; ~ ^~~~~~~~ 2 warnings generated. Use the appropriate values from the expected type, qed_tcp_ip_version: TCP_IPV4 = QED_TCP_IPV4 = 0 TCP_IPV6 = QED_TCP_IPV6 = 1 Link: https://github.com/ClangBuiltLinux/linux/issues/125 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 17f3dfa2cc94..e860bdf0f752 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1710,7 +1710,7 @@ qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn, cm_info->local_ip[0] = ntohl(iph->daddr); cm_info->remote_ip[0] = ntohl(iph->saddr); - cm_info->ip_version = TCP_IPV4; + cm_info->ip_version = QED_TCP_IPV4; ip_hlen = (iph->ihl) * sizeof(u32); *payload_len = ntohs(iph->tot_len) - ip_hlen; @@ -1730,7 +1730,7 @@ qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn, cm_info->remote_ip[i] = ntohl(ip6h->saddr.in6_u.u6_addr32[i]); } - cm_info->ip_version = TCP_IPV6; + cm_info->ip_version = QED_TCP_IPV6; ip_hlen = sizeof(*ip6h); *payload_len = ntohs(ip6h->payload_len); From 3e322474485931e7ea6e4c5560089991a7f03cbc Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 25 Sep 2018 01:50:00 +0200 Subject: [PATCH 270/499] net: phy: sfp: Fix unregistering of HWMON SFP device A HWMON device is only registered if the SFP module supports the diagnostic page and is compliant with SFF8472. Don't unconditionally unregister the hwmon device when the SFP module is removed, otherwise we access data structures which don't exist. Reported-by: Florian Fainelli Fixes: 1323061a018a ("net: phy: sfp: Add HWMON support for module sensors") Signed-off-by: Andrew Lunn Tested-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 52fffb98fde9..6e13b8832bc7 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -1098,8 +1098,11 @@ static int sfp_hwmon_insert(struct sfp *sfp) static void sfp_hwmon_remove(struct sfp *sfp) { - hwmon_device_unregister(sfp->hwmon_dev); - kfree(sfp->hwmon_name); + if (!IS_ERR_OR_NULL(sfp->hwmon_dev)) { + hwmon_device_unregister(sfp->hwmon_dev); + sfp->hwmon_dev = NULL; + kfree(sfp->hwmon_name); + } } #else static int sfp_hwmon_insert(struct sfp *sfp) From 8fd780698745ba121530c5c20fd237aacde4c371 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 26 Sep 2018 10:35:42 +0800 Subject: [PATCH 271/499] vxlan: fill ttl inherit info When adding vxlan ttl inherit support, I forgot to fill it when dumping vxlan info. Fix it now. Fixes: 72f6d71e491e6 ("vxlan: add ttl inherit support") Signed-off-by: Hangbin Liu Signed-off-by: David S.
Miller --- drivers/net/vxlan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ababba37d735..2b8da2b7e721 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3539,6 +3539,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ @@ -3603,6 +3604,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || + nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, + !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, From 73f21c653f930f438d53eed29b5e4c65c8a0f906 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 26 Sep 2018 00:41:04 -0400 Subject: [PATCH 272/499] bnxt_en: Fix TX timeout during netpoll. The current netpoll implementation in the bnxt_en driver has problems that may miss TX completion events. bnxt_poll_work() in effect is only handling at most 1 TX packet before exiting. In addition, there may be in flight TX completions that ->poll() may miss even after we fix bnxt_poll_work() to handle all visible TX completions. netpoll may not call ->poll() again and HW may not generate IRQ because the driver does not ARM the IRQ when the budget (0 for netpoll) is reached. We fix it by handling all TX completions and to always ARM the IRQ when we exit ->poll() with 0 budget. Also, the logic to ACK the completion ring in case it is almost filled with TX completions need to be adjusted to take care of the 0 budget case, as discussed with Eric Dumazet Reported-by: Song Liu Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 61957b0bbd8c..0478e562abac 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1884,8 +1884,11 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) if (TX_CMP_TYPE(txcmp) == CMP_TYPE_TX_L2_CMP) { tx_pkts++; /* return full budget so NAPI will complete. 
*/ - if (unlikely(tx_pkts > bp->tx_wake_thresh)) + if (unlikely(tx_pkts > bp->tx_wake_thresh)) { rx_pkts = budget; + raw_cons = NEXT_RAW_CMP(raw_cons); + break; + } } else if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) { if (likely(budget)) rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event); @@ -1913,7 +1916,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) } raw_cons = NEXT_RAW_CMP(raw_cons); - if (rx_pkts == budget) + if (rx_pkts && rx_pkts == budget) break; } @@ -2027,8 +2030,12 @@ static int bnxt_poll(struct napi_struct *napi, int budget) while (1) { work_done += bnxt_poll_work(bp, bnapi, budget - work_done); - if (work_done >= budget) + if (work_done >= budget) { + if (!budget) + BNXT_CP_DB_REARM(cpr->cp_doorbell, + cpr->cp_raw_cons); break; + } if (!bnxt_has_work(bp, cpr)) { if (napi_complete_done(napi, work_done)) From d4ce58082f206bf6e7d697380c7bc5480a8b0264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 25 Sep 2018 21:59:28 -0700 Subject: [PATCH 273/499] net-tcp: /proc/sys/net/ipv4/tcp_probe_interval is a u32 not int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (fix documentation and sysctl access to treat it as such) Tested: # zcat /proc/config.gz | egrep ^CONFIG_HZ CONFIG_HZ_1000=y CONFIG_HZ=1000 # echo $[(1<<32)/1000 + 1] | tee /proc/sys/net/ipv4/tcp_probe_interval 4294968 tee: /proc/sys/net/ipv4/tcp_probe_interval: Invalid argument # echo $[(1<<32)/1000] | tee /proc/sys/net/ipv4/tcp_probe_interval 4294967 # echo 0 | tee /proc/sys/net/ipv4/tcp_probe_interval # echo -1 | tee /proc/sys/net/ipv4/tcp_probe_interval -1 tee: /proc/sys/net/ipv4/tcp_probe_interval: Invalid argument Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 2 +- net/ipv4/sysctl_net_ipv4.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8313a636dd53..960de8fe3f40 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -425,7 +425,7 @@ tcp_mtu_probing - INTEGER 1 - Disabled by default, enabled when an ICMP black hole detected 2 - Always enabled, use initial MSS of tcp_base_mss. -tcp_probe_interval - INTEGER +tcp_probe_interval - UNSIGNED INTEGER Controls how often to start TCP Packetization-Layer Path MTU Discovery reprobe. The default is reprobing every 10 minutes as per RFC4821. 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..891ed2f91467 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; +static u32 u32_max_div_HZ = UINT_MAX / HZ; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -745,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_probe_interval", .data = &init_net.ipv4.sysctl_tcp_probe_interval, - .maxlen = sizeof(int), + .maxlen = sizeof(u32), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra2 = &u32_max_div_HZ, }, { .procname = "igmp_link_local_mcast_reports", From 1222a16014888ed9733c11e221730d4a8196222b Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Tue, 25 Sep 2018 11:15:01 +0900 Subject: [PATCH 274/499] nl80211: Fix possible Spectre-v1 for CQM RSSI thresholds Use array_index_nospec() to sanitize i with respect to speculation. Note that the user doesn't control i directly, but can make it out of bounds by not finding a threshold in the array. Signed-off-by: Masashi Honma [add note about user control, as explained by Masashi] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bd26230de63e..176edfefcbaa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10231,7 +10231,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = dev->ieee80211_ptr; s32 last, low, high; u32 hyst; - int i, n; + int i, n, low_index; int err; /* RSSI reporting disabled? */ @@ -10268,10 +10268,19 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, if (last < wdev->cqm_config->rssi_thresholds[i]) break; - low = i > 0 ? - (wdev->cqm_config->rssi_thresholds[i - 1] - hyst) : S32_MIN; - high = i < n ? - (wdev->cqm_config->rssi_thresholds[i] + hyst - 1) : S32_MAX; + low_index = i - 1; + if (low_index >= 0) { + low_index = array_index_nospec(low_index, n); + low = wdev->cqm_config->rssi_thresholds[low_index] - hyst; + } else { + low = S32_MIN; + } + if (i < n) { + i = array_index_nospec(i, n); + high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1; + } else { + high = S32_MAX; + } return rdev_set_cqm_rssi_range_config(rdev, dev, low, high); } From 52bf4a900d9cede3eb14982d0f2c5e6db6d97cc3 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 25 Apr 2018 12:14:39 +0200 Subject: [PATCH 275/499] clocksource/drivers/timer-atmel-pit: Properly handle error cases The smatch utility reports a possible leak: smatch warnings: drivers/clocksource/timer-atmel-pit.c:183 at91sam926x_pit_dt_init() warn: possible memory leak of 'data' Ensure data is freed before exiting with an error. 
Reported-by: Dan Carpenter Signed-off-by: Alexandre Belloni Cc: stable@vger.kernel.org Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-atmel-pit.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c index ec8a4376f74f..2fab18fae4fc 100644 --- a/drivers/clocksource/timer-atmel-pit.c +++ b/drivers/clocksource/timer-atmel-pit.c @@ -180,26 +180,29 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node) data->base = of_iomap(node, 0); if (!data->base) { pr_err("Could not map PIT address\n"); - return -ENXIO; + ret = -ENXIO; + goto exit; } data->mck = of_clk_get(node, 0); if (IS_ERR(data->mck)) { pr_err("Unable to get mck clk\n"); - return PTR_ERR(data->mck); + ret = PTR_ERR(data->mck); + goto exit; } ret = clk_prepare_enable(data->mck); if (ret) { pr_err("Unable to enable mck\n"); - return ret; + goto exit; } /* Get the interrupts property */ data->irq = irq_of_parse_and_map(node, 0); if (!data->irq) { pr_err("Unable to get IRQ from DT\n"); - return -EINVAL; + ret = -EINVAL; + goto exit; } /* @@ -227,7 +230,7 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node) ret = clocksource_register_hz(&data->clksrc, pit_rate); if (ret) { pr_err("Failed to register clocksource\n"); - return ret; + goto exit; } /* Set up irq handler */ @@ -236,7 +239,8 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node) "at91_tick", data); if (ret) { pr_err("Unable to setup IRQ\n"); - return ret; + clocksource_unregister(&data->clksrc); + goto exit; } /* Set up and register clockevents */ @@ -254,6 +258,10 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node) clockevents_register_device(&data->clkevt); return 0; + +exit: + kfree(data); + return ret; } TIMER_OF_DECLARE(at91sam926x_pit, "atmel,at91sam9260-pit", at91sam926x_pit_dt_init); From 659902db7008f233e3d514c774b5ff3d46c8edd8 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 14 Sep 2018 13:04:37 +0800 Subject: [PATCH 276/499] ieee802154: ca8210: remove redundant condition check before debugfs_remove debugfs_remove has taken the IS_ERR into account. Just remove the unnecessary condition. Signed-off-by: zhong jiang Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/ca8210.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index e21279dde85c..0ff5a403a8dc 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -3043,8 +3043,7 @@ static void ca8210_test_interface_clear(struct ca8210_priv *priv) { struct ca8210_test *test = &priv->test; - if (!IS_ERR(test->ca8210_dfs_spi_int)) - debugfs_remove(test->ca8210_dfs_spi_int); + debugfs_remove(test->ca8210_dfs_spi_int); kfifo_free(&test->up_fifo); dev_info(&priv->spi->dev, "Test interface removed\n"); } From 092ffc51fb3f9b8369e737c9320bf0bffb2c898f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:07 +0100 Subject: [PATCH 277/499] rxrpc: Remove dup code from rxrpc_find_connection_rcu() rxrpc_find_connection_rcu() initialises variable k twice with the same information. Remove one of the initialisations. 
Signed-off-by: David Howells --- net/rxrpc/conn_object.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 77440a356b14..1746b48cb165 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -85,9 +85,6 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0) goto not_found; - k.epoch = sp->hdr.epoch; - k.cid = sp->hdr.cid & RXRPC_CIDMASK; - /* We may have to handle mixing IPv4 and IPv6 */ if (srx.transport.family != local->srx.transport.family) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", From f151ba989d149bbdfc90e5405724bbea094f9b17 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Sat, 22 Sep 2018 15:55:49 -0400 Subject: [PATCH 278/499] xen/blkfront: When purging persistent grants, keep them in the buffer Commit a46b53672b2c ("xen/blkfront: cleanup stale persistent grants") added support for purging persistent grants when they are not in use. As part of the purge, the grants were removed from the grant buffer, This eventually causes the buffer to become empty, with BUG_ON triggered in get_free_grant(). This can be observed even on an idle system, within 20-30 minutes. We should keep the grants in the buffer when purging, and only free the grant ref. Fixes: a46b53672b2c ("xen/blkfront: cleanup stale persistent grants") Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a71d817e900d..3b441fe69c0d 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2667,11 +2667,9 @@ static void purge_persistent_grants(struct blkfront_info *info) gnttab_query_foreign_access(gnt_list_entry->gref)) continue; - list_del(&gnt_list_entry->node); gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); + gnt_list_entry->gref = GRANT_INVALID_REF; rinfo->persistent_gnts_c--; - __free_page(gnt_list_entry->page); - kfree(gnt_list_entry); } spin_unlock_irqrestore(&rinfo->ring_lock, flags); From d6a77ba0eb92d8ffa4b05a442fc20d0a9b11c4c4 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 27 Sep 2018 14:41:30 +0200 Subject: [PATCH 279/499] Revert "drm/panel: Add device_link from panel device to DRM device" This reverts commit 0c08754b59da5557532d946599854e6df28edc22. commit 0c08754b59da ("drm/panel: Add device_link from panel device to DRM device") creates a circular dependency under these circumstances: 1. The panel depends on dsi-host because it is MIPI-DSI child device. 2. dsi-host depends on the drm parent device (connector->dev->dev) this should be allowed. 3. drm parent dev (connector->dev->dev) depends on the panel after this patch. This makes the dependency circular and while it appears it does not affect any in-tree drivers (they do not seem to have dsi hosts depending on the same parent device) this does not seem right. As noted in a response from Andrzej Hajda, the intent is likely to make the panel dependent on the DRM device (connector->dev) not its parent. But we have no way of doing that since the DRM device doesn't contain any struct device on its own (arguably it should). Revert this until a proper approach is figured out. 
Cc: Jyri Sarha Cc: Eric Anholt Cc: Andrzej Hajda Signed-off-by: Linus Walleij Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20180927124130.9102-1-linus.walleij@linaro.org --- drivers/gpu/drm/drm_panel.c | 10 ---------- include/drm/drm_panel.h | 1 - 2 files changed, 11 deletions(-) diff --git a/drivers/gpu/drm/drm_panel.c b/drivers/gpu/drm/drm_panel.c index b902361dee6e..1d9a9d2fe0e0 100644 --- a/drivers/gpu/drm/drm_panel.c +++ b/drivers/gpu/drm/drm_panel.c @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -105,13 +104,6 @@ int drm_panel_attach(struct drm_panel *panel, struct drm_connector *connector) if (panel->connector) return -EBUSY; - panel->link = device_link_add(connector->dev->dev, panel->dev, 0); - if (!panel->link) { - dev_err(panel->dev, "failed to link panel to %s\n", - dev_name(connector->dev->dev)); - return -EINVAL; - } - panel->connector = connector; panel->drm = connector->dev; @@ -133,8 +125,6 @@ EXPORT_SYMBOL(drm_panel_attach); */ int drm_panel_detach(struct drm_panel *panel) { - device_link_del(panel->link); - panel->connector = NULL; panel->drm = NULL; diff --git a/include/drm/drm_panel.h b/include/drm/drm_panel.h index 582a0ec0aa70..777814755fa6 100644 --- a/include/drm/drm_panel.h +++ b/include/drm/drm_panel.h @@ -89,7 +89,6 @@ struct drm_panel { struct drm_device *drm; struct drm_connector *connector; struct device *dev; - struct device_link *link; const struct drm_panel_funcs *funcs; From 61ea6f5831974ebd1a57baffd7cc30600a2e26fc Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Thu, 27 Sep 2018 20:48:39 +0800 Subject: [PATCH 280/499] drm/amdgpu: Fix vce work queue was not cancelled when suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vce cancel_delayed_work_sync was never called, because the driver only calls it in an error path. This caused the A+A suspend hang when runtime pm is enabled: the idle work will visit the smu after the dgpu has been suspended, which hangs the smu, and the dgpu will also be woken up. As the smu has hung, the dgpu resume will then fail.
Reviewed-by: Christian König Reviewed-by: Feifei Xu Signed-off-by: Rex Zhu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 0cc5190f4f36..5f3f54073818 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -258,6 +258,8 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev) { int i; + cancel_delayed_work_sync(&adev->vce.idle_work); + if (adev->vce.vcpu_bo == NULL) return 0; @@ -268,7 +270,6 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev) if (i == AMDGPU_MAX_VCE_HANDLES) return 0; - cancel_delayed_work_sync(&adev->vce.idle_work); /* TODO: suspending running encoding sessions isn't supported */ return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index fd654a4406db..400fc74bbae2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -153,11 +153,11 @@ int amdgpu_vcn_suspend(struct amdgpu_device *adev) unsigned size; void *ptr; + cancel_delayed_work_sync(&adev->vcn.idle_work); + if (adev->vcn.vcpu_bo == NULL) return 0; - cancel_delayed_work_sync(&adev->vcn.idle_work); - size = amdgpu_bo_size(adev->vcn.vcpu_bo); ptr = adev->vcn.cpu_addr; From 599760d6d0abbab71f9726b49858d2ec45e74c0a Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 26 Sep 2018 13:42:16 -0400 Subject: [PATCH 281/499] drm/amd/display: Fix Vega10 lightup on S3 resume [Why] There have been a few reports of Vega10 display remaining blank after S3 resume. The regression is caused by workaround for mode change on Vega10 - skip set_bandwidth if stream count is 0. As a result we skipped dispclk reset on suspend, thus on resume we may skip the clock update assuming it hasn't been changed. On some systems it causes display blank or 'out of range'. [How] Revert "drm/amd/display: Fix Vega10 black screen after mode change" Verified that it hadn't cause mode change regression. 
Signed-off-by: Roman Li Reviewed-by: Sun peng Li Acked-by: Leo Li Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dce110/dce110_hw_sequencer.c | 2 +- .../drm/amd/display/dc/dce110/dce110_hw_sequencer.h | 5 ----- .../drm/amd/display/dc/dce120/dce120_hw_sequencer.c | 12 ------------ 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c index 14384d9675a8..b2f308766a9e 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c @@ -2560,7 +2560,7 @@ static void pplib_apply_display_requirements( dc->prev_display_config = *pp_display_cfg; } -void dce110_set_bandwidth( +static void dce110_set_bandwidth( struct dc *dc, struct dc_state *context, bool decrease_allowed) diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.h b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.h index e4c5db75c4c6..d6db3dbd9015 100644 --- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.h +++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.h @@ -68,11 +68,6 @@ void dce110_fill_display_configs( const struct dc_state *context, struct dm_pp_display_configuration *pp_display_cfg); -void dce110_set_bandwidth( - struct dc *dc, - struct dc_state *context, - bool decrease_allowed); - uint32_t dce110_get_min_vblank_time_us(const struct dc_state *context); void dp_receiver_power_ctrl(struct dc_link *link, bool on); diff --git a/drivers/gpu/drm/amd/display/dc/dce120/dce120_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce120/dce120_hw_sequencer.c index 5853522a6182..eb0f5f9a973b 100644 --- a/drivers/gpu/drm/amd/display/dc/dce120/dce120_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce120/dce120_hw_sequencer.c @@ -244,17 +244,6 @@ static void dce120_update_dchub( dh_data->dchub_info_valid = false; } -static void dce120_set_bandwidth( - struct dc *dc, - struct dc_state *context, - bool decrease_allowed) -{ - if (context->stream_count <= 0) - return; - - dce110_set_bandwidth(dc, context, decrease_allowed); -} - void dce120_hw_sequencer_construct(struct dc *dc) { /* All registers used by dce11.2 match those in dce11 in offset and @@ -263,6 +252,5 @@ void dce120_hw_sequencer_construct(struct dc *dc) dce110_hw_sequencer_construct(dc); dc->hwss.enable_display_power_gating = dce120_enable_display_power_gating; dc->hwss.update_dchub = dce120_update_dchub; - dc->hwss.set_bandwidth = dce120_set_bandwidth; } From fbbdadf2faf17cd88e9c447701495540377c5743 Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Wed, 26 Sep 2018 13:42:10 -0400 Subject: [PATCH 282/499] drm/amd/display: Fix Edid emulation for linux [Why] EDID emulation didn't work properly for linux, as we stop programming if nothing is connected physically. [How] We get a flag from DRM when we want to do edid emulation. We check if this flag is true and nothing is connected physically, if so we only program the front end using VIRTUAL_SIGNAL. 
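For illustration only (this is a condensed sketch of the detection flow added by the diff below, not part of the patch; it reuses the helper names introduced there and assumes a forced connector):

	enum dc_connection_type new_connection_type = dc_connection_none;

	/* If the connector is forced on but nothing is physically attached,
	 * fall back to an emulated sink so the front end is still programmed
	 * (SIGNAL_TYPE_VIRTUAL is used for the DP case).
	 */
	if (!dc_link_detect_sink(aconnector->dc_link, &new_connection_type))
		DRM_ERROR("KMS: Failed to detect connector\n");

	if (aconnector->base.force && new_connection_type == dc_connection_none)
		emulated_link_detect(aconnector->dc_link);
	else
		dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD);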
Signed-off-by: Bhawanpreet Lakha Reviewed-by: Harry Wentland Acked-by: Leo Li Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 139 +++++++++++++++++- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 4 +- drivers/gpu/drm/amd/display/dc/dc_link.h | 1 + 3 files changed, 137 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 800f481a6995..96875950845a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -641,6 +641,87 @@ amdgpu_dm_find_first_crtc_matching_connector(struct drm_atomic_state *state, return NULL; } +static void emulated_link_detect(struct dc_link *link) +{ + struct dc_sink_init_data sink_init_data = { 0 }; + struct display_sink_capability sink_caps = { 0 }; + enum dc_edid_status edid_status; + struct dc_context *dc_ctx = link->ctx; + struct dc_sink *sink = NULL; + struct dc_sink *prev_sink = NULL; + + link->type = dc_connection_none; + prev_sink = link->local_sink; + + if (prev_sink != NULL) + dc_sink_retain(prev_sink); + + switch (link->connector_signal) { + case SIGNAL_TYPE_HDMI_TYPE_A: { + sink_caps.transaction_type = DDC_TRANSACTION_TYPE_I2C; + sink_caps.signal = SIGNAL_TYPE_HDMI_TYPE_A; + break; + } + + case SIGNAL_TYPE_DVI_SINGLE_LINK: { + sink_caps.transaction_type = DDC_TRANSACTION_TYPE_I2C; + sink_caps.signal = SIGNAL_TYPE_DVI_SINGLE_LINK; + break; + } + + case SIGNAL_TYPE_DVI_DUAL_LINK: { + sink_caps.transaction_type = DDC_TRANSACTION_TYPE_I2C; + sink_caps.signal = SIGNAL_TYPE_DVI_DUAL_LINK; + break; + } + + case SIGNAL_TYPE_LVDS: { + sink_caps.transaction_type = DDC_TRANSACTION_TYPE_I2C; + sink_caps.signal = SIGNAL_TYPE_LVDS; + break; + } + + case SIGNAL_TYPE_EDP: { + sink_caps.transaction_type = + DDC_TRANSACTION_TYPE_I2C_OVER_AUX; + sink_caps.signal = SIGNAL_TYPE_EDP; + break; + } + + case SIGNAL_TYPE_DISPLAY_PORT: { + sink_caps.transaction_type = + DDC_TRANSACTION_TYPE_I2C_OVER_AUX; + sink_caps.signal = SIGNAL_TYPE_VIRTUAL; + break; + } + + default: + DC_ERROR("Invalid connector type! 
signal:%d\n", + link->connector_signal); + return; + } + + sink_init_data.link = link; + sink_init_data.sink_signal = sink_caps.signal; + + sink = dc_sink_create(&sink_init_data); + if (!sink) { + DC_ERROR("Failed to create sink!\n"); + return; + } + + link->local_sink = sink; + + edid_status = dm_helpers_read_local_edid( + link->ctx, + link, + sink); + + if (edid_status != EDID_OK) + DC_ERROR("Failed to read EDID"); + +} + static int dm_resume(void *handle) { struct amdgpu_device *adev = handle; @@ -654,6 +735,7 @@ static int dm_resume(void *handle) struct drm_plane *plane; struct drm_plane_state *new_plane_state; struct dm_plane_state *dm_new_plane_state; + enum dc_connection_type new_connection_type = dc_connection_none; int ret; int i; @@ -684,7 +766,13 @@ static int dm_resume(void *handle) continue; mutex_lock(&aconnector->hpd_lock); - dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD); + if (!dc_link_detect_sink(aconnector->dc_link, &new_connection_type)) + DRM_ERROR("KMS: Failed to detect connector\n"); + + if (aconnector->base.force && new_connection_type == dc_connection_none) + emulated_link_detect(aconnector->dc_link); + else + dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD); if (aconnector->fake_enable && aconnector->dc_link->local_sink) aconnector->fake_enable = false; @@ -922,6 +1010,7 @@ static void handle_hpd_irq(void *param) struct amdgpu_dm_connector *aconnector = (struct amdgpu_dm_connector *)param; struct drm_connector *connector = &aconnector->base; struct drm_device *dev = connector->dev; + enum dc_connection_type new_connection_type = dc_connection_none; /* In case of failure or MST no need to update connector status or notify the OS * since (for MST case) MST does this in it's own context. @@ -931,7 +1020,21 @@ static void handle_hpd_irq(void *param) if (aconnector->fake_enable) aconnector->fake_enable = false; - if (dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD)) { + if (!dc_link_detect_sink(aconnector->dc_link, &new_connection_type)) + DRM_ERROR("KMS: Failed to detect connector\n"); + + if (aconnector->base.force && new_connection_type == dc_connection_none) { + emulated_link_detect(aconnector->dc_link); + + + drm_modeset_lock_all(dev); + dm_restore_drm_connector_state(dev, connector); + drm_modeset_unlock_all(dev); + + if (aconnector->base.force == DRM_FORCE_UNSPECIFIED) + drm_kms_helper_hotplug_event(dev); + + } else if (dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD)) { amdgpu_dm_update_connector_after_detect(aconnector); @@ -1031,6 +1134,7 @@ static void handle_hpd_rx_irq(void *param) struct drm_device *dev = connector->dev; struct dc_link *dc_link = aconnector->dc_link; bool is_mst_root_connector = aconnector->mst_mgr.mst_state; + enum dc_connection_type new_connection_type = dc_connection_none; /* TODO:Temporary add mutex to protect hpd interrupt not have a gpio * conflict, after implement i2c helper, this mutex should be @@ -1042,7 +1146,24 @@ static void handle_hpd_rx_irq(void *param) if (dc_link_handle_hpd_rx_irq(dc_link, NULL, NULL) && !is_mst_root_connector) { /* Downstream Port status changed. 
*/ - if (dc_link_detect(dc_link, DETECT_REASON_HPDRX)) { + if (!dc_link_detect_sink(dc_link, &new_connection_type)) + DRM_ERROR("KMS: Failed to detect connector\n"); + + if (aconnector->base.force && new_connection_type == dc_connection_none) { + emulated_link_detect(dc_link); + + if (aconnector->fake_enable) + aconnector->fake_enable = false; + + amdgpu_dm_update_connector_after_detect(aconnector); + + + drm_modeset_lock_all(dev); + dm_restore_drm_connector_state(dev, connector); + drm_modeset_unlock_all(dev); + + drm_kms_helper_hotplug_event(dev); + } else if (dc_link_detect(dc_link, DETECT_REASON_HPDRX)) { if (aconnector->fake_enable) aconnector->fake_enable = false; @@ -1433,6 +1554,7 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) struct amdgpu_mode_info *mode_info = &adev->mode_info; uint32_t link_cnt; int32_t total_overlay_planes, total_primary_planes; + enum dc_connection_type new_connection_type = dc_connection_none; link_cnt = dm->dc->caps.max_links; if (amdgpu_dm_mode_config_init(dm->adev)) { @@ -1499,7 +1621,14 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) link = dc_get_link_at_index(dm->dc, i); - if (dc_link_detect(link, DETECT_REASON_BOOT)) { + if (!dc_link_detect_sink(link, &new_connection_type)) + DRM_ERROR("KMS: Failed to detect connector\n"); + + if (aconnector->base.force && new_connection_type == dc_connection_none) { + emulated_link_detect(link); + amdgpu_dm_update_connector_after_detect(aconnector); + + } else if (dc_link_detect(link, DETECT_REASON_BOOT)) { amdgpu_dm_update_connector_after_detect(aconnector); register_backlight_device(dm, link); } @@ -2494,7 +2623,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector, if (dm_state && dm_state->freesync_capable) stream->ignore_msa_timing_param = true; finish: - if (sink && sink->sink_signal == SIGNAL_TYPE_VIRTUAL) + if (sink && sink->sink_signal == SIGNAL_TYPE_VIRTUAL && aconnector->base.force != DRM_FORCE_ON) dc_sink_release(sink); return stream; diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 37eaf72ace54..fced3c1c2ef5 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -195,7 +195,7 @@ static bool program_hpd_filter( return result; } -static bool detect_sink(struct dc_link *link, enum dc_connection_type *type) +bool dc_link_detect_sink(struct dc_link *link, enum dc_connection_type *type) { uint32_t is_hpd_high = 0; struct gpio *hpd_pin; @@ -604,7 +604,7 @@ bool dc_link_detect(struct dc_link *link, enum dc_detect_reason reason) if (link->connector_signal == SIGNAL_TYPE_VIRTUAL) return false; - if (false == detect_sink(link, &new_connection_type)) { + if (false == dc_link_detect_sink(link, &new_connection_type)) { BREAK_TO_DEBUGGER(); return false; } diff --git a/drivers/gpu/drm/amd/display/dc/dc_link.h b/drivers/gpu/drm/amd/display/dc/dc_link.h index d43cefbc43d3..1b48ab9aea89 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_link.h +++ b/drivers/gpu/drm/amd/display/dc/dc_link.h @@ -215,6 +215,7 @@ void dc_link_enable_hpd_filter(struct dc_link *link, bool enable); bool dc_link_is_dp_sink_present(struct dc_link *link); +bool dc_link_detect_sink(struct dc_link *link, enum dc_connection_type *type); /* * DPCD access interfaces */ From d6d1cd2578c4da0764ad334e3411c1c1b1557f58 Mon Sep 17 00:00:00 2001 From: Xue Liu Date: Fri, 31 Aug 2018 23:46:41 +0200 Subject: [PATCH 283/499] ieee802154: mcr20a: Replace magic number with constants 
The combination of defined constants is used to represent the state of the IRQ, so the magic numbers have been replaced. This is a simple coding style change which should have no impact on runtime code execution. Signed-off-by: Xue Liu Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mcr20a.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ieee802154/mcr20a.c b/drivers/net/ieee802154/mcr20a.c index e428277781ac..04891429a554 100644 --- a/drivers/net/ieee802154/mcr20a.c +++ b/drivers/net/ieee802154/mcr20a.c @@ -903,19 +903,19 @@ mcr20a_irq_clean_complete(void *context) switch (seq_state) { /* TX IRQ, RX IRQ and SEQ IRQ */ - case (0x03): + case (DAR_IRQSTS1_TXIRQ | DAR_IRQSTS1_SEQIRQ): if (lp->is_tx) { lp->is_tx = 0; dev_dbg(printdev(lp), "TX is done. No ACK\n"); mcr20a_handle_tx_complete(lp); } break; - case (0x05): + case (DAR_IRQSTS1_RXIRQ | DAR_IRQSTS1_SEQIRQ): /* rx is starting */ dev_dbg(printdev(lp), "RX is starting\n"); mcr20a_handle_rx(lp); break; - case (0x07): + case (DAR_IRQSTS1_RXIRQ | DAR_IRQSTS1_TXIRQ | DAR_IRQSTS1_SEQIRQ): if (lp->is_tx) { /* tx is done */ lp->is_tx = 0; @@ -927,7 +927,7 @@ mcr20a_irq_clean_complete(void *context) mcr20a_handle_rx(lp); } break; - case (0x01): + case (DAR_IRQSTS1_SEQIRQ): if (lp->is_tx) { dev_dbg(printdev(lp), "TX is starting\n"); mcr20a_handle_tx(lp); From 0f843e65d9eef4936929bb036c5f771fb261eea4 Mon Sep 17 00:00:00 2001 From: Guoju Fang Date: Thu, 27 Sep 2018 23:41:46 +0800 Subject: [PATCH 284/499] bcache: add separate workqueue for journal_write to avoid deadlock After a write to the SSD completes, bcache schedules journal_write work to system_wq, which is a public workqueue in the system, without the WQ_MEM_RECLAIM flag. system_wq is also a bound wq, and there may be no idle kworker on the current processor. Creating a new kworker may unfortunately need to reclaim memory first, by shrinking cache and slab used by vfs, which depends on the bcache device. That's a deadlock. This patch creates a new workqueue for journal_write with the WQ_MEM_RECLAIM flag. Its rescuer thread will work to avoid the deadlock.
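For illustration only (this sketch is not part of the patch; the queue and work names are invented): a workqueue allocated with WQ_MEM_RECLAIM gets a dedicated rescuer thread, so work queued on it can still make forward progress even when spawning a new kworker would itself need to reclaim memory through the very device the work services.

	struct workqueue_struct *journal_wq;

	/* WQ_MEM_RECLAIM keeps a rescuer thread around for this queue, so
	 * journal work does not depend on memory reclaim to get a worker.
	 */
	journal_wq = alloc_workqueue("example_journal", WQ_MEM_RECLAIM, 0);
	if (!journal_wq)
		return -ENOMEM;

	queue_work(journal_wq, &journal_work);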
Signed-off-by: Guoju Fang Cc: stable@vger.kernel.org Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/journal.c | 6 +++--- drivers/md/bcache/super.c | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 83504dd8100a..954dad29e6e8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -965,6 +965,7 @@ void bch_prio_write(struct cache *ca); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; +extern struct workqueue_struct *bch_journal_wq; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6116bbf870d8..522c7426f3a0 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -485,7 +485,7 @@ static void do_journal_discard(struct cache *ca) closure_get(&ca->set->cl); INIT_WORK(&ja->discard_work, journal_discard_work); - schedule_work(&ja->discard_work); + queue_work(bch_journal_wq, &ja->discard_work); } } @@ -592,7 +592,7 @@ static void journal_write_done(struct closure *cl) : &j->w[0]; __closure_wake_up(&w->wait); - continue_at_nobarrier(cl, journal_write, system_wq); + continue_at_nobarrier(cl, journal_write, bch_journal_wq); } static void journal_write_unlock(struct closure *cl) @@ -627,7 +627,7 @@ static void journal_write_unlocked(struct closure *cl) spin_unlock(&c->journal.lock); btree_flush_write(c); - continue_at(cl, journal_write, system_wq); + continue_at(cl, journal_write, bch_journal_wq); return; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 94c756c66bd7..30ba9aeb5ee8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -47,6 +47,7 @@ static int bcache_major; static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; +struct workqueue_struct *bch_journal_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) /* limitation of partitions number on single bcache device */ @@ -2341,6 +2342,9 @@ static void bcache_exit(void) kobject_put(bcache_kobj); if (bcache_wq) destroy_workqueue(bcache_wq); + if (bch_journal_wq) + destroy_workqueue(bch_journal_wq); + if (bcache_major) unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); @@ -2370,6 +2374,10 @@ static int __init bcache_init(void) if (!bcache_wq) goto err; + bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); + if (!bch_journal_wq) + goto err; + bcache_kobj = kobject_create_and_add("bcache", fs_kobj); if (!bcache_kobj) goto err; From bdec8d7fa55e6f5314ed72e5a0b435d90ff90548 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 27 Sep 2018 20:38:45 +0800 Subject: [PATCH 285/499] x86/boot: Fix kexec booting failure in the SEV bit detection code Commit 1958b5fc4010 ("x86/boot: Add early boot support when running with SEV active") can occasionally cause system resets when kexec-ing a second kernel even if SEV is not active. That's because get_sev_encryption_bit() uses 32-bit rIP-relative addressing to read the value of enc_bit - a variable which caches a previously detected encryption bit position - but kexec may allocate the early boot code to a higher location, beyond the 32-bit addressing limit. In this case, garbage will be read and get_sev_encryption_bit() will return the wrong value, leading to accessing memory with the wrong encryption setting. 
Therefore, remove enc_bit, and thus get rid of the need to do 32-bit rIP-relative addressing in the first place. [ bp: massage commit message heavily. ] Fixes: 1958b5fc4010 ("x86/boot: Add early boot support when running with SEV active") Suggested-by: Borislav Petkov Signed-off-by: Kairui Song Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Cc: linux-kernel@vger.kernel.org Cc: tglx@linutronix.de Cc: mingo@redhat.com Cc: hpa@zytor.com Cc: brijesh.singh@amd.com Cc: kexec@lists.infradead.org Cc: dyoung@redhat.com Cc: bhe@redhat.com Cc: ghook@redhat.com Link: https://lkml.kernel.org/r/20180927123845.32052-1-kasong@redhat.com --- arch/x86/boot/compressed/mem_encrypt.S | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index eaa843a52907..a480356e0ed8 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -25,20 +25,6 @@ ENTRY(get_sev_encryption_bit) push %ebx push %ecx push %edx - push %edi - - /* - * RIP-relative addressing is needed to access the encryption bit - * variable. Since we are running in 32-bit mode we need this call/pop - * sequence to get the proper relative addressing. - */ - call 1f 1: popl %edi - subl $1b, %edi - - movl enc_bit(%edi), %eax - cmpl $0, %eax - jge .Lsev_exit /* Check if running under a hypervisor */ movl $1, %eax @@ -69,15 +55,12 @@ ENTRY(get_sev_encryption_bit) movl %ebx, %eax andl $0x3f, %eax /* Return the encryption bit location */ - movl %eax, enc_bit(%edi) jmp .Lsev_exit .Lno_sev: xor %eax, %eax - movl %eax, enc_bit(%edi) .Lsev_exit: - pop %edi pop %edx pop %ecx pop %ebx @@ -113,8 +96,6 @@ ENTRY(set_sev_encryption_mask) ENDPROC(set_sev_encryption_mask) .data -enc_bit: - .int 0xffffffff #ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 From f52afc93cd018fe6910133a05d44671192d1aeb0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 27 Sep 2018 13:23:32 +0200 Subject: [PATCH 286/499] dax: Fix deadlock in dax_lock_mapping_entry() When dax_lock_mapping_entry() has to sleep to obtain entry lock, it will fail to unlock mapping->i_pages spinlock and thus immediately deadlock against itself when retrying to grab the entry lock again. Fix the problem by unlocking mapping->i_pages before retrying. Fixes: c2a7d2a11552 ("filesystem-dax: Introduce dax_lock_mapping_entry()") Reported-by: Barret Rhoden Signed-off-by: Jan Kara Signed-off-by: Dan Williams --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dax.c b/fs/dax.c index f32d7125ad0f..e4ef8af31aa6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -447,6 +447,7 @@ bool dax_lock_mapping_entry(struct page *page) xa_unlock_irq(&mapping->i_pages); break; } else if (IS_ERR(entry)) { + xa_unlock_irq(&mapping->i_pages); WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); continue; } From 3baafeffa48a12b3cec9a0b6d4049fba02d53cea Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 25 Sep 2018 16:56:53 +0300 Subject: [PATCH 287/499] iwlwifi: 1000: set the TFD queue size .max_tfd_queue_size was omitted for the 1000 card series, leading to an oops in swiotlb.
Fixes: 7b3e42ea2ead ("iwlwifi: support multiple tfd queue max sizes for different devices") Tested-by: Randy Dunlap Signed-off-by: Pavel Machek Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/cfg/1000.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/1000.c b/drivers/net/wireless/intel/iwlwifi/cfg/1000.c index 591687984962..497fd766d87c 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/1000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/1000.c @@ -51,6 +51,7 @@ static const struct iwl_base_params iwl1000_base_params = { .num_of_queues = IWLAGN_NUM_QUEUES, + .max_tfd_queue_size = 256, .eeprom_size = OTP_LOW_IMAGE_SIZE, .pll_cfg = true, .max_ll_items = OTP_MAX_LL_ITEMS_1000, From ce01a1575f45bf319e374592656441021a7f5823 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 27 Sep 2018 14:39:19 -0400 Subject: [PATCH 288/499] rseq/selftests: fix parametrized test with -fpie On x86-64, the parametrized selftest code for rseq crashes with a segmentation fault when compiled with -fpie. This happens when the param_test binary is loaded at an address beyond 32-bit on x86-64. The issue is caused by use of a 32-bit register to hold the address of the loop counter variable. Fix this by using a 64-bit register to calculate the address of the loop counter variables as an offset from rip. Signed-off-by: Mathieu Desnoyers Acked-by: "Paul E . McKenney" Cc: # v4.18 Cc: Shuah Khan Cc: Thomas Gleixner Cc: Joel Fernandes Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Dave Watson Cc: Will Deacon Cc: Andi Kleen Cc: linux-kselftest@vger.kernel.org Cc: "H . Peter Anvin" Cc: Chris Lameter Cc: Russell King Cc: Michael Kerrisk Cc: "Paul E . McKenney" Cc: Paul Turner Cc: Boqun Feng Cc: Josh Triplett Cc: Steven Rostedt Cc: Ben Maurer Cc: Andy Lutomirski Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/rseq/param_test.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index 642d4e12abea..eec2663261f2 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -56,15 +56,13 @@ unsigned int yield_mod_cnt, nr_abort; printf(fmt, ## __VA_ARGS__); \ } while (0) -#if defined(__x86_64__) || defined(__i386__) +#ifdef __i386__ #define INJECT_ASM_REG "eax" #define RSEQ_INJECT_CLOBBER \ , INJECT_ASM_REG -#ifdef __i386__ - #define RSEQ_INJECT_ASM(n) \ "mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \ "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \ @@ -76,9 +74,16 @@ unsigned int yield_mod_cnt, nr_abort; #elif defined(__x86_64__) +#define INJECT_ASM_REG_P "rax" +#define INJECT_ASM_REG "eax" + +#define RSEQ_INJECT_CLOBBER \ + , INJECT_ASM_REG_P \ + , INJECT_ASM_REG + #define RSEQ_INJECT_ASM(n) \ - "lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \ - "mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \ + "lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG_P "\n\t" \ + "mov (%%" INJECT_ASM_REG_P "), %%" INJECT_ASM_REG "\n\t" \ "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \ "jz 333f\n\t" \ "222:\n\t" \ @@ -86,10 +91,6 @@ unsigned int yield_mod_cnt, nr_abort; "jnz 222b\n\t" \ "333:\n\t" -#else -#error "Unsupported architecture" -#endif - #elif defined(__s390__) #define RSEQ_INJECT_INPUT \ From 587562d0c7cd6861f4f90a2eb811cccb1a376f5f Mon Sep 17 00:00:00 2001 From: 
Ilya Dryomov Date: Wed, 26 Sep 2018 14:35:50 +0200 Subject: [PATCH 289/499] blk-mq: I/O and timer unplugs are inverted in blktrace trace_block_unplug() takes true for explicit unplugs and false for implicit unplugs. schedule() unplugs are implicit and should be reported as timer unplugs. While correct in the legacy code, this has been inverted in blk-mq since 4.11. Cc: stable@vger.kernel.org Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers") Reviewed-by: Omar Sandoval Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 85a1c1a59c72..e3c39ea8e17b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1628,7 +1628,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - trace_block_unplug(this_q, depth, from_schedule); + trace_block_unplug(this_q, depth, !from_schedule); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); @@ -1648,7 +1648,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. */ if (this_ctx) { - trace_block_unplug(this_q, depth, from_schedule); + trace_block_unplug(this_q, depth, !from_schedule); blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, from_schedule); } From 083874549fdfefa629dfa752785e20427dde1511 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 27 Sep 2018 15:47:33 -0500 Subject: [PATCH 290/499] PCI: Reprogram bridge prefetch registers on resume On 38+ Intel-based ASUS products, the NVIDIA GPU becomes unusable after S3 suspend/resume. The affected products include multiple generations of NVIDIA GPUs and Intel SoCs. After resume, nouveau logs many errors such as: fifo: fault 00 [READ] at 0000005555555000 engine 00 [GR] client 04 [HUB/FE] reason 4a [] on channel -1 [007fa91000 unknown] DRM: failed to idle channel 0 [DRM] Similarly, the NVIDIA proprietary driver also fails after resume (black screen, 100% CPU usage in Xorg process). We shipped a sample to NVIDIA for diagnosis, and their response indicated that it's a problem with the parent PCI bridge (on the Intel SoC), not the GPU. Runtime suspend/resume works fine, only S3 suspend is affected. We found a workaround: on resume, rewrite the Intel PCI bridge 'Prefetchable Base Upper 32 Bits' register (PCI_PREF_BASE_UPPER32). In the cases that I checked, this register has value 0 and we just have to rewrite that value. Linux already saves and restores PCI config space during suspend/resume, but this register was being skipped because upon resume, it already has value 0 (the correct, pre-suspend value). Intel appear to have previously acknowledged this behaviour and the requirement to rewrite this register: https://bugzilla.kernel.org/show_bug.cgi?id=116851#c23 Based on that, rewrite the prefetch register values even when that appears unnecessary. We have confirmed this solution on all the affected models we have in-hands (X542UQ, UX533FD, X530UN, V272UN). Additionally, this solves an issue where r8169 MSI-X interrupts were broken after S3 suspend/resume on ASUS X441UAR. This issue was recently worked around in commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e"). It also fixes the same issue on RTL6186evl/8111evl on an Aimfor-tech laptop that we had not yet patched. I suspect it will also fix the issue that was worked around in commit 7c53a722459c ("r8169: don't use MSI-X on RTL8168g"). 
Thomas Martitz reports that this change also solves an issue where the AMD Radeon Polaris 10 GPU on the HP Zbook 14u G5 is unresponsive after S3 suspend/resume. Link: https://bugzilla.kernel.org/show_bug.cgi?id=201069 Signed-off-by: Daniel Drake Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki Reviewed-By: Peter Wu CC: stable@vger.kernel.org --- drivers/pci/pci.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 1835f3a7aa8d..51b6c81671c1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1289,12 +1289,12 @@ int pci_save_state(struct pci_dev *dev) EXPORT_SYMBOL(pci_save_state); static void pci_restore_config_dword(struct pci_dev *pdev, int offset, - u32 saved_val, int retry) + u32 saved_val, int retry, bool force) { u32 val; pci_read_config_dword(pdev, offset, &val); - if (val == saved_val) + if (!force && val == saved_val) return; for (;;) { @@ -1313,25 +1313,36 @@ static void pci_restore_config_dword(struct pci_dev *pdev, int offset, } static void pci_restore_config_space_range(struct pci_dev *pdev, - int start, int end, int retry) + int start, int end, int retry, + bool force) { int index; for (index = end; index >= start; index--) pci_restore_config_dword(pdev, 4 * index, pdev->saved_config_space[index], - retry); + retry, force); } static void pci_restore_config_space(struct pci_dev *pdev) { if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) { - pci_restore_config_space_range(pdev, 10, 15, 0); + pci_restore_config_space_range(pdev, 10, 15, 0, false); /* Restore BARs before the command register. */ - pci_restore_config_space_range(pdev, 4, 9, 10); - pci_restore_config_space_range(pdev, 0, 3, 0); + pci_restore_config_space_range(pdev, 4, 9, 10, false); + pci_restore_config_space_range(pdev, 0, 3, 0, false); + } else if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + pci_restore_config_space_range(pdev, 12, 15, 0, false); + + /* + * Force rewriting of prefetch registers to avoid S3 resume + * issues on Intel PCI bridges that occur when these + * registers are not explicitly written. + */ + pci_restore_config_space_range(pdev, 9, 11, 0, true); + pci_restore_config_space_range(pdev, 0, 8, 0, false); } else { - pci_restore_config_space_range(pdev, 0, 15, 0); + pci_restore_config_space_range(pdev, 0, 15, 0, false); } } From add92a817e60e308a419693413a38d9d1e663aff Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Wed, 19 Sep 2018 22:42:16 +0530 Subject: [PATCH 291/499] crypto: chelsio - Fix memory corruption in DMA Mapped buffers. Update PCI Id in "cpl_rx_phys_dsgl" header. In case pci_chan_id and tx_chan_id are not derived from same queue, H/W can send request completion indication before completing DMA Transfer. Herbert, It would be good if fix can be merge to stable tree. For 4.14 kernel, It requires some update to avoid mege conficts. 
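For illustration only (the struct and helper below are invented stand-ins, not the driver's own types): the crux of this fix is that the channel written into the cpl_rx_phys_dsgl header must be derived from the same queue index as the TX channel, so the hardware can only report completion on the queue that actually carries the DMA. A minimal sketch of that derivation:

	/* Invented stand-in for the driver context; illustrative only. */
	struct chan_map_sketch {
		unsigned int tx_qidx;		/* TX queue the request is submitted on */
		unsigned int tx_chan_id;	/* channel that owns this context */
		unsigned int pci_chan_id;	/* channel the DSGL completion is steered to */
	};

	static void map_channels_sketch(struct chan_map_sketch *m,
					unsigned int chan_id,
					unsigned int txq_perchan,
					unsigned int id)
	{
		m->tx_chan_id = chan_id;
		m->tx_qidx = chan_id * txq_perchan + id % txq_perchan;
		/*
		 * Derive the completion channel from the same queue index,
		 * so the hardware cannot signal completion on a channel
		 * other than the one performing the DMA transfer.
		 */
		m->pci_chan_id = m->tx_qidx / txq_perchan;
	}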
Cc: Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chcr_algo.c | 32 +++++++++++++++++++--------- drivers/crypto/chelsio/chcr_crypto.h | 2 ++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 5c539af8ed60..010bbf607797 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -367,7 +367,8 @@ static inline void dsgl_walk_init(struct dsgl_walk *walk, walk->to = (struct phys_sge_pairs *)(dsgl + 1); } -static inline void dsgl_walk_end(struct dsgl_walk *walk, unsigned short qid) +static inline void dsgl_walk_end(struct dsgl_walk *walk, unsigned short qid, + int pci_chan_id) { struct cpl_rx_phys_dsgl *phys_cpl; @@ -385,6 +386,7 @@ static inline void dsgl_walk_end(struct dsgl_walk *walk, unsigned short qid) phys_cpl->rss_hdr_int.opcode = CPL_RX_PHYS_ADDR; phys_cpl->rss_hdr_int.qid = htons(qid); phys_cpl->rss_hdr_int.hash_val = 0; + phys_cpl->rss_hdr_int.channel = pci_chan_id; } static inline void dsgl_walk_add_page(struct dsgl_walk *walk, @@ -718,7 +720,7 @@ static inline void create_wreq(struct chcr_context *ctx, FILL_WR_RX_Q_ID(ctx->dev->rx_channel_id, qid, !!lcb, ctx->tx_qidx); - chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->dev->tx_channel_id, + chcr_req->ulptx.cmd_dest = FILL_ULPTX_CMD_DEST(ctx->tx_chan_id, qid); chcr_req->ulptx.len = htonl((DIV_ROUND_UP(len16, 16) - ((sizeof(chcr_req->wreq)) >> 4))); @@ -1339,16 +1341,23 @@ static int chcr_device_init(struct chcr_context *ctx) adap->vres.ncrypto_fc); rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan; txq_perchan = ntxq / u_ctx->lldi.nchan; - rxq_idx = ctx->dev->tx_channel_id * rxq_perchan; - rxq_idx += id % rxq_perchan; - txq_idx = ctx->dev->tx_channel_id * txq_perchan; - txq_idx += id % txq_perchan; spin_lock(&ctx->dev->lock_chcr_dev); - ctx->rx_qidx = rxq_idx; - ctx->tx_qidx = txq_idx; + ctx->tx_chan_id = ctx->dev->tx_channel_id; ctx->dev->tx_channel_id = !ctx->dev->tx_channel_id; ctx->dev->rx_channel_id = 0; spin_unlock(&ctx->dev->lock_chcr_dev); + rxq_idx = ctx->tx_chan_id * rxq_perchan; + rxq_idx += id % rxq_perchan; + txq_idx = ctx->tx_chan_id * txq_perchan; + txq_idx += id % txq_perchan; + ctx->rx_qidx = rxq_idx; + ctx->tx_qidx = txq_idx; + /* Channel Id used by SGE to forward packet to Host. + * Same value should be used in cpl_fw6_pld RSS_CH field + * by FW. Driver programs PCI channel ID to be used in fw + * at the time of queue allocation with value "pi->tx_chan" + */ + ctx->pci_chan_id = txq_idx / txq_perchan; } out: return err; @@ -2503,6 +2512,7 @@ void chcr_add_aead_dst_ent(struct aead_request *req, struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct dsgl_walk dsgl_walk; unsigned int authsize = crypto_aead_authsize(tfm); + struct chcr_context *ctx = a_ctx(tfm); u32 temp; dsgl_walk_init(&dsgl_walk, phys_cpl); @@ -2512,7 +2522,7 @@ void chcr_add_aead_dst_ent(struct aead_request *req, dsgl_walk_add_page(&dsgl_walk, IV, &reqctx->iv_dma); temp = req->cryptlen + (reqctx->op ? 
-authsize : authsize); dsgl_walk_add_sg(&dsgl_walk, req->dst, temp, req->assoclen); - dsgl_walk_end(&dsgl_walk, qid); + dsgl_walk_end(&dsgl_walk, qid, ctx->pci_chan_id); } void chcr_add_cipher_src_ent(struct ablkcipher_request *req, @@ -2544,6 +2554,8 @@ void chcr_add_cipher_dst_ent(struct ablkcipher_request *req, unsigned short qid) { struct chcr_blkcipher_req_ctx *reqctx = ablkcipher_request_ctx(req); + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(wrparam->req); + struct chcr_context *ctx = c_ctx(tfm); struct dsgl_walk dsgl_walk; dsgl_walk_init(&dsgl_walk, phys_cpl); @@ -2552,7 +2564,7 @@ void chcr_add_cipher_dst_ent(struct ablkcipher_request *req, reqctx->dstsg = dsgl_walk.last_sg; reqctx->dst_ofst = dsgl_walk.last_sg_len; - dsgl_walk_end(&dsgl_walk, qid); + dsgl_walk_end(&dsgl_walk, qid, ctx->pci_chan_id); } void chcr_add_hash_src_ent(struct ahash_request *req, diff --git a/drivers/crypto/chelsio/chcr_crypto.h b/drivers/crypto/chelsio/chcr_crypto.h index 54835cb109e5..0d2c70c344f3 100644 --- a/drivers/crypto/chelsio/chcr_crypto.h +++ b/drivers/crypto/chelsio/chcr_crypto.h @@ -255,6 +255,8 @@ struct chcr_context { struct chcr_dev *dev; unsigned char tx_qidx; unsigned char rx_qidx; + unsigned char tx_chan_id; + unsigned char pci_chan_id; struct __crypto_ctx crypto_ctx[0]; }; From d80771c08363ad7fbf0f56f5301e7ca65065c582 Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Fri, 21 Sep 2018 18:03:18 +0300 Subject: [PATCH 292/499] crypto: mxs-dcp - Fix wait logic on chan threads When compiling with CONFIG_DEBUG_ATOMIC_SLEEP=y the mxs-dcp driver prints warnings such as: WARNING: CPU: 0 PID: 120 at kernel/sched/core.c:7736 __might_sleep+0x98/0x9c do not call blocking ops when !TASK_RUNNING; state=1 set at [<8081978c>] dcp_chan_thread_sha+0x3c/0x2ec The problem is that blocking ops will manipulate current->state themselves so it is not allowed to call them between set_current_state(TASK_INTERRUPTIBLE) and schedule(). Fix this by converting the per-chan mutex to a spinlock (it only protects tiny list ops anyway) and rearranging the wait logic so that callbacks are called with current->state set to TASK_RUNNING. Those callbacks will indeed call blocking ops themselves so this is required.
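For illustration only (the queue type and its callbacks below are invented stand-ins; only the kernel scheduling and locking APIs are real): a minimal sketch of the corrected per-channel wait pattern, in which work callbacks only run once the task state has been put back to TASK_RUNNING:

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	/* Invented stand-in for a per-channel work queue. */
	struct chan_sketch {
		spinlock_t lock;
		void *(*dequeue)(struct chan_sketch *c);	/* tiny list op only */
		void (*process)(void *item);			/* may call blocking ops */
	};

	static int chan_thread_sketch(void *data)
	{
		struct chan_sketch *c = data;
		void *item;

		while (!kthread_should_stop()) {
			set_current_state(TASK_INTERRUPTIBLE);

			spin_lock(&c->lock);			/* spinlock: no sleeping here */
			item = c->dequeue(c);
			spin_unlock(&c->lock);

			if (!item) {
				schedule();			/* nothing queued: sleep */
				continue;
			}

			set_current_state(TASK_RUNNING);	/* back to RUNNING first */
			c->process(item);			/* blocking ops are safe now */
		}
		return 0;
	}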
Cc: Signed-off-by: Leonard Crestez Signed-off-by: Herbert Xu --- drivers/crypto/mxs-dcp.c | 53 +++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index a10c418d4e5c..56bd28174f52 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -63,7 +63,7 @@ struct dcp { struct dcp_coherent_block *coh; struct completion completion[DCP_MAX_CHANS]; - struct mutex mutex[DCP_MAX_CHANS]; + spinlock_t lock[DCP_MAX_CHANS]; struct task_struct *thread[DCP_MAX_CHANS]; struct crypto_queue queue[DCP_MAX_CHANS]; }; @@ -349,13 +349,20 @@ static int dcp_chan_thread_aes(void *data) int ret; - do { - __set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); - mutex_lock(&sdcp->mutex[chan]); + spin_lock(&sdcp->lock[chan]); backlog = crypto_get_backlog(&sdcp->queue[chan]); arq = crypto_dequeue_request(&sdcp->queue[chan]); - mutex_unlock(&sdcp->mutex[chan]); + spin_unlock(&sdcp->lock[chan]); + + if (!backlog && !arq) { + schedule(); + continue; + } + + set_current_state(TASK_RUNNING); if (backlog) backlog->complete(backlog, -EINPROGRESS); @@ -363,11 +370,8 @@ static int dcp_chan_thread_aes(void *data) if (arq) { ret = mxs_dcp_aes_block_crypt(arq); arq->complete(arq, ret); - continue; } - - schedule(); - } while (!kthread_should_stop()); + } return 0; } @@ -409,9 +413,9 @@ static int mxs_dcp_aes_enqueue(struct ablkcipher_request *req, int enc, int ecb) rctx->ecb = ecb; actx->chan = DCP_CHAN_CRYPTO; - mutex_lock(&sdcp->mutex[actx->chan]); + spin_lock(&sdcp->lock[actx->chan]); ret = crypto_enqueue_request(&sdcp->queue[actx->chan], &req->base); - mutex_unlock(&sdcp->mutex[actx->chan]); + spin_unlock(&sdcp->lock[actx->chan]); wake_up_process(sdcp->thread[actx->chan]); @@ -640,13 +644,20 @@ static int dcp_chan_thread_sha(void *data) struct ahash_request *req; int ret, fini; - do { - __set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); - mutex_lock(&sdcp->mutex[chan]); + spin_lock(&sdcp->lock[chan]); backlog = crypto_get_backlog(&sdcp->queue[chan]); arq = crypto_dequeue_request(&sdcp->queue[chan]); - mutex_unlock(&sdcp->mutex[chan]); + spin_unlock(&sdcp->lock[chan]); + + if (!backlog && !arq) { + schedule(); + continue; + } + + set_current_state(TASK_RUNNING); if (backlog) backlog->complete(backlog, -EINPROGRESS); @@ -658,12 +669,8 @@ static int dcp_chan_thread_sha(void *data) ret = dcp_sha_req_to_buf(arq); fini = rctx->fini; arq->complete(arq, ret); - if (!fini) - continue; } - - schedule(); - } while (!kthread_should_stop()); + } return 0; } @@ -721,9 +728,9 @@ static int dcp_sha_update_fx(struct ahash_request *req, int fini) rctx->init = 1; } - mutex_lock(&sdcp->mutex[actx->chan]); + spin_lock(&sdcp->lock[actx->chan]); ret = crypto_enqueue_request(&sdcp->queue[actx->chan], &req->base); - mutex_unlock(&sdcp->mutex[actx->chan]); + spin_unlock(&sdcp->lock[actx->chan]); wake_up_process(sdcp->thread[actx->chan]); mutex_unlock(&actx->mutex); @@ -997,7 +1004,7 @@ static int mxs_dcp_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sdcp); for (i = 0; i < DCP_MAX_CHANS; i++) { - mutex_init(&sdcp->mutex[i]); + spin_lock_init(&sdcp->lock[i]); init_completion(&sdcp->completion[i]); crypto_init_queue(&sdcp->queue[i], 50); } From ba439a6cbfa2936a6713f64cb499de7943673fe3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 22 Sep 2018 20:41:55 -0400 Subject: [PATCH 293/499] crypto: 
qat - Fix KASAN stack-out-of-bounds bug in adf_probe() The following KASAN warning was printed when booting a 64-bit kernel on some systems with Intel CPUs: [ 44.512826] ================================================================== [ 44.520165] BUG: KASAN: stack-out-of-bounds in find_first_bit+0xb0/0xc0 [ 44.526786] Read of size 8 at addr ffff88041e02fc50 by task kworker/0:2/124 [ 44.535253] CPU: 0 PID: 124 Comm: kworker/0:2 Tainted: G X --------- --- 4.18.0-12.el8.x86_64+debug #1 [ 44.545858] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS BKVDTRL1.86B.0005.D08.1712070559 12/07/2017 [ 44.555682] Workqueue: events work_for_cpu_fn [ 44.560043] Call Trace: [ 44.562502] dump_stack+0x9a/0xe9 [ 44.565832] print_address_description+0x65/0x22e [ 44.570683] ? find_first_bit+0xb0/0xc0 [ 44.570689] kasan_report.cold.6+0x92/0x19f [ 44.578726] find_first_bit+0xb0/0xc0 [ 44.578737] adf_probe+0x9eb/0x19a0 [qat_c62x] [ 44.578751] ? adf_remove+0x110/0x110 [qat_c62x] [ 44.591490] ? mark_held_locks+0xc8/0x140 [ 44.591498] ? _raw_spin_unlock+0x30/0x30 [ 44.591505] ? trace_hardirqs_on_caller+0x381/0x570 [ 44.604418] ? adf_remove+0x110/0x110 [qat_c62x] [ 44.604427] local_pci_probe+0xd4/0x180 [ 44.604432] ? pci_device_shutdown+0x110/0x110 [ 44.617386] work_for_cpu_fn+0x51/0xa0 [ 44.621145] process_one_work+0x8fe/0x16e0 [ 44.625263] ? pwq_dec_nr_in_flight+0x2d0/0x2d0 [ 44.629799] ? lock_acquire+0x14c/0x400 [ 44.633645] ? move_linked_works+0x12e/0x2a0 [ 44.637928] worker_thread+0x536/0xb50 [ 44.641690] ? __kthread_parkme+0xb6/0x180 [ 44.645796] ? process_one_work+0x16e0/0x16e0 [ 44.650160] kthread+0x30c/0x3d0 [ 44.653400] ? kthread_create_worker_on_cpu+0xc0/0xc0 [ 44.658457] ret_from_fork+0x3a/0x50 [ 44.663557] The buggy address belongs to the page: [ 44.668350] page:ffffea0010780bc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 44.676356] flags: 0x17ffffc0000000() [ 44.680023] raw: 0017ffffc0000000 ffffea0010780bc8 ffffea0010780bc8 0000000000000000 [ 44.687769] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 44.695510] page dumped because: kasan: bad access detected [ 44.702578] Memory state around the buggy address: [ 44.707372] ffff88041e02fb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 44.714593] ffff88041e02fb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 44.721810] >ffff88041e02fc00: 00 00 00 00 00 00 f1 f1 f1 f1 04 f2 f2 f2 f2 f2 [ 44.729028] ^ [ 44.734864] ffff88041e02fc80: f2 f2 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 [ 44.742082] ffff88041e02fd00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 44.749299] ================================================================== Looking into the code: int ret, bar_mask; : for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, It is casting a 32-bit integer pointer to a 64-bit unsigned long pointer. There are two problems here. First, the 32-bit pointer address may not be 64-bit aligned. Secondly, it is accessing an extra 4 bytes. This is fixed by changing the bar_mask type to unsigned long. 
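For illustration only (walk_bars_sketch() is an invented stand-in, not driver code): for_each_set_bit() dereferences its second argument as an array of unsigned long, so the mask storage handed to it must genuinely be that wide and naturally aligned. A minimal sketch of the safe pattern:

	#include <linux/bitops.h>
	#include <linux/types.h>

	static void walk_bars_sketch(u32 raw_mask)
	{
		unsigned long bar_mask = raw_mask;	/* full-width, aligned copy */
		unsigned int bar_nr;

		/*
		 * Safe: &bar_mask really points at sizeof(unsigned long) bytes.
		 * Casting &raw_mask (a 4-byte object) here instead is what KASAN
		 * reported as a stack-out-of-bounds read on 64-bit kernels.
		 */
		for_each_set_bit(bar_nr, &bar_mask, BITS_PER_LONG) {
			/* map BAR bar_nr here */
		}
	}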
Cc: Signed-off-by: Waiman Long Signed-off-by: Herbert Xu --- drivers/crypto/qat/qat_c3xxx/adf_drv.c | 6 +++--- drivers/crypto/qat/qat_c3xxxvf/adf_drv.c | 6 +++--- drivers/crypto/qat/qat_c62x/adf_drv.c | 6 +++--- drivers/crypto/qat/qat_c62xvf/adf_drv.c | 6 +++--- drivers/crypto/qat/qat_dh895xcc/adf_drv.c | 6 +++--- drivers/crypto/qat/qat_dh895xccvf/adf_drv.c | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/crypto/qat/qat_c3xxx/adf_drv.c b/drivers/crypto/qat/qat_c3xxx/adf_drv.c index ba197f34c252..763c2166ee0e 100644 --- a/drivers/crypto/qat/qat_c3xxx/adf_drv.c +++ b/drivers/crypto/qat/qat_c3xxx/adf_drv.c @@ -123,7 +123,8 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct adf_hw_device_data *hw_data; char name[ADF_DEVICE_NAME_LENGTH]; unsigned int i, bar_nr; - int ret, bar_mask; + unsigned long bar_mask; + int ret; switch (ent->device) { case ADF_C3XXX_PCI_DEVICE_ID: @@ -235,8 +236,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Find and map all the device's BARS */ i = 0; bar_mask = pci_select_bars(pdev, IORESOURCE_MEM); - for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, - ADF_PCI_MAX_BARS * 2) { + for_each_set_bit(bar_nr, &bar_mask, ADF_PCI_MAX_BARS * 2) { struct adf_bar *bar = &accel_pci_dev->pci_bars[i++]; bar->base_addr = pci_resource_start(pdev, bar_nr); diff --git a/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c b/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c index 24ec908eb26c..613c7d5644ce 100644 --- a/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c +++ b/drivers/crypto/qat/qat_c3xxxvf/adf_drv.c @@ -125,7 +125,8 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct adf_hw_device_data *hw_data; char name[ADF_DEVICE_NAME_LENGTH]; unsigned int i, bar_nr; - int ret, bar_mask; + unsigned long bar_mask; + int ret; switch (ent->device) { case ADF_C3XXXIOV_PCI_DEVICE_ID: @@ -215,8 +216,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Find and map all the device's BARS */ i = 0; bar_mask = pci_select_bars(pdev, IORESOURCE_MEM); - for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, - ADF_PCI_MAX_BARS * 2) { + for_each_set_bit(bar_nr, &bar_mask, ADF_PCI_MAX_BARS * 2) { struct adf_bar *bar = &accel_pci_dev->pci_bars[i++]; bar->base_addr = pci_resource_start(pdev, bar_nr); diff --git a/drivers/crypto/qat/qat_c62x/adf_drv.c b/drivers/crypto/qat/qat_c62x/adf_drv.c index 59a5a0df50b6..9cb832963357 100644 --- a/drivers/crypto/qat/qat_c62x/adf_drv.c +++ b/drivers/crypto/qat/qat_c62x/adf_drv.c @@ -123,7 +123,8 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct adf_hw_device_data *hw_data; char name[ADF_DEVICE_NAME_LENGTH]; unsigned int i, bar_nr; - int ret, bar_mask; + unsigned long bar_mask; + int ret; switch (ent->device) { case ADF_C62X_PCI_DEVICE_ID: @@ -235,8 +236,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Find and map all the device's BARS */ i = (hw_data->fuses & ADF_DEVICE_FUSECTL_MASK) ? 
1 : 0; bar_mask = pci_select_bars(pdev, IORESOURCE_MEM); - for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, - ADF_PCI_MAX_BARS * 2) { + for_each_set_bit(bar_nr, &bar_mask, ADF_PCI_MAX_BARS * 2) { struct adf_bar *bar = &accel_pci_dev->pci_bars[i++]; bar->base_addr = pci_resource_start(pdev, bar_nr); diff --git a/drivers/crypto/qat/qat_c62xvf/adf_drv.c b/drivers/crypto/qat/qat_c62xvf/adf_drv.c index b9f3e0e4fde9..278452b8ef81 100644 --- a/drivers/crypto/qat/qat_c62xvf/adf_drv.c +++ b/drivers/crypto/qat/qat_c62xvf/adf_drv.c @@ -125,7 +125,8 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct adf_hw_device_data *hw_data; char name[ADF_DEVICE_NAME_LENGTH]; unsigned int i, bar_nr; - int ret, bar_mask; + unsigned long bar_mask; + int ret; switch (ent->device) { case ADF_C62XIOV_PCI_DEVICE_ID: @@ -215,8 +216,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Find and map all the device's BARS */ i = 0; bar_mask = pci_select_bars(pdev, IORESOURCE_MEM); - for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, - ADF_PCI_MAX_BARS * 2) { + for_each_set_bit(bar_nr, &bar_mask, ADF_PCI_MAX_BARS * 2) { struct adf_bar *bar = &accel_pci_dev->pci_bars[i++]; bar->base_addr = pci_resource_start(pdev, bar_nr); diff --git a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c index be5c5a988ca5..3a9708ef4ce2 100644 --- a/drivers/crypto/qat/qat_dh895xcc/adf_drv.c +++ b/drivers/crypto/qat/qat_dh895xcc/adf_drv.c @@ -123,7 +123,8 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct adf_hw_device_data *hw_data; char name[ADF_DEVICE_NAME_LENGTH]; unsigned int i, bar_nr; - int ret, bar_mask; + unsigned long bar_mask; + int ret; switch (ent->device) { case ADF_DH895XCC_PCI_DEVICE_ID: @@ -237,8 +238,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Find and map all the device's BARS */ i = 0; bar_mask = pci_select_bars(pdev, IORESOURCE_MEM); - for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, - ADF_PCI_MAX_BARS * 2) { + for_each_set_bit(bar_nr, &bar_mask, ADF_PCI_MAX_BARS * 2) { struct adf_bar *bar = &accel_pci_dev->pci_bars[i++]; bar->base_addr = pci_resource_start(pdev, bar_nr); diff --git a/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c b/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c index 26ab17bfc6da..3da0f951cb59 100644 --- a/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c +++ b/drivers/crypto/qat/qat_dh895xccvf/adf_drv.c @@ -125,7 +125,8 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) struct adf_hw_device_data *hw_data; char name[ADF_DEVICE_NAME_LENGTH]; unsigned int i, bar_nr; - int ret, bar_mask; + unsigned long bar_mask; + int ret; switch (ent->device) { case ADF_DH895XCCIOV_PCI_DEVICE_ID: @@ -215,8 +216,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Find and map all the device's BARS */ i = 0; bar_mask = pci_select_bars(pdev, IORESOURCE_MEM); - for_each_set_bit(bar_nr, (const unsigned long *)&bar_mask, - ADF_PCI_MAX_BARS * 2) { + for_each_set_bit(bar_nr, &bar_mask, ADF_PCI_MAX_BARS * 2) { struct adf_bar *bar = &accel_pci_dev->pci_bars[i++]; bar->base_addr = pci_resource_start(pdev, bar_nr); From 7e0cf1c983b5b24426d130fd949a055d520acc9a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 28 Sep 2018 14:53:18 +1000 Subject: [PATCH 294/499] selftests/powerpc: Fix Makefiles for headers_install change Commit b2d35fa5fc80 ("selftests: add headers_install to lib.mk") 
introduced a requirement that Makefiles more than one level below the selftests directory need to define top_srcdir, but it didn't update any of the powerpc Makefiles. This broke building all the powerpc selftests with eg: make[1]: Entering directory '/src/linux/tools/testing/selftests/powerpc' BUILD_TARGET=/src/linux/tools/testing/selftests/powerpc/alignment; mkdir -p $BUILD_TARGET; make OUTPUT=$BUILD_TARGET -k -C alignment all make[2]: Entering directory '/src/linux/tools/testing/selftests/powerpc/alignment' ../../lib.mk:20: ../../../../scripts/subarch.include: No such file or directory make[2]: *** No rule to make target '../../../../scripts/subarch.include'. make[2]: Failed to remake makefile '../../../../scripts/subarch.include'. Makefile:38: recipe for target 'alignment' failed Fix it by setting top_srcdir in the affected Makefiles. Fixes: b2d35fa5fc80 ("selftests: add headers_install to lib.mk") Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/alignment/Makefile | 1 + tools/testing/selftests/powerpc/benchmarks/Makefile | 1 + tools/testing/selftests/powerpc/cache_shape/Makefile | 1 + tools/testing/selftests/powerpc/copyloops/Makefile | 1 + tools/testing/selftests/powerpc/dscr/Makefile | 1 + tools/testing/selftests/powerpc/math/Makefile | 1 + tools/testing/selftests/powerpc/mm/Makefile | 1 + tools/testing/selftests/powerpc/pmu/Makefile | 1 + tools/testing/selftests/powerpc/pmu/ebb/Makefile | 1 + tools/testing/selftests/powerpc/primitives/Makefile | 1 + tools/testing/selftests/powerpc/ptrace/Makefile | 1 + tools/testing/selftests/powerpc/signal/Makefile | 1 + tools/testing/selftests/powerpc/stringloops/Makefile | 1 + tools/testing/selftests/powerpc/switch_endian/Makefile | 1 + tools/testing/selftests/powerpc/syscalls/Makefile | 1 + tools/testing/selftests/powerpc/tm/Makefile | 1 + tools/testing/selftests/powerpc/vphn/Makefile | 1 + 17 files changed, 17 insertions(+) diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93baacab7693..d056486f49de 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -1,5 +1,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index b4d7432a0ecd..d40300a65b42 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,6 +4,7 @@ TEST_GEN_FILES := exec_target CFLAGS += -O2 +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 1be547434a49..ede4d3dae750 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -5,6 +5,7 @@ all: $(TEST_PROGS) $(TEST_PROGS): ../harness.c ../utils.c +top_srcdir = ../../../../.. 
include ../../lib.mk clean: diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 1cf89a34d97c..44574f3818b3 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -17,6 +17,7 @@ TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ EXTRA_SOURCES := validate.c ../harness.c stubs.S +top_srcdir = ../../../../.. include ../../lib.mk $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 55d7db7a616b..5df476364b4d 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test \ dscr_sysfs_thread_test +top_srcdir = ../../../../.. include ../../lib.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 0dd3a01fdab9..11a10d7a2bbd 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal vmx_syscall vmx_preempt vmx_signal vsx_preempt +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 8ebbe96d80a8..33ced6e0ad25 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -5,6 +5,7 @@ noarg: TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors TEST_GEN_FILES := tempfile +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 6e1629bf5b09..19046db995fe 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -5,6 +5,7 @@ noarg: TEST_GEN_PROGS := count_instructions l3_bank_test per_event_excludes EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c +top_srcdir = ../../../../.. include ../../lib.mk all: $(TEST_GEN_PROGS) ebb diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index c4e64bc2e265..bd5dfa509272 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -17,6 +17,7 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ lost_exception_test no_handler_test \ cycles_with_mmcr2_test +top_srcdir = ../../../../../.. include ../../../lib.mk $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \ diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 175366db7be8..ea2b7bd09e36 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -2,6 +2,7 @@ CFLAGS += -I$(CURDIR) TEST_GEN_PROGS := load_unaligned_zeropad +top_srcdir = ../../../../.. 
include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 28f5b781a553..923d531265f8 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -4,6 +4,7 @@ TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \ ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \ perf-hwbreak +top_srcdir = ../../../../.. include ../../lib.mk all: $(TEST_PROGS) diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index a7cbd5082e27..1fca25c6ace0 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -8,6 +8,7 @@ $(TEST_PROGS): ../harness.c ../utils.c signal.S CFLAGS += -maltivec signal_tm: CFLAGS += -mhtm +top_srcdir = ../../../../.. include ../../lib.mk clean: diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 10b35c87a4f4..7fc0623d85c3 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -29,6 +29,7 @@ endif ASFLAGS = $(CFLAGS) +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index 30b8ff8fb82e..fcd2dcb8972b 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -5,6 +5,7 @@ ASFLAGS += -O2 -Wall -g -nostdlib -m64 EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S +top_srcdir = ../../../../.. include ../../lib.mk $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index da22ca7c38c1..161b8846336f 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -2,6 +2,7 @@ TEST_GEN_PROGS := ipc_unmuxed CFLAGS += -I../../../../../usr/include +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index c0e45d2dde25..9fc2cf6fbc92 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \ $(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index f8ced26748f8..fb82068c9fda 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -2,6 +2,7 @@ TEST_GEN_PROGS := test-vphn CFLAGS += -m64 +top_srcdir = ../../../../.. 
include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c From 3cc5746e5ad7688e274e193fa71278d98aa52759 Mon Sep 17 00:00:00 2001 From: Nilesh Javali Date: Thu, 27 Sep 2018 05:15:35 -0700 Subject: [PATCH 295/499] scsi: qedi: Initialize the stats mutex lock Fix kernel NULL pointer dereference, Call Trace: [] __mutex_lock_slowpath+0xa6/0x1d0 [] mutex_lock+0x1f/0x2f [] qedi_get_protocol_tlv_data+0x61/0x450 [qedi] [] ? map_vm_area+0x2e/0x40 [] ? __vmalloc_node_range+0x170/0x280 [] ? qed_mfw_process_tlv_req+0x27d/0xbd0 [qed] [] qed_mfw_fill_tlv_data+0x4b/0xb0 [qed] [] qed_mfw_process_tlv_req+0x299/0xbd0 [qed] [] ? __switch_to+0xce/0x580 [] qed_slowpath_task+0x5b/0x80 [qed] Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qedi/qedi_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index cc8e64dc65ad..e5bd035ebad0 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -2472,6 +2472,7 @@ static int __qedi_probe(struct pci_dev *pdev, int mode) /* start qedi context */ spin_lock_init(&qedi->hba_lock); spin_lock_init(&qedi->task_idx_lock); + mutex_init(&qedi->stats_lock); } qedi_ops->ll2->register_cb_ops(qedi->cdev, &qedi_ll2_cb_ops, qedi); qedi_ops->ll2->start(qedi->cdev, ¶ms); From dc71db34e4f3c06b8277c8f3c2ff014610607a8c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: [PATCH 296/499] rxrpc: Fix checks as to whether we should set up a new call There's a check in rxrpc_data_ready() that's checking the CLIENT_INITIATED flag in the packet type field rather than in the packet flags field. Fix this by creating a pair of helper functions to check whether the packet is going to the client or to the server and use them generally. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 10 ++++++++++ net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 12 ++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c97558710421..9fcb3e197b14 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -463,6 +463,16 @@ struct rxrpc_connection { u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; +static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp) +{ + return sp->hdr.flags & RXRPC_CLIENT_INITIATED; +} + +static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp) +{ + return !rxrpc_to_server(sp); +} + /* * Flags in call->flags. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1746b48cb165..390ba50cfab4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -96,7 +96,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) { + if (rxrpc_to_server(sp)) { /* We need to look up service connections by the full protocol * parameter set. We look up the peer first as an intermediate * step and then the connection from the peer's tree. diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cfdc199c6351..ec299c627f77 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1177,10 +1177,6 @@ void rxrpc_data_ready(struct sock *udp_sk) trace_rxrpc_rx_packet(sp); - _net("Rx RxRPC %s ep=%x call=%x:%x", - sp->hdr.flags & RXRPC_CLIENT_INITIATED ? 
"ToServer" : "ToClient", - sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { _proto("Rx Bad Packet Type %u", sp->hdr.type); @@ -1189,13 +1185,13 @@ void rxrpc_data_ready(struct sock *udp_sk) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) + if (rxrpc_to_client(sp)) goto discard; rxrpc_post_packet_to_local(local, skb); goto out; case RXRPC_PACKET_TYPE_BUSY: - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + if (rxrpc_to_server(sp)) goto discard; /* Fall through */ @@ -1280,7 +1276,7 @@ void rxrpc_data_ready(struct sock *udp_sk) call = rcu_dereference(chan->call); if (sp->hdr.callNumber > chan->call_id) { - if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) { + if (rxrpc_to_client(sp)) { rcu_read_unlock(); goto reject_packet; } @@ -1303,7 +1299,7 @@ void rxrpc_data_ready(struct sock *udp_sk) } if (!call || atomic_read(&call->usage) == 0) { - if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + if (rxrpc_to_client(sp) || sp->hdr.callNumber == 0 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message_unlock; From b604dd9883f783a94020d772e4fe03160f455372 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: [PATCH 297/499] rxrpc: Fix RTT gathering Fix RTT information gathering in AF_RXRPC by the following means: (1) Enable Rx timestamping on the transport socket with SO_TIMESTAMPNS. (2) If the sk_buff doesn't have a timestamp set when rxrpc_data_ready() collects it, set it at that point. (3) Allow ACKs to be requested on the last packet of a client call, but not a service call. We need to be careful lest we undo: bf7d620abf22c321208a4da4f435e7af52551a21 Author: David Howells Date: Thu Oct 6 08:11:51 2016 +0100 rxrpc: Don't request an ACK on the last DATA packet of a call's Tx phase but that only really applies to service calls that we're handling, since the client side gets to send the final ACK (or not). (4) When about to transmit an ACK or DATA packet, record the Tx timestamp before only; don't update the timestamp afterwards. (5) Switch the ordering between recording the serial and recording the timestamp to always set the serial number first. The serial number shouldn't be seen referenced by an ACK packet until we've transmitted the packet bearing it - so in the Rx path, we don't need the timestamp until we've checked the serial number. Fixes: cf1a6474f807 ("rxrpc: Add per-peer RTT tracker") Signed-off-by: David Howells --- net/rxrpc/input.c | 8 ++++++-- net/rxrpc/local_object.c | 9 +++++++++ net/rxrpc/output.c | 31 ++++++++++++++++++------------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ec299c627f77..7f9ed3a60b9a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -622,13 +622,14 @@ static void rxrpc_input_requested_ack(struct rxrpc_call *call, if (!skb) continue; + sent_at = skb->tstamp; + smp_rmb(); /* Read timestamp before serial. 
*/ sp = rxrpc_skb(skb); if (sp->hdr.serial != orig_serial) continue; - smp_rmb(); - sent_at = skb->tstamp; goto found; } + return; found: @@ -1143,6 +1144,9 @@ void rxrpc_data_ready(struct sock *udp_sk) return; } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 777c3ed4cfc0..81de7d889ffa 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -173,6 +173,15 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) _debug("setsockopt failed"); goto error; } + + /* We want receive timestamps. */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS, + (char *)&opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } break; default: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index ccf5de160444..8a4da3fe96df 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -124,7 +124,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; - ktime_t now; size_t len, n; int ret; u8 reason; @@ -196,9 +195,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, /* We need to stick a time in before we send the packet in case * the reply gets back before kernel_sendmsg() completes - but * asking UDP to send the packet can take a relatively long - * time, so we update the time after, on the assumption that - * the packet transmission is more likely to happen towards the - * end of the kernel_sendmsg() call. + * time. */ call->ping_time = ktime_get_real(); set_bit(RXRPC_CALL_PINGING, &call->flags); @@ -206,9 +203,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - now = ktime_get_real(); - if (ping) - call->ping_time = now; conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -363,8 +357,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. + * + * However, we mustn't request an ACK on the last reply packet of a + * service call, lest OpenAFS incorrectly send us an ACK with some + * soft-ACKs in it and then never follow up with a proper hard ACK. 
*/ - if (!(sp->hdr.flags & RXRPC_LAST_PACKET) && + if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || + rxrpc_to_server(sp) + ) && (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || @@ -390,6 +390,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, goto send_fragmentable; down_read(&conn->params.local->defrag_sem); + + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -413,12 +418,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, lost); if (ret >= 0) { - ktime_t now = ktime_get_real(); - skb->tstamp = now; - smp_wmb(); - sp->hdr.serial = serial; if (whdr.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = now; + call->peer->rtt_last_req = skb->tstamp; trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); if (call->peer->rtt_usage > 1) { unsigned long nowj = jiffies, ack_lost_at; @@ -457,6 +458,10 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, down_write(&conn->params.local->defrag_sem); + sp->hdr.serial = serial; + smp_wmb(); /* Set serial before timestamp */ + skb->tstamp = ktime_get_real(); + switch (conn->params.local->srx.transport.family) { case AF_INET: opt = IP_PMTUDISC_DONT; From ece64fec164f523bfbe874abdef2a0e6ff376251 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: [PATCH 298/499] rxrpc: Emit BUSY packets when supposed to rather than ABORTs In the input path, a received sk_buff can be marked for rejection by setting RXRPC_SKB_MARK_* in skb->mark and, if needed, some auxiliary data (such as an abort code) in skb->priority. The rejection is handled by queueing the sk_buff up for dealing with in process context. The output code reads the mark and priority and, theoretically, generates an appropriate response packet. However, if RXRPC_SKB_MARK_BUSY is set, this isn't noticed and an ABORT message with a random abort code is generated (since skb->priority wasn't set to anything). Fix this by outputting the appropriate sort of packet. Also, whilst we're at it, most of the marks are no longer used, so remove them and rename the remaining two to something more obvious. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 13 ++++--------- net/rxrpc/call_accept.c | 6 +++--- net/rxrpc/input.c | 2 +- net/rxrpc/output.c | 23 ++++++++++++++++++----- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9fcb3e197b14..e8861cb78070 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -40,17 +40,12 @@ struct rxrpc_crypt { struct rxrpc_connection; /* - * Mark applied to socket buffers. + * Mark applied to socket buffers in skb->mark. skb->priority is used + * to pass supplementary information. 
*/ enum rxrpc_skb_mark { - RXRPC_SKB_MARK_DATA, /* data message */ - RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ - RXRPC_SKB_MARK_BUSY, /* server busy message */ - RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ - RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ - RXRPC_SKB_MARK_NET_ERROR, /* network error message */ - RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ - RXRPC_SKB_MARK_NEW_CALL, /* local error message */ + RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ + RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9d1e298b784c..e88f131c1d7f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -353,7 +353,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [service]"); return NULL; @@ -364,7 +364,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, rx->sk.sk_state == RXRPC_CLOSE) { trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = RX_INVALID_OPERATION; _leave(" = NULL [close]"); call = NULL; @@ -373,7 +373,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, call = rxrpc_alloc_incoming_call(rx, local, conn, skb); if (!call) { - skb->mark = RXRPC_SKB_MARK_BUSY; + skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); call = NULL; goto out; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7f9ed3a60b9a..b0f12471f5e7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1354,7 +1354,7 @@ void rxrpc_data_ready(struct sock *udp_sk) protocol_error: skb->priority = RX_PROTOCOL_ERROR; post_abort: - skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8a4da3fe96df..e8fb8922bca8 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -524,7 +524,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) struct kvec iov[2]; size_t size; __be32 code; - int ret; + int ret, ioc; _enter("%d", local->debug_id); @@ -532,7 +532,6 @@ void rxrpc_reject_packets(struct rxrpc_local *local) iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); msg.msg_name = &srx.transport; msg.msg_control = NULL; @@ -540,17 +539,31 @@ void rxrpc_reject_packets(struct rxrpc_local *local) msg.msg_flags = 0; memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); + switch (skb->mark) { + case RXRPC_SKB_MARK_REJECT_BUSY: + whdr.type = RXRPC_PACKET_TYPE_BUSY; + size = sizeof(whdr); + ioc = 1; + break; + case RXRPC_SKB_MARK_REJECT_ABORT: + whdr.type = RXRPC_PACKET_TYPE_ABORT; + code = htonl(skb->priority); + size = sizeof(whdr) + sizeof(code); + ioc = 2; + break; + default: + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + continue; + } + if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) { msg.msg_namelen = 
srx.transport_len; - code = htonl(skb->priority); - whdr.epoch = htonl(sp->hdr.epoch); whdr.cid = htonl(sp->hdr.cid); whdr.callNumber = htonl(sp->hdr.callNumber); From 403fc2a138457f1071b186786a7589ef7382c8bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:08 +0100 Subject: [PATCH 299/499] rxrpc: Improve up-front incoming packet checking Do more up-front checking on incoming packets to weed out invalid ones and also ones aimed at services that we don't support. Whilst we're at it, replace the clearing of call and skew if we don't find a connection with just initialising the variables to zero at the top of the function. Signed-off-by: David Howells --- net/rxrpc/input.c | 63 +++++++++++++++++++++++++++++++++++--------- net/rxrpc/protocol.h | 15 ----------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b0f12471f5e7..a569e9e010d1 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1125,12 +1125,13 @@ void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; struct rxrpc_channel *chan; - struct rxrpc_call *call; + struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; + struct rxrpc_sock *rx; struct sk_buff *skb; unsigned int channel; - int ret, skew; + int ret, skew = 0; _enter("%p", udp_sk); @@ -1181,12 +1182,6 @@ void rxrpc_data_ready(struct sock *udp_sk) trace_rxrpc_rx_packet(sp); - if (sp->hdr.type >= RXRPC_N_PACKET_TYPES || - !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) { - _proto("Rx Bad Packet Type %u", sp->hdr.type); - goto bad_message; - } - switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) @@ -1198,24 +1193,63 @@ void rxrpc_data_ready(struct sock *udp_sk) if (rxrpc_to_server(sp)) goto discard; /* Fall through */ + case RXRPC_PACKET_TYPE_ACK: + case RXRPC_PACKET_TYPE_ACKALL: + if (sp->hdr.callNumber == 0) + goto bad_message; + /* Fall through */ + case RXRPC_PACKET_TYPE_ABORT: + break; case RXRPC_PACKET_TYPE_DATA: - if (sp->hdr.callNumber == 0) + if (sp->hdr.callNumber == 0 || + sp->hdr.seq == 0) goto bad_message; if (sp->hdr.flags & RXRPC_JUMBO_PACKET && !rxrpc_validate_jumbo(skb)) goto bad_message; break; + case RXRPC_PACKET_TYPE_CHALLENGE: + if (rxrpc_to_server(sp)) + goto discard; + break; + case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_to_client(sp)) + goto discard; + break; + /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: goto discard; + + default: + _proto("Rx Bad Packet Type %u", sp->hdr.type); + goto bad_message; } + if (sp->hdr.serviceId == 0) + goto bad_message; + rcu_read_lock(); + if (rxrpc_to_server(sp)) { + /* Weed out packets to services we're not offering. Packets + * that would begin a call are explicitly rejected and the rest + * are just discarded. 
+ */ + rx = rcu_dereference(local->service); + if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && + sp->hdr.serviceId != rx->second_service)) { + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.seq == 1) + goto unsupported_service; + goto discard_unlock; + } + } + conn = rxrpc_find_connection_rcu(local, skb); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) @@ -1297,14 +1331,10 @@ void rxrpc_data_ready(struct sock *udp_sk) if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) set_bit(RXRPC_CALL_RX_HEARD, &call->flags); } - } else { - skew = 0; - call = NULL; } if (!call || atomic_read(&call->usage) == 0) { if (rxrpc_to_client(sp) || - sp->hdr.callNumber == 0 || sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message_unlock; if (sp->hdr.seq != 1) @@ -1340,6 +1370,13 @@ void rxrpc_data_ready(struct sock *udp_sk) skb->priority = RXKADINCONSISTENCY; goto post_abort; +unsupported_service: + rcu_read_unlock(); + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->priority = RX_INVALID_OPERATION; + goto post_abort; + reupgrade: rcu_read_unlock(); trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 93da73bf7098..f9cb83c938f3 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -50,7 +50,6 @@ struct rxrpc_wire_header { #define RXRPC_PACKET_TYPE_10 10 /* Ignored */ #define RXRPC_PACKET_TYPE_11 11 /* Ignored */ #define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ -#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ uint8_t flags; /* packet flags */ #define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ @@ -72,20 +71,6 @@ struct rxrpc_wire_header { } __packed; -#define RXRPC_SUPPORTED_PACKET_TYPES ( \ - (1 << RXRPC_PACKET_TYPE_DATA) | \ - (1 << RXRPC_PACKET_TYPE_ACK) | \ - (1 << RXRPC_PACKET_TYPE_BUSY) | \ - (1 << RXRPC_PACKET_TYPE_ABORT) | \ - (1 << RXRPC_PACKET_TYPE_ACKALL) | \ - (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ - (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ - /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ - (1 << RXRPC_PACKET_TYPE_PARAMS) | \ - (1 << RXRPC_PACKET_TYPE_10) | \ - (1 << RXRPC_PACKET_TYPE_11) | \ - (1 << RXRPC_PACKET_TYPE_VERSION)) - /*****************************************************************************/ /* * jumbo packet secondary header From 0099dc589bfa7caf6f2608c4cbc1181cfee22b0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: [PATCH 300/499] rxrpc: Make service call handling more robust Make the following changes to improve the robustness of the code that sets up a new service call: (1) Cache the rxrpc_sock struct obtained in rxrpc_data_ready() to do a service ID check and pass that along to rxrpc_new_incoming_call(). This means that I can remove the check from rxrpc_new_incoming_call() without the need to worry about the socket attached to the local endpoint getting replaced - which would invalidate the check. (2) Cache the rxrpc_peer struct, thereby allowing the peer search to be done once. The peer is passed to rxrpc_new_incoming_call(), thereby saving the need to repeat the search. This also reduces the possibility of rxrpc_publish_service_conn() BUG()'ing due to the detection of a duplicate connection, despite the initial search done by rxrpc_find_connection_rcu() having turned up nothing. 
This BUG() shouldn't ever get hit since rxrpc_data_ready() *should* be non-reentrant and the result of the initial search should still hold true, but it has proven possible to hit. I *think* this may be due to __rxrpc_lookup_peer_rcu() cutting short the iteration over the hash table if it finds a matching peer with a zero usage count, but I don't know for sure since it's only ever been hit once that I know of. Another possibility is that a bug in rxrpc_data_ready() that checked the wrong byte in the header for the RXRPC_CLIENT_INITIATED flag might've let through a packet that caused a spurious and invalid call to be set up. That is addressed in another patch. (3) Fix __rxrpc_lookup_peer_rcu() to skip peer records that have a zero usage count rather than stopping and returning not found, just in case there's another peer record behind it in the bucket. (4) Don't search the peer records in rxrpc_alloc_incoming_call(), but rather either use the peer cached in (2) or, if one wasn't found, preemptively install a new one. Fixes: 8496af50eb38 ("rxrpc: Use RCU to access a peer's service connection tree") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 8 +++++--- net/rxrpc/call_accept.c | 41 ++++++++++++----------------------------- net/rxrpc/conn_object.c | 7 ++++++- net/rxrpc/input.c | 7 ++++--- net/rxrpc/peer_object.c | 35 +++++++++++------------------------ 5 files changed, 38 insertions(+), 60 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e8861cb78070..c72686193d83 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -722,6 +722,8 @@ extern struct workqueue_struct *rxrpc_workqueue; int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_sock *, + struct rxrpc_peer *, struct rxrpc_connection *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); @@ -913,7 +915,8 @@ extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, - struct sk_buff *); + struct sk_buff *, + struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); @@ -1049,8 +1052,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, - struct rxrpc_peer *); +void rxrpc_new_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *); void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e88f131c1d7f..9c7f26d06a52 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -249,11 +249,11 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) */ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_local *local, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; - struct rxrpc_peer *peer, *xpeer; struct rxrpc_call *call; unsigned short 
call_head, conn_head, peer_head; unsigned short call_tail, conn_tail, peer_tail; @@ -276,21 +276,18 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, return NULL; if (!conn) { - /* No connection. We're going to need a peer to start off - * with. If one doesn't yet exist, use a spare from the - * preallocation set. We dump the address into the spare in - * anticipation - and to save on stack space. - */ - xpeer = b->peer_backlog[peer_tail]; - if (rxrpc_extract_addr_from_skb(local, &xpeer->srx, skb) < 0) - return NULL; - - peer = rxrpc_lookup_incoming_peer(local, xpeer); - if (peer == xpeer) { + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(local, &peer->srx, skb) < 0) + return NULL; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + + rxrpc_new_incoming_peer(local, peer); } /* Now allocate and set up the connection */ @@ -335,30 +332,16 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * The call is returned with the user access mutex held. */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_sock *rx, + struct rxrpc_peer *peer, struct rxrpc_connection *conn, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_sock *rx; struct rxrpc_call *call; - u16 service_id = sp->hdr.serviceId; _enter(""); - /* Get the socket providing the service */ - rx = rcu_dereference(local->service); - if (rx && (service_id == rx->srx.srx_service || - service_id == rx->second_service)) - goto found_service; - - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; - skb->priority = RX_INVALID_OPERATION; - _leave(" = NULL [service]"); - return NULL; - -found_service: spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { @@ -371,7 +354,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, goto out; } - call = rxrpc_alloc_incoming_call(rx, local, conn, skb); + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; _leave(" = NULL [busy]"); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 390ba50cfab4..b4438f98dc5c 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -69,10 +69,14 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) * If successful, a pointer to the connection is returned, but no ref is taken. * NULL is returned if there is no match. * + * When searching for a service call, if we find a peer but no connection, we + * return that through *_peer in case we need to create a new service call. + * * The caller must be holding the RCU read lock. 
*/ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, - struct sk_buff *skb) + struct sk_buff *skb, + struct rxrpc_peer **_peer) { struct rxrpc_connection *conn; struct rxrpc_conn_proto k; @@ -104,6 +108,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, peer = rxrpc_lookup_peer_rcu(local, &srx); if (!peer) goto not_found; + *_peer = peer; conn = rxrpc_find_service_conn_rcu(peer, skb); if (!conn || atomic_read(&conn->usage) == 0) goto not_found; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index a569e9e010d1..800f5b8a1baa 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1128,7 +1128,8 @@ void rxrpc_data_ready(struct sock *udp_sk) struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_local *local = udp_sk->sk_user_data; - struct rxrpc_sock *rx; + struct rxrpc_peer *peer = NULL; + struct rxrpc_sock *rx = NULL; struct sk_buff *skb; unsigned int channel; int ret, skew = 0; @@ -1250,7 +1251,7 @@ void rxrpc_data_ready(struct sock *udp_sk) } } - conn = rxrpc_find_connection_rcu(local, skb); + conn = rxrpc_find_connection_rcu(local, skb, &peer); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) goto wrong_security; @@ -1339,7 +1340,7 @@ void rxrpc_data_ready(struct sock *udp_sk) goto bad_message_unlock; if (sp->hdr.seq != 1) goto discard_unlock; - call = rxrpc_new_incoming_call(local, conn, skb); + call = rxrpc_new_incoming_call(local, rx, peer, conn, skb); if (!call) { rcu_read_unlock(); goto reject_packet; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 1dc7648e3eff..70083e8fb6e5 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -124,11 +124,9 @@ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { - if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) { - if (atomic_read(&peer->usage) == 0) - return NULL; + if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && + atomic_read(&peer->usage) > 0) return peer; - } } return NULL; @@ -299,34 +297,23 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, } /* - * Set up a new incoming peer. The address is prestored in the preallocated - * peer. + * Set up a new incoming peer. There shouldn't be any other matching peers + * since we've already done a search in the list from the non-reentrant context + * (the data_ready handler) that is the only place we can add new peers. 
*/ -struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, - struct rxrpc_peer *prealloc) +void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { - struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; - hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); - prealloc->local = local; - rxrpc_init_peer(prealloc, hash_key); + hash_key = rxrpc_peer_hash_key(local, &peer->srx); + peer->local = local; + rxrpc_init_peer(peer, hash_key); spin_lock(&rxnet->peer_hash_lock); - - /* Need to check that we aren't racing with someone else */ - peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - peer = prealloc; - hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); - list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - } - + hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); + list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); - return peer; } /* From 37a675e768d7606fe8a53e0c459c9b53e121ac20 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: [PATCH 301/499] rxrpc: Fix transport sockopts to get IPv4 errors on an IPv6 socket It seems that enabling IPV6_RECVERR on an IPv6 socket doesn't also turn on IP_RECVERR, so neither local errors nor ICMP-transported remote errors from IPv4 peer addresses are returned to the AF_RXRPC protocol. Make the sockopt setting code in rxrpc_open_socket() fall through from the AF_INET6 case to the AF_INET case to turn on all the AF_INET options too in the AF_INET6 case. Fixes: f2aeed3a591f ("rxrpc: Fix error reception on AF_INET6 sockets") Signed-off-by: David Howells --- net/rxrpc/local_object.c | 43 +++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 81de7d889ffa..94d234e9c685 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -135,6 +135,29 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } switch (local->srx.transport.family) { + case AF_INET6: + /* we want to receive ICMPv6 errors */ + opt = 1; + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* we want to set the don't fragment bit */ + opt = IPV6_PMTUDISC_DO; + ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, + (char *) &opt, sizeof(opt)); + if (ret < 0) { + _debug("setsockopt failed"); + goto error; + } + + /* Fall through and set IPv4 options too otherwise we don't get + * errors from IPv4 packets sent through the IPv6 socket. + */ + case AF_INET: /* we want to receive ICMP errors */ opt = 1; @@ -153,26 +176,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) _debug("setsockopt failed"); goto error; } - break; - - case AF_INET6: - /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } - - /* we want to set the don't fragment bit */ - opt = IPV6_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } /* We want receive timestamps. 
*/ opt = 1; From f334430316e7fd37c4821ebec627e27714bb5d76 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Sep 2018 15:13:09 +0100 Subject: [PATCH 302/499] rxrpc: Fix error distribution Fix error distribution by immediately delivering the errors to all the affected calls rather than deferring them to a worker thread. The problem with the latter is that retries and things can happen in the meantime when we want to stop that sooner. To this end: (1) Stop the error distributor from removing calls from the error_targets list so that peer->lock isn't needed to synchronise against other adds and removals. (2) Require the peer's error_targets list to be accessed with RCU, thereby avoiding the need to take peer->lock over distribution. (3) Don't attempt to affect a call's state if it is already marked complete. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 4 +--- net/rxrpc/ar-internal.h | 5 ---- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 4 ++-- net/rxrpc/conn_object.c | 2 +- net/rxrpc/peer_event.c | 46 +++++++++--------------------------- net/rxrpc/peer_object.c | 17 ------------- 7 files changed, 16 insertions(+), 64 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 196587b8f204..837393fa897b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -56,7 +56,6 @@ enum rxrpc_peer_trace { rxrpc_peer_new, rxrpc_peer_processing, rxrpc_peer_put, - rxrpc_peer_queued_error, }; enum rxrpc_conn_trace { @@ -257,8 +256,7 @@ enum rxrpc_tx_point { EM(rxrpc_peer_got, "GOT") \ EM(rxrpc_peer_new, "NEW") \ EM(rxrpc_peer_processing, "PRO") \ - EM(rxrpc_peer_put, "PUT") \ - E_(rxrpc_peer_queued_error, "QER") + E_(rxrpc_peer_put, "PUT") #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c72686193d83..ef9554131434 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -288,7 +288,6 @@ struct rxrpc_peer { struct hlist_node hash_link; struct rxrpc_local *local; struct hlist_head error_targets; /* targets for net error distribution */ - struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ time64_t last_tx_at; /* Last time packet sent here */ @@ -299,8 +298,6 @@ struct rxrpc_peer { unsigned int maxdata; /* data size (MTU - hdrsize) */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ - int error_report; /* Net (+0) or local (+1000000) to distribute */ -#define RXRPC_LOCAL_ERROR_OFFSET 1000000 struct sockaddr_rxrpc srx; /* remote address */ /* calculated RTT cache */ @@ -1039,7 +1036,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); * peer_event.c */ void rxrpc_error_report(struct sock *); -void rxrpc_peer_error_distributor(struct work_struct *); void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); void rxrpc_peer_keepalive_worker(struct work_struct *); @@ -1057,7 +1053,6 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); void rxrpc_put_peer(struct rxrpc_peer *); -void __rxrpc_queue_peer_error(struct rxrpc_peer *); /* * proc.c diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9486293fef5c..799f75b6900d 100644 --- a/net/rxrpc/call_object.c +++ 
b/net/rxrpc/call_object.c @@ -400,7 +400,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(conn->channels[chan].call, call); spin_lock(&conn->params.peer->lock); - hlist_add_head(&call->error_link, &conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f8f37188a932..8acf74fe24c0 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -710,8 +710,8 @@ int rxrpc_connect_call(struct rxrpc_call *call, } spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, - &call->conn->params.peer->error_targets); + hlist_add_head_rcu(&call->error_link, + &call->conn->params.peer->error_targets); spin_unlock_bh(&call->conn->params.peer->lock); out: diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index b4438f98dc5c..885dae829f4a 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -216,7 +216,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_cwnd = call->cong_cwnd; spin_lock_bh(&conn->params.peer->lock); - hlist_del_init(&call->error_link); + hlist_del_rcu(&call->error_link); spin_unlock_bh(&conn->params.peer->lock); if (rxrpc_is_client_call(call)) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 4f9da2f51c69..f3e6fc670da2 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -23,6 +23,8 @@ #include "ar-internal.h" static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); +static void rxrpc_distribute_error(struct rxrpc_peer *, int, + enum rxrpc_call_completion); /* * Find the peer associated with an ICMP packet. @@ -194,8 +196,6 @@ void rxrpc_error_report(struct sock *sk) rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); - /* The ref we obtained is passed off to the work item */ - __rxrpc_queue_peer_error(peer); _leave(""); } @@ -205,6 +205,7 @@ void rxrpc_error_report(struct sock *sk) static void rxrpc_store_error(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) { + enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR; struct sock_extended_err *ee; int err; @@ -255,7 +256,7 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: _proto("Rx Received local error { error=%d }", err); - err += RXRPC_LOCAL_ERROR_OFFSET; + compl = RXRPC_CALL_LOCAL_ERROR; break; case SO_EE_ORIGIN_ICMP6: @@ -264,48 +265,23 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; } - peer->error_report = err; + rxrpc_distribute_error(peer, err, compl); } /* - * Distribute an error that occurred on a peer + * Distribute an error that occurred on a peer. 
*/ -void rxrpc_peer_error_distributor(struct work_struct *work) +static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, + enum rxrpc_call_completion compl) { - struct rxrpc_peer *peer = - container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; - enum rxrpc_call_completion compl; - int error; - _enter(""); - - error = READ_ONCE(peer->error_report); - if (error < RXRPC_LOCAL_ERROR_OFFSET) { - compl = RXRPC_CALL_NETWORK_ERROR; - } else { - compl = RXRPC_CALL_LOCAL_ERROR; - error -= RXRPC_LOCAL_ERROR_OFFSET; - } - - _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error); - - spin_lock_bh(&peer->lock); - - while (!hlist_empty(&peer->error_targets)) { - call = hlist_entry(peer->error_targets.first, - struct rxrpc_call, error_link); - hlist_del_init(&call->error_link); + hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); - - if (rxrpc_set_call_completion(call, compl, 0, -error)) + if (call->state < RXRPC_CALL_COMPLETE && + rxrpc_set_call_completion(call, compl, 0, -error)) rxrpc_notify_socket(call); } - - spin_unlock_bh(&peer->lock); - - rxrpc_put_peer(peer); - _leave(""); } /* diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 70083e8fb6e5..01a9febfa367 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -220,8 +220,6 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) atomic_set(&peer->usage, 1); peer->local = local; INIT_HLIST_HEAD(&peer->error_targets); - INIT_WORK(&peer->error_distributor, - &rxrpc_peer_error_distributor); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); @@ -402,21 +400,6 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) return peer; } -/* - * Queue a peer record. This passes the caller's ref to the workqueue. - */ -void __rxrpc_queue_peer_error(struct rxrpc_peer *peer) -{ - const void *here = __builtin_return_address(0); - int n; - - n = atomic_read(&peer->usage); - if (rxrpc_queue_work(&peer->error_distributor)) - trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here); - else - rxrpc_put_peer(peer); -} - /* * Discard a peer record. */ From a13f814a67b12a2f29d1decf4b4f4e700658a517 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 30 Aug 2018 17:56:52 +0900 Subject: [PATCH 303/499] netfilter: nft_set_rbtree: add missing rb_erase() in GC routine The nft_set_gc_batch_check() checks whether gc buffer is full. If gc buffer is full, gc buffer is released by the nft_set_gc_batch_complete() internally. In case of rbtree, the rb_erase() should be called before calling the nft_set_gc_batch_complete(). therefore the rb_erase() should be called before calling the nft_set_gc_batch_check() too. test commands: table ip filter { set set1 { type ipv4_addr; flags interval, timeout; gc-interval 10s; timeout 1s; elements = { 1-2, 3-4, 5-6, ... 
10000-10001, } } } %nft -f test.nft splat looks like: [ 430.273885] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 430.282158] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 430.283116] CPU: 1 PID: 190 Comm: kworker/1:2 Tainted: G B 4.18.0+ #7 [ 430.283116] Workqueue: events_power_efficient nft_rbtree_gc [nf_tables_set] [ 430.313559] RIP: 0010:rb_next+0x81/0x130 [ 430.313559] Code: 08 49 bd 00 00 00 00 00 fc ff df 48 bb 00 00 00 00 00 fc ff df 48 85 c0 75 05 eb 58 48 89 d4 [ 430.313559] RSP: 0018:ffff88010cdb7680 EFLAGS: 00010207 [ 430.313559] RAX: 0000000000b84854 RBX: dffffc0000000000 RCX: ffffffff83f01973 [ 430.313559] RDX: 000000000017090c RSI: 0000000000000008 RDI: 0000000000b84864 [ 430.313559] RBP: ffff8801060d4588 R08: fffffbfff09bc349 R09: fffffbfff09bc349 [ 430.313559] R10: 0000000000000001 R11: fffffbfff09bc348 R12: ffff880100f081a8 [ 430.313559] R13: dffffc0000000000 R14: ffff880100ff8688 R15: dffffc0000000000 [ 430.313559] FS: 0000000000000000(0000) GS:ffff88011b400000(0000) knlGS:0000000000000000 [ 430.313559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 430.313559] CR2: 0000000001551008 CR3: 000000005dc16000 CR4: 00000000001006e0 [ 430.313559] Call Trace: [ 430.313559] nft_rbtree_gc+0x112/0x5c0 [nf_tables_set] [ 430.313559] process_one_work+0xc13/0x1ec0 [ 430.313559] ? _raw_spin_unlock_irq+0x29/0x40 [ 430.313559] ? pwq_dec_nr_in_flight+0x3c0/0x3c0 [ 430.313559] ? set_load_weight+0x270/0x270 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __switch_to_asm+0x40/0x70 [ 430.313559] ? __switch_to_asm+0x34/0x70 [ 430.313559] ? __schedule+0x6d3/0x1f50 [ 430.313559] ? find_held_lock+0x39/0x1c0 [ 430.313559] ? __sched_text_start+0x8/0x8 [ 430.313559] ? cyc2ns_read_end+0x10/0x10 [ 430.313559] ? save_trace+0x300/0x300 [ 430.313559] ? sched_clock_local+0xd4/0x140 [ 430.313559] ? find_held_lock+0x39/0x1c0 [ 430.313559] ? worker_thread+0x353/0x1120 [ 430.313559] ? worker_thread+0x353/0x1120 [ 430.313559] ? lock_contended+0xe70/0xe70 [ 430.313559] ? __lock_acquire+0x4500/0x4500 [ 430.535635] ? do_raw_spin_unlock+0xa5/0x330 [ 430.535635] ? do_raw_spin_trylock+0x101/0x1a0 [ 430.535635] ? do_raw_spin_lock+0x1f0/0x1f0 [ 430.535635] ? _raw_spin_lock_irq+0x10/0x70 [ 430.535635] worker_thread+0x15d/0x1120 [ ... 
] Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 55e2d9215c0d..0e5ec126f6ad 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -355,12 +355,11 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, static void nft_rbtree_gc(struct work_struct *work) { + struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; struct nft_set_gc_batch *gcb = NULL; - struct rb_node *node, *prev = NULL; - struct nft_rbtree_elem *rbe; struct nft_rbtree *priv; + struct rb_node *node; struct nft_set *set; - int i; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); @@ -371,7 +370,7 @@ static void nft_rbtree_gc(struct work_struct *work) rbe = rb_entry(node, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe)) { - prev = node; + rbe_end = rbe; continue; } if (!nft_set_elem_expired(&rbe->ext)) @@ -379,29 +378,30 @@ static void nft_rbtree_gc(struct work_struct *work) if (nft_set_elem_mark_busy(&rbe->ext)) continue; + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + rbe_prev = NULL; + } gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (!gcb) break; atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); + rbe_prev = rbe; - if (prev) { - rbe = rb_entry(prev, struct nft_rbtree_elem, node); + if (rbe_end) { atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - prev = NULL; + nft_set_gc_batch_add(gcb, rbe_end); + rb_erase(&rbe_end->node, &priv->root); + rbe_end = NULL; } node = rb_next(node); if (!node) break; } - if (gcb) { - for (i = 0; i < gcb->head.cnt; i++) { - rbe = gcb->elems[i]; - rb_erase(&rbe->node, &priv->root); - } - } + if (rbe_prev) + rb_erase(&rbe_prev->node, &priv->root); write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); From 421c119f558761556afca6a62ad183bc2d8659e0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 24 Sep 2018 14:10:04 +0200 Subject: [PATCH 304/499] netfilter: avoid erronous array bounds warning Unfortunately some versions of gcc emit following warning: $ make net/xfrm/xfrm_output.o linux/compiler.h:252:20: warning: array subscript is above array bounds [-Warray-bounds] hook_head = rcu_dereference(net->nf.hooks_arp[hook]); ^~~~~~~~~~~~~~~~~~~~~ xfrm_output_resume passes skb_dst(skb)->ops->family as its 'pf' arg so compiler can't know that we'll never access hooks_arp[]. (NFPROTO_IPV4 or NFPROTO_IPV6 are only possible cases). Avoid this by adding an explicit WARN_ON_ONCE() check. This patch has no effect if the family is a compile-time constant as gcc will remove the switch() construct entirely. 
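[Illustration only, not part of this patch: a minimal standalone C sketch of the pattern being discussed. Names and array sizes are invented stand-ins for net->nf.hooks_arp and NFPROTO_ARP; the point is that an explicit bound check quiets -Warray-bounds when the protocol family is a runtime value, while a compile-time-constant family lets the compiler fold the whole switch away, check included.]

#include <stdio.h>

#define NF_MAX_HOOKS 3                      /* stand-in for the hooks_arp[] length */

static int hooks_arp[NF_MAX_HOOKS];

static inline int nf_hook_sketch(int pf, unsigned int hook)
{
        int head = 0;

        switch (pf) {
        case 3:                             /* plays the role of NFPROTO_ARP */
                /* Mirrors the added WARN_ON_ONCE(): with a runtime pf the
                 * compiler cannot prove hook < NF_MAX_HOOKS, so the check
                 * documents the invariant and silences the warning.  With a
                 * constant pf != 3 this case is dead code and the check is
                 * removed entirely. */
                if (hook >= NF_MAX_HOOKS)
                        break;
                head = hooks_arp[hook];
                break;
        default:
                break;
        }
        return head;
}

int main(void)
{
        printf("%d\n", nf_hook_sketch(3, 1));
        return 0;
}
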
Reported-by: David Ahern Signed-off-by: Florian Westphal Reviewed-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 07efffd0c759..bbe99d2b28b4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -215,6 +215,8 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP + if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) + break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; From 40e4f26e6a14fc1496eabb8b0004a547303114e6 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Thu, 27 Sep 2018 19:36:28 -0300 Subject: [PATCH 305/499] netfilter: xt_socket: check sk before checking for netns. Only check for the network namespace if the socket is available. Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 0472f3472842..ada144e5645b 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) From 4288ea006c73e37c2a4f60dfaef20dd167b8df31 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:33:21 +0100 Subject: [PATCH 306/499] bpf: harden flags check in cgroup_storage_update_elem() cgroup_storage_update_elem() shouldn't accept any flags argument values except BPF_ANY and BPF_EXIST to guarantee the backward compatibility, had a new flag value been added. Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Roman Gushchin Reported-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..94126cbffc88 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, From 15c206887603a452f13fbfde2db0f8830d37028c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 28 Sep 2018 09:40:17 -0600 Subject: [PATCH 307/499] Revert "xen/blkfront: When purging persistent grants, keep them in the buffer" Fix didn't work for all cases, reverting to add a (hopefully) better fix. This reverts commit f151ba989d149bbdfc90e5405724bbea094f9b17. 
Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 3b441fe69c0d..a71d817e900d 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2667,9 +2667,11 @@ static void purge_persistent_grants(struct blkfront_info *info) gnttab_query_foreign_access(gnt_list_entry->gref)) continue; + list_del(&gnt_list_entry->node); gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); - gnt_list_entry->gref = GRANT_INVALID_REF; rinfo->persistent_gnts_c--; + __free_page(gnt_list_entry->page); + kfree(gnt_list_entry); } spin_unlock_irqrestore(&rinfo->ring_lock, flags); From 6c7678674014b4552caf0e5aa0ca34078a377482 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 28 Sep 2018 09:28:27 +0200 Subject: [PATCH 308/499] xen/blkfront: correct purging of persistent grants Commit a46b53672b2c2e3770b38a4abf90d16364d2584b ("xen/blkfront: cleanup stale persistent grants") introduced a regression as purged persistent grants were not pu into the list of free grants again. Correct that. Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a71d817e900d..429d20131c7e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2670,8 +2670,8 @@ static void purge_persistent_grants(struct blkfront_info *info) list_del(&gnt_list_entry->node); gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; - __free_page(gnt_list_entry->page); - kfree(gnt_list_entry); + gnt_list_entry->gref = GRANT_INVALID_REF; + list_add_tail(&gnt_list_entry->node, &rinfo->grants); } spin_unlock_irqrestore(&rinfo->ring_lock, flags); From 1b09d9c232cdaea59fb50ac437d3921ed1f1eafb Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 28 Sep 2018 14:20:40 +0200 Subject: [PATCH 309/499] mmc: slot-gpio: Fix debounce time to use miliseconds again The debounce value passed to mmc_gpiod_request_cd() function is in microseconds, but msecs_to_jiffies() requires the value to be in miliseconds to properly calculate the delay, so adjust the value stored in cd_debounce_delay_ms context entry. Fixes: 1d71926bbd59 ("mmc: core: Fix debounce time to use microseconds") Fixes: bfd694d5e21c ("mmc: core: Add tunable delay before detecting card after card is inserted") Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Marek Szyprowski Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 2a833686784b..86803a3a04dc 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -271,7 +271,7 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, if (debounce) { ret = gpiod_set_debounce(desc, debounce); if (ret < 0) - ctx->cd_debounce_delay_ms = debounce; + ctx->cd_debounce_delay_ms = debounce / 1000; } if (gpio_invert) From 065a2cdcbdf8eb9aefb66e1a24b2d684b8b8852b Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 26 Sep 2018 18:07:09 +0200 Subject: [PATCH 310/499] s390: qeth_core_mpc: Use ARRAY_SIZE instead of reimplementing its function Use the common code ARRAY_SIZE macro instead of a private implementation. 
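[Illustration only, not part of this patch: a small userspace sketch of why ARRAY_SIZE() is preferable to the open-coded sizeof division it replaces. The macro below is the simplified form; the kernel's version in include/linux/kernel.h additionally uses __must_be_array() to reject pointers, and the struct/array names here are invented.]

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))   /* simplified kernel form */

struct rc_msg {
        int rc;
        const char *msg;
};

static const struct rc_msg msgs[] = {
        { 0, "success" },
        { 1, "not supported" },
        { 2, "table full" },
};

int main(void)
{
        /* The open-coded form repeats the element type; if the element type
         * ever changes it silently computes the wrong count.  ARRAY_SIZE()
         * derives the count from the array itself. */
        size_t open_coded = sizeof(msgs) / sizeof(struct rc_msg);
        size_t with_macro = ARRAY_SIZE(msgs);

        printf("%zu %zu\n", open_coded, with_macro);        /* prints "3 3" */
        return 0;
}
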
Reviewed-by: Jean Delvare Signed-off-by: zhong jiang Signed-off-by: Martin Schwidefsky Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_mpc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/s390/net/qeth_core_mpc.c b/drivers/s390/net/qeth_core_mpc.c index 5bcb8dafc3ee..e8263ded0af0 100644 --- a/drivers/s390/net/qeth_core_mpc.c +++ b/drivers/s390/net/qeth_core_mpc.c @@ -222,8 +222,7 @@ static struct ipa_rc_msg qeth_ipa_rc_msg[] = { char *qeth_get_ipa_msg(enum qeth_ipa_return_codes rc) { int x = 0; - qeth_ipa_rc_msg[sizeof(qeth_ipa_rc_msg) / - sizeof(struct ipa_rc_msg) - 1].rc = rc; + qeth_ipa_rc_msg[ARRAY_SIZE(qeth_ipa_rc_msg) - 1].rc = rc; while (qeth_ipa_rc_msg[x].rc != rc) x++; return qeth_ipa_rc_msg[x].msg; @@ -270,9 +269,7 @@ static struct ipa_cmd_names qeth_ipa_cmd_names[] = { char *qeth_get_ipa_cmd_name(enum qeth_ipa_cmds cmd) { int x = 0; - qeth_ipa_cmd_names[ - sizeof(qeth_ipa_cmd_names) / - sizeof(struct ipa_cmd_names)-1].cmd = cmd; + qeth_ipa_cmd_names[ARRAY_SIZE(qeth_ipa_cmd_names) - 1].cmd = cmd; while (qeth_ipa_cmd_names[x].cmd != cmd) x++; return qeth_ipa_cmd_names[x].name; From 048a7f8b4ec085d5c56ad4a3bf450389a4aed5f9 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 26 Sep 2018 18:07:10 +0200 Subject: [PATCH 311/499] s390: qeth: Fix potential array overrun in cmd/rc lookup Functions qeth_get_ipa_msg and qeth_get_ipa_cmd_name are modifying the last member of global arrays without any locking that I can see. If two instances of either function are running at the same time, it could cause a race ultimately leading to an array overrun (the contents of the last entry of the array is the only guarantee that the loop will ever stop). Performing the lookups without modifying the arrays is admittedly slower (two comparisons per iteration instead of one) but these are operations which are rare (should only be needed in error cases or when debugging, not during successful operation) and it seems still less costly than introducing a mutex to protect the arrays in question. As a side bonus, it allows us to declare both arrays as const data. Signed-off-by: Jean Delvare Cc: Julian Wiedmann Cc: Ursula Braun Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Julian Wiedmann Signed-off-by: David S. 
Miller --- drivers/s390/net/qeth_core_main.c | 2 +- drivers/s390/net/qeth_core_mpc.c | 30 ++++++++++++++++-------------- drivers/s390/net/qeth_core_mpc.h | 4 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index de8282420f96..ffce6f39828a 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -610,7 +610,7 @@ static void qeth_put_reply(struct qeth_reply *reply) static void qeth_issue_ipa_msg(struct qeth_ipa_cmd *cmd, int rc, struct qeth_card *card) { - char *ipa_name; + const char *ipa_name; int com = cmd->hdr.command; ipa_name = qeth_get_ipa_cmd_name(com); if (rc) diff --git a/drivers/s390/net/qeth_core_mpc.c b/drivers/s390/net/qeth_core_mpc.c index e8263ded0af0..e891c0b52f4c 100644 --- a/drivers/s390/net/qeth_core_mpc.c +++ b/drivers/s390/net/qeth_core_mpc.c @@ -148,10 +148,10 @@ EXPORT_SYMBOL_GPL(IPA_PDU_HEADER); struct ipa_rc_msg { enum qeth_ipa_return_codes rc; - char *msg; + const char *msg; }; -static struct ipa_rc_msg qeth_ipa_rc_msg[] = { +static const struct ipa_rc_msg qeth_ipa_rc_msg[] = { {IPA_RC_SUCCESS, "success"}, {IPA_RC_NOTSUPP, "Command not supported"}, {IPA_RC_IP_TABLE_FULL, "Add Addr IP Table Full - ipv6"}, @@ -219,22 +219,23 @@ static struct ipa_rc_msg qeth_ipa_rc_msg[] = { -char *qeth_get_ipa_msg(enum qeth_ipa_return_codes rc) +const char *qeth_get_ipa_msg(enum qeth_ipa_return_codes rc) { - int x = 0; - qeth_ipa_rc_msg[ARRAY_SIZE(qeth_ipa_rc_msg) - 1].rc = rc; - while (qeth_ipa_rc_msg[x].rc != rc) - x++; + int x; + + for (x = 0; x < ARRAY_SIZE(qeth_ipa_rc_msg) - 1; x++) + if (qeth_ipa_rc_msg[x].rc == rc) + return qeth_ipa_rc_msg[x].msg; return qeth_ipa_rc_msg[x].msg; } struct ipa_cmd_names { enum qeth_ipa_cmds cmd; - char *name; + const char *name; }; -static struct ipa_cmd_names qeth_ipa_cmd_names[] = { +static const struct ipa_cmd_names qeth_ipa_cmd_names[] = { {IPA_CMD_STARTLAN, "startlan"}, {IPA_CMD_STOPLAN, "stoplan"}, {IPA_CMD_SETVMAC, "setvmac"}, @@ -266,11 +267,12 @@ static struct ipa_cmd_names qeth_ipa_cmd_names[] = { {IPA_CMD_UNKNOWN, "unknown"}, }; -char *qeth_get_ipa_cmd_name(enum qeth_ipa_cmds cmd) +const char *qeth_get_ipa_cmd_name(enum qeth_ipa_cmds cmd) { - int x = 0; - qeth_ipa_cmd_names[ARRAY_SIZE(qeth_ipa_cmd_names) - 1].cmd = cmd; - while (qeth_ipa_cmd_names[x].cmd != cmd) - x++; + int x; + + for (x = 0; x < ARRAY_SIZE(qeth_ipa_cmd_names) - 1; x++) + if (qeth_ipa_cmd_names[x].cmd == cmd) + return qeth_ipa_cmd_names[x].name; return qeth_ipa_cmd_names[x].name; } diff --git a/drivers/s390/net/qeth_core_mpc.h b/drivers/s390/net/qeth_core_mpc.h index aa8b9196b089..aa5de1fe01e1 100644 --- a/drivers/s390/net/qeth_core_mpc.h +++ b/drivers/s390/net/qeth_core_mpc.h @@ -797,8 +797,8 @@ enum qeth_ipa_arp_return_codes { QETH_IPA_ARP_RC_Q_NO_DATA = 0x0008, }; -extern char *qeth_get_ipa_msg(enum qeth_ipa_return_codes rc); -extern char *qeth_get_ipa_cmd_name(enum qeth_ipa_cmds cmd); +extern const char *qeth_get_ipa_msg(enum qeth_ipa_return_codes rc); +extern const char *qeth_get_ipa_cmd_name(enum qeth_ipa_cmds cmd); #define QETH_SETASS_BASE_LEN (sizeof(struct qeth_ipacmd_hdr) + \ sizeof(struct qeth_ipacmd_setassparms_hdr)) From ce7d17d6c607aa0d898f4712e75e27d319816b3b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 27 Sep 2018 10:47:01 +0200 Subject: [PATCH 312/499] MAINTAINERS: change bridge maintainers I haven't been doing reviews only but not active development on bridge code for several years. 
Roopa and Nikolay have been doing most of the new features and have agreed to take over as new co-maintainers. Signed-off-by: Stephen Hemminger Acked-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 02a39617ec82..b0ca9e214ef0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5470,7 +5470,8 @@ S: Odd Fixes F: drivers/net/ethernet/agere/ ETHERNET BRIDGE -M: Stephen Hemminger +M: Roopa Prabhu +M: Nikolay Aleksandrov L: bridge@lists.linux-foundation.org (moderated for non-subscribers) L: netdev@vger.kernel.org W: http://www.linuxfoundation.org/en/Net:Bridge From cb973127a793c5ade8102aa4ab7bb5e4b1e64190 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 26 Sep 2018 21:57:03 -0700 Subject: [PATCH 313/499] Update maintainers for bnx2/bnx2x/qlge/qlcnic drivers. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ameen Rahman Signed-off-by: David S. Miller --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index b0ca9e214ef0..dcb0191c4f54 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2956,7 +2956,6 @@ F: include/linux/bcm963xx_tag.h BROADCOM BNX2 GIGABIT ETHERNET DRIVER M: Rasesh Mody -M: Harish Patil M: Dept-GELinuxNICDev@cavium.com L: netdev@vger.kernel.org S: Supported @@ -2977,6 +2976,7 @@ F: drivers/scsi/bnx2i/ BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER M: Ariel Elior +M: Sudarsana Kalluru M: everest-linux-l2@cavium.com L: netdev@vger.kernel.org S: Supported @@ -11974,7 +11974,7 @@ F: Documentation/scsi/LICENSE.qla4xxx F: drivers/scsi/qla4xxx/ QLOGIC QLCNIC (1/10)Gb ETHERNET DRIVER -M: Harish Patil +M: Shahed Shaikh M: Manish Chopra M: Dept-GELinuxNICDev@cavium.com L: netdev@vger.kernel.org @@ -11982,7 +11982,6 @@ S: Supported F: drivers/net/ethernet/qlogic/qlcnic/ QLOGIC QLGE 10Gb ETHERNET DRIVER -M: Harish Patil M: Manish Chopra M: Dept-GELinuxNICDev@cavium.com L: netdev@vger.kernel.org From 5f672090e44f4951084c5e1d6b0668a5fc422af8 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Thu, 27 Sep 2018 04:12:10 -0700 Subject: [PATCH 314/499] qed: Fix shmem structure inconsistency between driver and the mfw. The structure shared between driver and the management FW (mfw) differ in sizes. This would lead to issues when driver try to access the structure members which are not-aligned with the mfw copy e.g., data_ptr usage in the case of mfw_tlv request. Align the driver structure with mfw copy, add reserved field(s) to driver structure for the members not used by the driver. Fixes: dd006921d67f ("qed: Add MFW interfaces for TLV request support.) 
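[Illustration only, not part of this patch: a hedged sketch of the failure mode. Field names and sizes are invented and much simpler than the real public_global layout; the point is that when the driver's copy of a firmware-shared structure omits a member, every later member is read from the wrong offset, which is what the reserved field added above prevents.]

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Layout as the management FW defines it (simplified). */
struct shmem_fw_view {
        uint32_t mdump_reason;
        uint64_t reserved;      /* member the driver never uses */
        uint32_t data_ptr;
        uint32_t data_size;
};

/* Driver view before the fix: the unused member is simply missing, so
 * data_ptr and data_size no longer line up with the firmware's layout. */
struct shmem_drv_view {
        uint32_t mdump_reason;
        uint32_t data_ptr;
        uint32_t data_size;
};

int main(void)
{
        printf("fw     data_ptr offset: %zu\n",
               offsetof(struct shmem_fw_view, data_ptr));
        printf("driver data_ptr offset: %zu\n",
               offsetof(struct shmem_drv_view, data_ptr));
        /* The two offsets differ, so the driver would read garbage for
         * data_ptr; adding a same-sized reserved member restores alignment. */
        return 0;
}
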
Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Michal Kalderon --- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 9b3ef00e5782..a71382687ef2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -11987,6 +11987,7 @@ struct public_global { u32 running_bundle_id; s32 external_temperature; u32 mdump_reason; + u64 reserved; u32 data_ptr; u32 data_size; }; From c24498c6827b71f80fecc9fb1b70a792053d41a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:51 -0700 Subject: [PATCH 315/499] netpoll: do not test NAPI_STATE_SCHED in poll_one_napi() Since we do no longer require NAPI drivers to provide an ndo_poll_controller(), napi_schedule() has not been done before poll_one_napi() invocation. So testing NAPI_STATE_SCHED is likely to cause early returns. While we are at it, remove outdated comment. Note to future bisections : This change might surface prior bugs in drivers. See commit 73f21c653f93 ("bnxt_en: Fix TX timeout during netpoll.") for one occurrence. Fixes: ac3d9dd034e5 ("netpoll: make ndo_poll_controller() optional") Signed-off-by: Eric Dumazet Tested-by: Song Liu Cc: Michael Chan Signed-off-by: David S. Miller --- net/core/netpoll.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3219a2932463..3ae899805f8b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -135,27 +135,9 @@ static void queue_process(struct work_struct *work) } } -/* - * Check whether delayed processing was scheduled for our NIC. If so, - * we attempt to grab the poll lock and use ->poll() to pump the card. - * If this fails, either we've recursed in ->poll() or it's already - * running on another CPU. - * - * Note: we don't mask interrupts with this lock because we're using - * trylock here and interrupts are already disabled in the softirq - * case. Further, we test the poll_owner to avoid recursion on UP - * systems where the lock doesn't exist. - */ static void poll_one_napi(struct napi_struct *napi) { - int work = 0; - - /* net_rx_action's ->poll() invocations and our's are - * synchronized by this test which is only made while - * holding the napi->poll_lock. - */ - if (!test_bit(NAPI_STATE_SCHED, &napi->state)) - return; + int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need From e71fb423e0dea3c9f98f0101e965426edfe849cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:52 -0700 Subject: [PATCH 316/499] hinic: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. hinic uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Note that hinic_netpoll() was incorrectly scheduling NAPI on both RX and TX queues. Signed-off-by: Eric Dumazet Cc: Aviad Krawczyk Signed-off-by: David S. 
Miller --- .../net/ethernet/huawei/hinic/hinic_main.c | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 09e9da10b786..4a8f82938ed5 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -789,23 +789,6 @@ static void hinic_get_stats64(struct net_device *netdev, stats->tx_errors = nic_tx_stats->tx_dropped; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void hinic_netpoll(struct net_device *netdev) -{ - struct hinic_dev *nic_dev = netdev_priv(netdev); - int i, num_qps; - - num_qps = hinic_hwdev_num_qps(nic_dev->hwdev); - for (i = 0; i < num_qps; i++) { - struct hinic_txq *txq = &nic_dev->txqs[i]; - struct hinic_rxq *rxq = &nic_dev->rxqs[i]; - - napi_schedule(&txq->napi); - napi_schedule(&rxq->napi); - } -} -#endif - static const struct net_device_ops hinic_netdev_ops = { .ndo_open = hinic_open, .ndo_stop = hinic_close, @@ -818,9 +801,6 @@ static const struct net_device_ops hinic_netdev_ops = { .ndo_start_xmit = hinic_xmit_frame, .ndo_tx_timeout = hinic_tx_timeout, .ndo_get_stats64 = hinic_get_stats64, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = hinic_netpoll, -#endif }; static void netdev_features_init(struct net_device *netdev) From 226a2dd62c5d789088fcf7804fbe5613887870a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:53 -0700 Subject: [PATCH 317/499] ehea: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ehea uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Douglas Miller Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ehea/ehea_main.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index ba580bfae512..03f64f40b2a3 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -921,17 +921,6 @@ static int ehea_poll(struct napi_struct *napi, int budget) return rx; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void ehea_netpoll(struct net_device *dev) -{ - struct ehea_port *port = netdev_priv(dev); - int i; - - for (i = 0; i < port->num_def_qps; i++) - napi_schedule(&port->port_res[i].napi); -} -#endif - static irqreturn_t ehea_recv_irq_handler(int irq, void *param) { struct ehea_port_res *pr = param; @@ -2953,9 +2942,6 @@ static const struct net_device_ops ehea_netdev_ops = { .ndo_open = ehea_open, .ndo_stop = ehea_stop, .ndo_start_xmit = ehea_start_xmit, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ehea_netpoll, -#endif .ndo_get_stats64 = ehea_get_stats64, .ndo_set_mac_address = ehea_set_mac_addr, .ndo_validate_addr = eth_validate_addr, From 4bd2c03be707253f1157bd759fdd6971e4f70403 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:54 -0700 Subject: [PATCH 318/499] net: hns: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). 
This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. hns uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Yisen Zhuang Cc: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 5ce23d4b717e..28e907831b0e 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -1503,21 +1503,6 @@ static int hns_nic_do_ioctl(struct net_device *netdev, struct ifreq *ifr, return phy_mii_ioctl(phy_dev, ifr, cmd); } -/* use only for netconsole to poll with the device without interrupt */ -#ifdef CONFIG_NET_POLL_CONTROLLER -static void hns_nic_poll_controller(struct net_device *ndev) -{ - struct hns_nic_priv *priv = netdev_priv(ndev); - unsigned long flags; - int i; - - local_irq_save(flags); - for (i = 0; i < priv->ae_handle->q_num * 2; i++) - napi_schedule(&priv->ring_data[i].napi); - local_irq_restore(flags); -} -#endif - static netdev_tx_t hns_nic_net_xmit(struct sk_buff *skb, struct net_device *ndev) { @@ -1970,9 +1955,6 @@ static const struct net_device_ops hns_nic_netdev_ops = { .ndo_set_features = hns_nic_set_features, .ndo_fix_features = hns_nic_fix_features, .ndo_get_stats64 = hns_nic_get_stats64, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = hns_nic_poll_controller, -#endif .ndo_set_rx_mode = hns_nic_set_rx_mode, .ndo_select_queue = hns_nic_select_queue, }; From 260dd2c3e2aeefbe78065f0737dceae1ceb1196a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:55 -0700 Subject: [PATCH 319/499] virtio_net: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. virto_net uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: "Michael S. Tsirkin" Cc: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 765920905226..dab504ec5e50 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1699,17 +1699,6 @@ static void virtnet_stats(struct net_device *dev, tot->rx_frame_errors = dev->stats.rx_frame_errors; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void virtnet_netpoll(struct net_device *dev) -{ - struct virtnet_info *vi = netdev_priv(dev); - int i; - - for (i = 0; i < vi->curr_queue_pairs; i++) - napi_schedule(&vi->rq[i].napi); -} -#endif - static void virtnet_ack_link_announce(struct virtnet_info *vi) { rtnl_lock(); @@ -2447,9 +2436,6 @@ static const struct net_device_ops virtnet_netdev = { .ndo_get_stats64 = virtnet_stats, .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = virtnet_netpoll, -#endif .ndo_bpf = virtnet_xdp, .ndo_xdp_xmit = virtnet_xdp_xmit, .ndo_features_check = passthru_features_check, From 81b059b2187d77b957bf85318dbd4f36d60555e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:56 -0700 Subject: [PATCH 320/499] qlcnic: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. qlcnic uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Harish Patil Cc: Manish Chopra Signed-off-by: David S. 
Miller --- .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 45 ------------------- 1 file changed, 45 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 2d38d1ac2aae..dbd48012224f 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -59,9 +59,6 @@ static int qlcnic_close(struct net_device *netdev); static void qlcnic_tx_timeout(struct net_device *netdev); static void qlcnic_attach_work(struct work_struct *work); static void qlcnic_fwinit_work(struct work_struct *work); -#ifdef CONFIG_NET_POLL_CONTROLLER -static void qlcnic_poll_controller(struct net_device *netdev); -#endif static void qlcnic_idc_debug_info(struct qlcnic_adapter *adapter, u8 encoding); static int qlcnic_can_start_firmware(struct qlcnic_adapter *adapter); @@ -545,9 +542,6 @@ static const struct net_device_ops qlcnic_netdev_ops = { .ndo_udp_tunnel_add = qlcnic_add_vxlan_port, .ndo_udp_tunnel_del = qlcnic_del_vxlan_port, .ndo_features_check = qlcnic_features_check, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = qlcnic_poll_controller, -#endif #ifdef CONFIG_QLCNIC_SRIOV .ndo_set_vf_mac = qlcnic_sriov_set_vf_mac, .ndo_set_vf_rate = qlcnic_sriov_set_vf_tx_rate, @@ -3200,45 +3194,6 @@ static irqreturn_t qlcnic_msix_tx_intr(int irq, void *data) return IRQ_HANDLED; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void qlcnic_poll_controller(struct net_device *netdev) -{ - struct qlcnic_adapter *adapter = netdev_priv(netdev); - struct qlcnic_host_sds_ring *sds_ring; - struct qlcnic_recv_context *recv_ctx; - struct qlcnic_host_tx_ring *tx_ring; - int ring; - - if (!test_bit(__QLCNIC_DEV_UP, &adapter->state)) - return; - - recv_ctx = adapter->recv_ctx; - - for (ring = 0; ring < adapter->drv_sds_rings; ring++) { - sds_ring = &recv_ctx->sds_rings[ring]; - qlcnic_disable_sds_intr(adapter, sds_ring); - napi_schedule(&sds_ring->napi); - } - - if (adapter->flags & QLCNIC_MSIX_ENABLED) { - /* Only Multi-Tx queue capable devices need to - * schedule NAPI for TX rings - */ - if ((qlcnic_83xx_check(adapter) && - (adapter->flags & QLCNIC_TX_INTR_SHARED)) || - (qlcnic_82xx_check(adapter) && - !qlcnic_check_multi_tx(adapter))) - return; - - for (ring = 0; ring < adapter->drv_tx_rings; ring++) { - tx_ring = &adapter->tx_ring[ring]; - qlcnic_disable_tx_intr(adapter, tx_ring); - napi_schedule(&tx_ring->napi); - } - } -} -#endif - static void qlcnic_idc_debug_info(struct qlcnic_adapter *adapter, u8 encoding) { From 3548fcf7d877c682c9a5a413c51929739192156d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:57 -0700 Subject: [PATCH 321/499] qlogic: netxen: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. netxen uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Manish Chopra Cc: Rahul Verma Signed-off-by: David S. 
Miller --- .../ethernet/qlogic/netxen/netxen_nic_main.c | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 69aa7fc392c5..59c70be22a84 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -72,9 +72,6 @@ static void netxen_schedule_work(struct netxen_adapter *adapter, work_func_t func, int delay); static void netxen_cancel_fw_work(struct netxen_adapter *adapter); static int netxen_nic_poll(struct napi_struct *napi, int budget); -#ifdef CONFIG_NET_POLL_CONTROLLER -static void netxen_nic_poll_controller(struct net_device *netdev); -#endif static void netxen_create_sysfs_entries(struct netxen_adapter *adapter); static void netxen_remove_sysfs_entries(struct netxen_adapter *adapter); @@ -581,9 +578,6 @@ static const struct net_device_ops netxen_netdev_ops = { .ndo_tx_timeout = netxen_tx_timeout, .ndo_fix_features = netxen_fix_features, .ndo_set_features = netxen_set_features, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = netxen_nic_poll_controller, -#endif }; static inline bool netxen_function_zero(struct pci_dev *pdev) @@ -2402,23 +2396,6 @@ static int netxen_nic_poll(struct napi_struct *napi, int budget) return work_done; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void netxen_nic_poll_controller(struct net_device *netdev) -{ - int ring; - struct nx_host_sds_ring *sds_ring; - struct netxen_adapter *adapter = netdev_priv(netdev); - struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; - - disable_irq(adapter->irq); - for (ring = 0; ring < adapter->max_sds_rings; ring++) { - sds_ring = &recv_ctx->sds_rings[ring]; - netxen_intr(adapter->irq, sds_ring); - } - enable_irq(adapter->irq); -} -#endif - static int nx_incr_dev_ref_cnt(struct netxen_adapter *adapter) { From 21627982e4fff76a053f4d08d7fb56e532e08d52 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:58 -0700 Subject: [PATCH 322/499] net: ena: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ena uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Netanel Belgazal Cc: Saeed Bishara Cc: Zorik Machulsky Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 22 -------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 29b5774dd32d..25621a218f20 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -2185,25 +2185,6 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void ena_netpoll(struct net_device *netdev) -{ - struct ena_adapter *adapter = netdev_priv(netdev); - int i; - - /* Dont schedule NAPI if the driver is in the middle of reset - * or netdev is down. 
- */ - - if (!test_bit(ENA_FLAG_DEV_UP, &adapter->flags) || - test_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags)) - return; - - for (i = 0; i < adapter->num_queues; i++) - napi_schedule(&adapter->ena_napi[i].napi); -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - static u16 ena_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev, select_queue_fallback_t fallback) @@ -2369,9 +2350,6 @@ static const struct net_device_ops ena_netdev_ops = { .ndo_change_mtu = ena_change_mtu, .ndo_set_mac_address = NULL, .ndo_validate_addr = eth_validate_addr, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ena_netpoll, -#endif /* CONFIG_NET_POLL_CONTROLLER */ }; static int ena_device_validate_params(struct ena_adapter *adapter, From 9447a10ff607debe5e30cc438fb56925a559b9d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:31:59 -0700 Subject: [PATCH 323/499] sfc: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. sfc uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Edward Cree Cc: Bert Kenward Cc: Solarflare linux maintainers Acked-By: Bert Kenward Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/efx.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 330233286e78..3d0dd39c289e 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2206,29 +2206,6 @@ static void efx_fini_napi(struct efx_nic *efx) efx_fini_napi_channel(channel); } -/************************************************************************** - * - * Kernel netpoll interface - * - *************************************************************************/ - -#ifdef CONFIG_NET_POLL_CONTROLLER - -/* Although in the common case interrupts will be disabled, this is not - * guaranteed. However, all our work happens inside the NAPI callback, - * so no locking is required. - */ -static void efx_netpoll(struct net_device *net_dev) -{ - struct efx_nic *efx = netdev_priv(net_dev); - struct efx_channel *channel; - - efx_for_each_channel(channel, efx) - efx_schedule_channel(channel); -} - -#endif - /************************************************************************** * * Kernel net device interface @@ -2509,9 +2486,6 @@ static const struct net_device_ops efx_netdev_ops = { #endif .ndo_get_phys_port_id = efx_get_phys_port_id, .ndo_get_phys_port_name = efx_get_phys_port_name, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = efx_netpoll, -#endif .ndo_setup_tc = efx_setup_tc, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = efx_filter_rfs, From a4f570be654de779eaf626a79c4e0aa5a790505f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:32:00 -0700 Subject: [PATCH 324/499] sfc-falcon: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. 
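The handler being removed is essentially the same in every driver touched by this series; a condensed sketch, modeled on the qlcnic/netxen/ena handlers removed above (the adapter layout and the foo_* names are illustrative, not any specific driver's):

#ifdef CONFIG_NET_POLL_CONTROLLER
static void foo_netpoll(struct net_device *netdev)
{
	struct foo_adapter *adapter = netdev_priv(netdev);	/* hypothetical private data */
	int i;

	/* One CPU schedules NAPI for every queue of the NIC at once;
	 * this is the capture problem described above. */
	for (i = 0; i < adapter->num_queues; i++)
		napi_schedule(&adapter->napi[i]);
}
#endif
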
sfc-falcon uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Solarflare linux maintainers Cc: Edward Cree Cc: Bert Kenward Acked-By: Bert Kenward Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/falcon/efx.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index dd5530a4f8c8..03e2455c502e 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2052,29 +2052,6 @@ static void ef4_fini_napi(struct ef4_nic *efx) ef4_fini_napi_channel(channel); } -/************************************************************************** - * - * Kernel netpoll interface - * - *************************************************************************/ - -#ifdef CONFIG_NET_POLL_CONTROLLER - -/* Although in the common case interrupts will be disabled, this is not - * guaranteed. However, all our work happens inside the NAPI callback, - * so no locking is required. - */ -static void ef4_netpoll(struct net_device *net_dev) -{ - struct ef4_nic *efx = netdev_priv(net_dev); - struct ef4_channel *channel; - - ef4_for_each_channel(channel, efx) - ef4_schedule_channel(channel); -} - -#endif - /************************************************************************** * * Kernel net device interface @@ -2250,9 +2227,6 @@ static const struct net_device_ops ef4_netdev_ops = { .ndo_set_mac_address = ef4_set_mac_address, .ndo_set_rx_mode = ef4_set_rx_mode, .ndo_set_features = ef4_set_features, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ef4_netpoll, -#endif .ndo_setup_tc = ef4_setup_tc, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = ef4_filter_rfs, From 0c3b9d1b37df16ae6046a5a01f769bf3d21b838c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Sep 2018 09:32:01 -0700 Subject: [PATCH 325/499] ibmvnic: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ibmvnic uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. ibmvnic_netpoll_controller() was completely wrong anyway, as it was scheduling NAPI to service RX queues (instead of TX), so I doubt netpoll ever worked on this driver. Signed-off-by: Eric Dumazet Cc: Thomas Falcon Cc: John Allen Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 4f0daf67b18d..699ef942b615 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2207,19 +2207,6 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget) return frames_processed; } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void ibmvnic_netpoll_controller(struct net_device *dev) -{ - struct ibmvnic_adapter *adapter = netdev_priv(dev); - int i; - - replenish_pools(netdev_priv(dev)); - for (i = 0; i < adapter->req_rx_queues; i++) - ibmvnic_interrupt_rx(adapter->rx_scrq[i]->irq, - adapter->rx_scrq[i]); -} -#endif - static int wait_for_reset(struct ibmvnic_adapter *adapter) { int rc, ret; @@ -2292,9 +2279,6 @@ static const struct net_device_ops ibmvnic_netdev_ops = { .ndo_set_mac_address = ibmvnic_set_mac, .ndo_validate_addr = eth_validate_addr, .ndo_tx_timeout = ibmvnic_tx_timeout, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ibmvnic_netpoll_controller, -#endif .ndo_change_mtu = ibmvnic_change_mtu, .ndo_features_check = ibmvnic_features_check, }; From befb1b3c2703897c5b8ffb0044dc5d0e5f27c5d7 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 19 Sep 2018 10:29:06 -0700 Subject: [PATCH 326/499] perf/core: Add sanity check to deal with pinned event failure It is possible that a failure can occur during the scheduling of a pinned event. The initial portion of perf_event_read_local() contains the various error checks an event should pass before it can be considered valid. Ensure that the potential scheduling failure of a pinned event is checked for and have a credible error. Suggested-by: Peter Zijlstra Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: acme@kernel.org Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/6486385d1f30336e9973b24c8c65f5079543d3d3.1537377064.git.reinette.chatre@intel.com --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index c80549bf82c6..dcb093e7b377 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3935,6 +3935,12 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* If this is a pinned event it must be running on this CPU */ + if (event->attr.pinned && event->oncpu != smp_processor_id()) { + ret = -EBUSY; + goto out; + } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise From 573bcd380921b5216b62dcd072ec426f5ecbeb9d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 28 Sep 2018 17:34:21 -0500 Subject: [PATCH 327/499] MAINTAINERS: Remove obsolete drivers/pci pattern from ACPI section Prior to 256a45937093 ("PCI/AER: Squash aerdrv_acpi.c into aerdrv.c"), drivers/pci/pcie/aer/aerdrv_acpi.c contained code to parse the ACPI HEST table. That code now lives in drivers/pci/pcie/aer.c. Remove the "F: drivers/pci/*/*/*acpi*" pattern because it matches nothing. We could add a "F: drivers/pci/pcie/aer.c" pattern to the ACPI APEI section, but that file sees a lot of changes, almost none of which are of interest to the ACPI folks. Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. 
Wysocki --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 700408b7bc53..9babd8a0406b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -324,7 +324,6 @@ F: Documentation/ABI/testing/sysfs-bus-acpi F: Documentation/ABI/testing/configfs-acpi F: drivers/pci/*acpi* F: drivers/pci/*/*acpi* -F: drivers/pci/*/*/*acpi* F: tools/power/acpi/ ACPI APEI From ae29478766f4c8e16edca6fe1e25d73c47991ebe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 29 Sep 2018 13:40:40 +1000 Subject: [PATCH 328/499] xfs: don't crash the vfs on a garbage inline symlink The VFS routine that calls ->get_link blindly copies whatever's returned into the user's buffer. If we return a NULL pointer, the vfs will crash on the null pointer. Therefore, return -EFSCORRUPTED instead of blowing up the kernel. [dgc: clean up with hch's suggestions] Reported-by: wen.xu@gatech.edu Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c3e74f9128e8..f48ffd7a8d3e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -471,8 +471,18 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + char *link; + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); - return XFS_I(inode)->i_df.if_u1.if_data; + + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if + * if_data is junk. + */ + link = XFS_I(inode)->i_df.if_u1.if_data; + if (!link) + return ERR_PTR(-EFSCORRUPTED); + return link; } STATIC int From d5a2e2893da0d62c3888c91ae2da798adc17a9b9 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sat, 29 Sep 2018 13:41:58 +1000 Subject: [PATCH 329/499] xfs: remove last of unnecessary xfs_defer_cancel() callers Now that deferred operations are completely managed via transactions, it's no longer necessary to cancel the dfops in error paths that already cancel the associated transaction. There are a few such calls lingering throughout the codebase. Remove all remaining unnecessary calls to xfs_defer_cancel(). This leaves xfs_defer_cancel() calls in two places. The first is the call in the transaction cancel path itself, which facilitates this patch. The second is made via the xfs_defer_finish() error path to provide consistent error semantics with transaction commit. For example, xfs_trans_commit() expects an xfs_defer_finish() failure to clean up the dfops structure before it returns. 
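A minimal sketch of the error-flow convention this leaves behind (do_deferred_updates() is an illustrative helper; the point is that error paths simply unwind through xfs_trans_cancel(), which also cancels any pending dfops):

	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
	if (error)
		return error;

	error = do_deferred_updates(tp);	/* illustrative helper that queues dfops */
	if (error)
		goto out_cancel;		/* no explicit xfs_defer_cancel() needed */

	return xfs_trans_commit(tp);

out_cancel:
	xfs_trans_cancel(tp);			/* cancels pending deferred ops too */
	return error;
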
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 28 ++++++++-------------------- fs/xfs/libxfs/xfs_attr_remote.c | 10 ++-------- fs/xfs/xfs_bmap_util.c | 12 +++++------- fs/xfs/xfs_inode.c | 10 +--------- 4 files changed, 16 insertions(+), 44 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1e671d4eb6fa..c6299f82a6e4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -587,7 +587,7 @@ xfs_attr_leaf_addname( */ error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -675,7 +675,7 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -693,9 +693,6 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -738,15 +735,12 @@ xfs_attr_leaf_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -864,7 +858,7 @@ xfs_attr_node_addname( state = NULL; error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -888,7 +882,7 @@ xfs_attr_node_addname( */ error = xfs_da3_split(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -984,7 +978,7 @@ xfs_attr_node_addname( if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1013,9 +1007,6 @@ xfs_attr_node_addname( if (error) return error; return retval; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* @@ -1107,7 +1098,7 @@ xfs_attr_node_removename( if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1138,7 +1129,7 @@ xfs_attr_node_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1150,9 +1141,6 @@ xfs_attr_node_removename( out: xfs_da_state_free(state); return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index af094063e402..d89363c6b523 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -485,7 +485,7 @@ xfs_attr_rmtval_set( blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, &nmap); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -553,9 +553,6 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -625,7 +622,7 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, 
XFS_BMAPI_ATTRFORK, 1, &done); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -638,7 +635,4 @@ xfs_attr_rmtval_remove( return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index addbd74ecd8e..ae3cc393724f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1584,7 +1584,7 @@ xfs_swap_extent_rmap( tirec.br_blockcount, &irec, &nimaps, 0); if (error) - goto out_defer; + goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startoff == irec.br_startoff); trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); @@ -1599,22 +1599,22 @@ xfs_swap_extent_rmap( /* Remove the mapping from the donor file. */ error = xfs_bmap_unmap_extent(tp, tip, &uirec); if (error) - goto out_defer; + goto out; /* Remove the mapping from the source file. */ error = xfs_bmap_unmap_extent(tp, ip, &irec); if (error) - goto out_defer; + goto out; /* Map the donor file's blocks into the source file. */ error = xfs_bmap_map_extent(tp, ip, &uirec); if (error) - goto out_defer; + goto out; /* Map the source file's blocks into the donor file. */ error = xfs_bmap_map_extent(tp, tip, &irec); if (error) - goto out_defer; + goto out; error = xfs_defer_finish(tpp); tp = *tpp; @@ -1636,8 +1636,6 @@ xfs_swap_extent_rmap( tip->i_d.di_flags2 = tip_flags2; return 0; -out_defer: - xfs_defer_cancel(tp); out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); tip->i_d.di_flags2 = tip_flags2; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d957a46dc1cb..05db9540e459 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags( error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS, &done); if (error) - goto out_bmap_cancel; + goto out; /* * Duplicate the transaction that has the permanent @@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags( out: *tpp = tp; return error; -out_bmap_cancel: - /* - * If the bunmapi call encounters an error, return to the caller where - * the transaction can be properly aborted. We just need to make sure - * we're not holding any resources that we were not when we came in. - */ - xfs_defer_cancel(tp); - goto out; } int From d9183105caa926522a4bc8a40e162de7019f1a21 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sat, 29 Sep 2018 13:44:40 +1000 Subject: [PATCH 330/499] xfs: don't unlock invalidated buf on aborted tx commit xfstests generic/388,475 occasionally reproduce assertion failures in xfs_buf_item_unpin() when the final bli reference is dropped on an invalidated buffer and the buffer is not locked as it is expected to be. Invalidated buffers should remain locked on transaction commit until the final unpin, at which point the buffer is removed from the AIL and the bli is freed since stale buffers are not written back. The assert failures are associated with filesystem shutdown, typically due to log I/O errors injected by the test. The problematic situation can occur if the shutdown happens to cause a race between an active transaction that has invalidated a particular buffer and an I/O error on a log buffer that contains the bli associated with the same (now stale) buffer. Both transaction and log contexts acquire a bli reference. If the transaction has already invalidated the buffer by the time the I/O error occurs and ends up aborting due to shutdown, the transaction and log hold the last two references to a stale bli. 
If the transaction cancel occurs first, it treats the buffer as non-stale due to the aborted state: the bli reference is dropped and the buffer is released/unlocked. The log buffer I/O error handling eventually calls into xfs_buf_item_unpin(), drops the final reference to the bli and treats it as stale. The buffer wasn't left locked by xfs_buf_item_unlock(), however, so the assert fails and the buffer is double unlocked. The latter problem is mitigated by the fact that the fs is shutdown and no further damage is possible. ->iop_unlock() of an invalidated buffer should behave consistently with respect to the bli refcount, regardless of aborted state. If the refcount remains elevated on commit, we know the bli is awaiting an unpin (since it can't be in another transaction) and will be handled appropriately on log buffer completion. If the final bli reference of an invalidated buffer is dropped in ->iop_unlock(), we can assume the transaction has aborted because invalidation implies a dirty transaction. In the non-abort case, the log would have acquired a bli reference in ->iop_pin() and prevented bli release at ->iop_unlock() time. In the abort case the item must be freed and buffer unlocked because it wasn't pinned by the log. Rework xfs_buf_item_unlock() to simplify the currently circuitous and duplicate logic and leave invalidated buffers locked based on bli refcount, regardless of aborted state. This ensures that a pinned, stale buffer is always found locked when eventually unpinned. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.c | 91 ++++++++++++++++++++----------------------- fs/xfs/xfs_trace.h | 1 - 2 files changed, 42 insertions(+), 50 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1c9d1398980b..42fce70b474d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -556,73 +556,66 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + bool freed; bool aborted; - bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); - bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool hold = bip->bli_flags & XFS_BLI_HOLD; + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; + bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) - bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; #endif - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); - - /* Clear the buffer's association with this transaction. */ - bp->b_transp = NULL; - - /* - * The per-transaction state has been copied above so clear it from the - * bli. - */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - - /* - * If the buf item is marked stale, then don't do anything. We'll - * unlock the buffer and free the buf item when the buffer is unpinned - * for the last time. - */ - if (bip->bli_flags & XFS_BLI_STALE) { - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - if (!aborted) { - atomic_dec(&bip->bli_refcount); - return; - } - } - trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. If we are aborting the transaction, this may - * be the only reference to the buf item, so we free it anyway - * regardless of whether it is dirty or not. A dirty abort implies a - * shutdown, anyway. 
- * * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. */ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); /* - * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be in the AIL regardless of dirty state. An aborted - * transaction that invalidates a buffer already in the AIL may have - * marked it stale and cleared the dirty state, for example. - * - * Therefore if we are aborting a buffer and we've just taken the last - * reference away, we have to check if it is in the AIL before freeing - * it. We need to free it in this case, because an aborted transaction - * has already shut the filesystem down and this is the last chance we - * will have to do so. + * Clear the buffer's association with this transaction and + * per-transaction state from the bli, which has been copied above. */ - if (atomic_dec_and_test(&bip->bli_refcount)) { - if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + bp->b_transp = NULL; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + + /* + * Drop the transaction's bli reference and deal with the item if we had + * the last one. We must free the item if clean or aborted since it + * wasn't pinned by the log and this is the last chance to do so. If the + * bli is freed and dirty (but non-aborted), the buffer was not dirty in + * this transaction but modified by a previous one and still awaiting + * writeback. In that case, the bli is freed on buffer writeback + * completion. + */ + freed = atomic_dec_and_test(&bip->bli_refcount); + if (freed) { + ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp)); + /* + * An aborted item may be in the AIL regardless of dirty state. + * For example, consider an aborted transaction that invalidated + * a dirty bli and cleared the dirty state. + */ + if (aborted) xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + if (aborted || !dirty) xfs_buf_item_relse(bp); - } else if (!dirty) - xfs_buf_item_relse(bp); + } else if (stale) { + /* + * Stale buffers remain locked until final unpin unless the bli + * was freed in the branch above. A freed stale bli implies an + * abort because buffer invalidation dirties the bli and + * transaction. + */ + ASSERT(!freed); + return; } + ASSERT(!stale || (aborted && freed)); if (!hold) xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ad315e83bc02..3043e5ed6495 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); From 23420d05e67d23728e116321c4afe084ebfa6427 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sat, 29 Sep 2018 13:45:02 +1000 Subject: [PATCH 331/499] xfs: clean up xfs_trans_brelse() xfs_trans_brelse() is a bit of a historical mess, similar to xfs_buf_item_unlock(). It is unnecessarily verbose, has snippets of commented out code, inconsistency with regard to stale items, etc. 
Clean up xfs_trans_brelse() to use similar logic and flow as xfs_buf_item_unlock() with regard to bli reference count handling. This patch makes no functional changes, but facilitates further refactoring of the common bli reference count handling code. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans_buf.c | 112 +++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 72 deletions(-) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15919f67a88f..7498f87ceed3 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -322,49 +322,40 @@ xfs_trans_read_buf_map( } /* - * Release the buffer bp which was previously acquired with one of the - * xfs_trans_... buffer allocation routines if the buffer has not - * been modified within this transaction. If the buffer is modified - * within this transaction, do decrement the recursion count but do - * not release the buffer even if the count goes to 0. If the buffer is not - * modified within the transaction, decrement the recursion count and - * release the buffer if the recursion count goes to 0. + * Release a buffer previously joined to the transaction. If the buffer is + * modified within this transaction, decrement the recursion count but do not + * release the buffer even if the count goes to 0. If the buffer is not modified + * within the transaction, decrement the recursion count and release the buffer + * if the recursion count goes to 0. * - * If the buffer is to be released and it was not modified before - * this transaction began, then free the buf_log_item associated with it. + * If the buffer is to be released and it was not already dirty before this + * transaction began, then also free the buf_log_item associated with it. * - * If the transaction pointer is NULL, make this just a normal - * brelse() call. + * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( - xfs_trans_t *tp, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf_log_item *bip; - int freed; + struct xfs_buf_log_item *bip = bp->b_log_item; + bool freed; + bool dirty; - /* - * Default to a normal brelse() call if the tp is NULL. - */ - if (tp == NULL) { - ASSERT(bp->b_transp == NULL); + ASSERT(bp->b_transp == tp); + + if (!tp) { xfs_buf_relse(bp); return; } - ASSERT(bp->b_transp == tp); - bip = bp->b_log_item; + trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - trace_xfs_trans_brelse(bip); - /* - * If the release is just for a recursive lock, - * then decrement the count and return. + * If the release is for a recursive lookup, then decrement the count + * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; @@ -372,63 +363,40 @@ xfs_trans_brelse( } /* - * If the buffer is dirty within this transaction, we can't + * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; - - /* - * If the buffer has been invalidated, then we can't release - * it until the transaction commits to disk unless it is re-dirtied - * as part of this transaction. This prevents us from pulling - * the item from the AIL before we should. 
- */ if (bip->bli_flags & XFS_BLI_STALE) return; + /* + * Unlink the log item from the transaction and clear the hold flag, if + * set. We wouldn't want the next user of the buffer to get confused. + */ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - - /* - * Free up the log item descriptor tracking the released item. - */ xfs_trans_del_item(&bip->bli_item); + bip->bli_flags &= ~XFS_BLI_HOLD; /* - * Clear the hold flag in the buf log item if it is set. - * We wouldn't want the next user of the buffer to - * get confused. - */ - if (bip->bli_flags & XFS_BLI_HOLD) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } - - /* - * Drop our reference to the buf log item. + * Drop the reference to the bli. At this point, the bli must be either + * freed or dirty (or both). If freed, there are a couple cases where we + * are responsible to free the item. If the bli is clean, we're the last + * user of it. If the fs has shut down, the bli may be dirty and AIL + * resident, but won't ever be written back. We therefore may also need + * to remove it from the AIL before freeing it. */ freed = atomic_dec_and_test(&bip->bli_refcount); - - /* - * If the buf item is not tracking data in the log, then we must free it - * before releasing the buffer back to the free pool. - * - * If the fs has shutdown and we dropped the last reference, it may fall - * on us to release a (possibly dirty) bli if it never made it to the - * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need - * ail_lock, just force remove from the AIL and release the bli here. - */ - if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { - xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { -/*** - ASSERT(bp->b_pincount == 0); -***/ - ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); - xfs_buf_item_relse(bp); + dirty = bip->bli_flags & XFS_BLI_DIRTY; + ASSERT(freed || dirty); + if (freed) { + bool abort = XFS_FORCED_SHUTDOWN(tp->t_mountp); + ASSERT(abort || !test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + if (abort) + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); + if (!dirty || abort) + xfs_buf_item_relse(bp); } bp->b_transp = NULL; From 95808459b110f16b50f03a70ecfa72bb14bd8a96 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sat, 29 Sep 2018 13:45:26 +1000 Subject: [PATCH 332/499] xfs: refactor xfs_buf_log_item reference count handling The xfs_buf_log_item structure has a reference counter with slightly tricky semantics. In the common case, a buffer is logged and committed in a transaction, committed to the on-disk log (added to the AIL) and then finally written back and removed from the AIL. The bli refcount covers two potentially overlapping timeframes: 1. the bli is held in an active transaction 2. the bli is pinned by the log The caveat to this approach is that the reference counter does not purely dictate the lifetime of the bli. IOW, when a dirty buffer is physically logged and unpinned, the bli refcount may go to zero as the log item is inserted into the AIL. Only once the buffer is written back can the bli finally be freed. The above semantics means that it is not enough for the various refcount decrementing contexts to release the bli on decrement to zero. 
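In outline, the common-case reference flow is (a simplified sketch; per the description above, the transaction and the log each hold one reference):

	buffer joined to a transaction          refcount -> 1  (transaction reference)
	->iop_pin()    at commit to the CIL     refcount -> 2  (log reference)
	->iop_unlock() at transaction commit    refcount -> 1
	->iop_unpin()  at log I/O completion    refcount -> 0, item sits in the AIL until writeback frees the bli
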
xfs_trans_brelse(), transaction commit (->iop_unlock()) and unpin (->iop_unpin()) must all drop the associated reference and make additional checks to determine if the current context is responsible for freeing the item. For example, if a transaction holds but does not dirty a particular bli, the commit may drop the refcount to zero. If the bli itself is clean, it is also not AIL resident and must be freed at this time. The same is true for xfs_trans_brelse(). If the transaction dirties a bli and then aborts or an unpin results in an abort due to a log I/O error, the last reference count holder is expected to explicitly remove the item from the AIL and release it (since an abort means filesystem shutdown and metadata writeback will never occur). This leads to fairly complex checks being replicated in a few different places. Since ->iop_unlock() and xfs_trans_brelse() are nearly identical, refactor the logic into a common helper that implements and documents the semantics in one place. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item.c | 90 +++++++++++++++++++++++++----------------- fs/xfs/xfs_buf_item.h | 1 + fs/xfs/xfs_trans_buf.c | 23 +---------- 3 files changed, 56 insertions(+), 58 deletions(-) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 42fce70b474d..12d8455bfbb2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -531,6 +531,49 @@ xfs_buf_item_push( return rval; } +/* + * Drop the buffer log item refcount and take appropriate action. This helper + * determines whether the bli must be freed or not, since a decrement to zero + * does not necessarily mean the bli is unused. + * + * Return true if the bli is freed, false otherwise. + */ +bool +xfs_buf_item_put( + struct xfs_buf_log_item *bip) +{ + struct xfs_log_item *lip = &bip->bli_item; + bool aborted; + bool dirty; + + /* drop the bli ref and return if it wasn't the last one */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + return false; + + /* + * We dropped the last ref and must free the item if clean or aborted. + * If the bli is dirty and non-aborted, the buffer was clean in the + * transaction but still awaiting writeback from previous changes. In + * that case, the bli is freed on buffer writeback completion. + */ + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp); + dirty = bip->bli_flags & XFS_BLI_DIRTY; + if (dirty && !aborted) + return false; + + /* + * The bli is aborted or clean. An aborted item may be in the AIL + * regardless of dirty state. For example, consider an aborted + * transaction that invalidated a dirty bli and cleared the dirty + * state. + */ + if (aborted) + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip->bli_buf); + return true; +} + /* * Release the buffer associated with the buf log item. 
If there is no dirty * logged data associated with the buffer recorded in the buf log item, then @@ -556,13 +599,12 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool freed; - bool aborted; + bool released; bool hold = bip->bli_flags & XFS_BLI_HOLD; - bool dirty = bip->bli_flags & XFS_BLI_DIRTY; bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) bool ordered = bip->bli_flags & XFS_BLI_ORDERED; + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; #endif trace_xfs_buf_item_unlock(bip); @@ -575,8 +617,6 @@ xfs_buf_item_unlock( (ordered && dirty && !xfs_buf_item_dirty_format(bip))); ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); - /* * Clear the buffer's association with this transaction and * per-transaction state from the bli, which has been copied above. @@ -585,40 +625,16 @@ xfs_buf_item_unlock( bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* - * Drop the transaction's bli reference and deal with the item if we had - * the last one. We must free the item if clean or aborted since it - * wasn't pinned by the log and this is the last chance to do so. If the - * bli is freed and dirty (but non-aborted), the buffer was not dirty in - * this transaction but modified by a previous one and still awaiting - * writeback. In that case, the bli is freed on buffer writeback - * completion. + * Unref the item and unlock the buffer unless held or stale. Stale + * buffers remain locked until final unpin unless the bli is freed by + * the unref call. The latter implies shutdown because buffer + * invalidation dirties the bli and transaction. */ - freed = atomic_dec_and_test(&bip->bli_refcount); - if (freed) { - ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp)); - /* - * An aborted item may be in the AIL regardless of dirty state. - * For example, consider an aborted transaction that invalidated - * a dirty bli and cleared the dirty state. - */ - if (aborted) - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); - if (aborted || !dirty) - xfs_buf_item_relse(bp); - } else if (stale) { - /* - * Stale buffers remain locked until final unpin unless the bli - * was freed in the branch above. A freed stale bli implies an - * abort because buffer invalidation dirties the bli and - * transaction. 
- */ - ASSERT(!freed); + released = xfs_buf_item_put(bip); + if (hold || (stale && !released)) return; - } - ASSERT(!stale || (aborted && freed)); - - if (!hold) - xfs_buf_relse(bp); + ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + xfs_buf_relse(bp); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f7d7b72e7e6..90f65f891fab 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -51,6 +51,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); +bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 7498f87ceed3..286a287ac57a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -339,8 +339,6 @@ xfs_trans_brelse( struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; - bool freed; - bool dirty; ASSERT(bp->b_transp == tp); @@ -379,25 +377,8 @@ xfs_trans_brelse( xfs_trans_del_item(&bip->bli_item); bip->bli_flags &= ~XFS_BLI_HOLD; - /* - * Drop the reference to the bli. At this point, the bli must be either - * freed or dirty (or both). If freed, there are a couple cases where we - * are responsible to free the item. If the bli is clean, we're the last - * user of it. If the fs has shut down, the bli may be dirty and AIL - * resident, but won't ever be written back. We therefore may also need - * to remove it from the AIL before freeing it. - */ - freed = atomic_dec_and_test(&bip->bli_refcount); - dirty = bip->bli_flags & XFS_BLI_DIRTY; - ASSERT(freed || dirty); - if (freed) { - bool abort = XFS_FORCED_SHUTDOWN(tp->t_mountp); - ASSERT(abort || !test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - if (abort) - xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); - if (!dirty || abort) - xfs_buf_item_relse(bp); - } + /* drop the reference to the bli */ + xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); From 8683edb7755b853f0dd92e07fe2e7a7e675a84d7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 29 Sep 2018 13:46:21 +1000 Subject: [PATCH 333/499] xfs: avoid lockdep false positives in xfs_trans_alloc We've had a few reports of lockdep tripping over memory reclaim context vs filesystem freeze "deadlocks". They all have looked to be false positives on analysis, but it seems that they are being tripped because we take freeze references before we run a GFP_KERNEL allocation for the struct xfs_trans. We can avoid this false positive vector just by re-ordering the operations in xfs_trans_alloc(). That is. we need allocate the structure before we take the freeze reference and enter the GFP_NOFS allocation context that follows the xfs_trans around. This prevents lockdep from seeing the GFP_KERNEL allocation inside the transaction context, and that prevents it from triggering the freeze level vs alloc context vs reclaim warnings. 
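The change is purely a reordering; a sketch of the relevant lines (see the diff below for the full context, including the KM_NOFS case):

	/* old: the GFP_KERNEL-capable allocation happens inside the
	 * freeze-protected section taken by sb_start_intwrite() */
	sb_start_intwrite(mp->m_super);
	...
	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);

	/* new: allocate the handle first, then take the freeze reference */
	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
	sb_start_intwrite(mp->m_super);
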
Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trans.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bedc5a5133a5..912b42f5fe4a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -259,6 +259,14 @@ xfs_trans_alloc( struct xfs_trans *tp; int error; + /* + * Allocate the handle before we do our freeze accounting and setting up + * GFP_NOFS allocation context so that we avoid lockdep false positives + * by doing GFP_KERNEL allocations inside sb_start_intwrite(). + */ + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -270,8 +278,6 @@ xfs_trans_alloc( mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, - (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; From df307077916fde42734a49022fc8ed23cc58caa4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 29 Sep 2018 13:47:15 +1000 Subject: [PATCH 334/499] xfs: fix transaction leak in xfs_reflink_allocate_cow() When xfs_reflink_allocate_cow() allocates a transaction, it drops the ILOCK to perform the operation. This Introduces a race condition where another thread modifying the file can perform the COW allocation operation underneath us. This result in the retry loop finding an allocated block and jumping straight to the conversion code. It does not, however, cancel the transaction it holds and so this gets leaked. This results in a lockdep warning: ================================================ WARNING: lock held when returning to user space! 4.18.5 #1 Not tainted ------------------------------------------------ worker/6123 is leaving the kernel with locks still held! 1 lock held by worker/6123: #0: 000000009eab4f1b (sb_internal#2){.+.+}, at: xfs_trans_alloc+0x17c/0x220 And eventually the filesystem deadlocks because it runs out of log space that is reserved by the leaked transaction and never gets released. The logic flow in xfs_reflink_allocate_cow() is a convoluted mess of gotos - it's no surprise that it has bug where the flow through several goto jumps then fails to clean up context from a non-obvious logic path. CLean up the logic flow and make sure every path does the right thing. Reported-by: Alexander Y. Fomichev Tested-by: Alexander Y. Fomichev Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200981 Signed-off-by: Dave Chinner [hch: slight refactor] Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 133 ++++++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 38f405415b88..d60d0eeed7b9 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -352,6 +352,47 @@ xfs_reflink_convert_cow( return error; } +/* + * Find the extent that maps the given range in the COW fork. Even if the extent + * is not shared we might have a preallocation for it in the COW fork. If so we + * use it that rather than trigger a new allocation. 
+ */ +static int +xfs_find_trim_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared, + bool *found) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + bool trimmed; + + *found = false; + + /* + * If we don't find an overlapping extent, trim the range we need to + * allocate to fit the hole we found. + */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) || + got.br_startoff > offset_fsb) + return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + + *shared = true; + if (isnullstartblock(got.br_startblock)) { + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + return 0; + } + + /* real extent found - no need to allocate */ + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + *found = true; + return 0; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( @@ -363,78 +404,64 @@ xfs_reflink_allocate_cow( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; - struct xfs_bmbt_irec got; - struct xfs_trans *tp = NULL; + struct xfs_trans *tp; int nimaps, error = 0; - bool trimmed; + bool found; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - struct xfs_iext_cursor icur; -retry: - ASSERT(xfs_is_reflink_inode(ip)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_is_reflink_inode(ip)); + + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + return error; + if (found) + goto convert; + + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); + + if (error) + return error; + + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_trans_cancel; /* - * Even if the extent is not shared we might have a preallocation for - * it in the COW fork. If so use it. + * Check for an overlapping extent again now that we dropped the ilock. */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && - got.br_startoff <= offset_fsb) { - *shared = true; - - /* If we have a real allocation in the COW fork we're done. 
*/ - if (!isnullstartblock(got.br_startblock)) { - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; - goto convert; - } - - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - } else { - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); - if (error || !*shared) - goto out; - } - - if (!tp) { - resaligned = xfs_aligned_fsb_count(imap->br_startoff, - imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - - xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); - - if (error) - return error; - - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out; - goto retry; + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + goto out_trans_cancel; + if (found) { + xfs_trans_cancel(tp); + goto convert; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); - nimaps = 1; - /* Allocate the entire reservation as unwritten blocks. */ + nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, resblks, imap, &nimaps); if (error) - goto out_trans_cancel; + goto out_unreserve; xfs_inode_set_cowblocks_tag(ip); - - /* Finish up. */ error = xfs_trans_commit(tp); if (error) return error; @@ -447,12 +474,12 @@ xfs_reflink_allocate_cow( return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); -out_trans_cancel: + +out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); -out: - if (tp) - xfs_trans_cancel(tp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } From 0065b54119973c9089f8222c1c8f9237ed9787a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Sep 2018 13:47:46 +1000 Subject: [PATCH 335/499] xfs: don't bring in extents in xfs_bmap_punch_delalloc_range This function is only used to punch out delayed allocations on I/O failure, which means we need to have read the extents earlier. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ae3cc393724f..6de8d90041ff 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range( struct xfs_iext_cursor icur; int error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) goto out_unlock; From 2863c2ebc4f45d2bc150016f36e2f31c155597e8 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 29 Sep 2018 13:48:21 +1000 Subject: [PATCH 336/499] xfs: remove duplicated include from alloc.c Remove duplicated include xfs_alloc.h Signed-off-by: YueHaibing Reviewed-by: Darrick J. 
Wong Signed-off-by: Dave Chinner --- fs/xfs/scrub/alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 036b5c7021eb..376bcb585ae6 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -17,7 +17,6 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" From f369a13cead821e679c7415dc66a17ec48cc26bf Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 29 Sep 2018 13:49:00 +1000 Subject: [PATCH 337/499] xfs: don't treat unknown di_flags2 as corruption in scrub xchk_inode_flags2() currently treats any di_flags2 values that the running kernel doesn't recognize as corruption, and calls xchk_ino_set_corrupt() if they are set. However, it's entirely possible that these flags were set in some newer kernel and are quite valid, but ignored in this kernel. (Validators don't care one bit about unknown di_flags2.) Call xchk_ino_set_warning instead, because this may or may not actually indicate a problem. Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 2 ++ fs/xfs/scrub/inode.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 059bc44c27e8..afbe336600e1 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1016,6 +1016,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +/* Do not use bit 15, di_flags is legacy and unchanging now */ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 5b3b177c0fc9..e386c9b0b4ab 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -126,6 +126,7 @@ xchk_inode_flags( { struct xfs_mount *mp = sc->mp; + /* di_flags are all taken, last bit cannot be used */ if (flags & ~XFS_DIFLAG_ANY) goto bad; @@ -172,8 +173,9 @@ xchk_inode_flags2( { struct xfs_mount *mp = sc->mp; + /* Unknown di_flags2 could be from a future kernel */ if (flags2 & ~XFS_DIFLAG2_ANY) - goto bad; + xchk_ino_set_warning(sc, ino); /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && From f5f3f959b70b272dfe2d05758e43ae52f5ff7748 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Sep 2018 13:49:58 +1000 Subject: [PATCH 338/499] xfs: skip delalloc COW blocks in xfs_reflink_end_cow The iomap direct I/O code issues a single ->end_io call for the whole I/O request, and if some of the extents cowered needed a COW operation it will call xfs_reflink_end_cow over the whole range. When we do AIO writes we drop the iolock after doing the initial setup, but before the I/O completion. Between dropping the lock and completing the I/O we can have a racing buffered write create new delalloc COW fork extents in the region covered by the outstanding direct I/O write, and thus see delalloc COW fork extents in xfs_reflink_end_cow. As concurrent writes are fundamentally racy and no guarantees are given we can simply skip those. This can be easily reproduced with xfstests generic/208 in always_cow mode. 
Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d60d0eeed7b9..5289e22cb081 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -693,14 +693,12 @@ xfs_reflink_end_cow( if (!del.br_blockcount) goto prev_extent; - ASSERT(!isnullstartblock(got.br_startblock)); - /* - * Don't remap unwritten extents; these are - * speculatively preallocated CoW extents that have been - * allocated but have not yet been involved in a write. + * Only remap real extent that contain data. With AIO + * speculatively preallocations can leak into the range we + * are called upon, and we need to skip them. */ - if (got.br_state == XFS_EXT_UNWRITTEN) + if (!xfs_bmap_is_real_extent(&got)) goto prev_extent; /* Unmap the old blocks in the data fork. */ From 339e1a3fcdd1990e5ec115325ccb4c6f4cc5ee16 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 29 Sep 2018 13:50:13 +1000 Subject: [PATCH 339/499] xfs: validate inode di_forkoff Verify the inode di_forkoff, lifted from xfs_repair's process_check_inode_forkoff(). Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 30d1d60f1d46..09d9c8cfa4a0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -415,6 +415,31 @@ xfs_dinode_verify_fork( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_forkoff( + struct xfs_dinode *dip, + struct xfs_mount *mp) +{ + if (!XFS_DFORK_Q(dip)) + return NULL; + + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) + return __this_address; + break; + case XFS_DINODE_FMT_LOCAL: /* fall through ... */ + case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ + case XFS_DINODE_FMT_BTREE: + if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + return __this_address; + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -470,6 +495,11 @@ xfs_dinode_verify( if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) return __this_address; + /* check for illegal values of forkoff */ + fa = xfs_dinode_verify_forkoff(dip, mp); + if (fa) + return fa; + /* Do we have appropriate data fork formats for the mode? */ switch (mode & S_IFMT) { case S_IFIFO: From ec2ed0b5e96fea2a4539bd7f7f43499ecd45be8b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sat, 29 Sep 2018 13:50:41 +1000 Subject: [PATCH 340/499] xfs: remove invalid log recovery first/last cycle check One of the first steps of log recovery is to check for the special case of a zeroed log. If the first cycle in the log is zero or the tail portion of the log is zeroed, the head is set to the first instance of cycle 0. xlog_find_zeroed() includes a sanity check that enforces that the first cycle in the log must be 1 if the last cycle is 0. While this is true in most cases, the check is not totally valid because it doesn't consider the case where the filesystem crashed after a partial/out of order log buffer completion that wraps around the end of the physical log. 
For example, consider a filesystem that has completed most of the first cycle of the log, reaches the end of the physical log and splits the next single log buffer write into two in order to wrap around the end of the log. If these I/Os are reordered, the second (wrapped) I/O completes and the first happens to fail, the log is left in a state where the last cycle of the log is 0 and the first cycle is 2. This causes the xlog_find_zeroed() sanity check to fail and prevents the filesystem from mounting. This situation has been reproduced on particular systems via repeated runs of generic/475. This is an expected state that log recovery already knows how to deal with, however. Since the log is still partially zeroed, the head is detected correctly and points to a valid tail. The subsequent stale block detection clears blocks beyond the head up to the tail (within a maximum range), with the express purpose of clearing such out of order writes. As expected, this removes the out of order cycle 2 blocks at the physical start of the log. In other words, the only thing that prevents a clean mount and recovery of the filesystem in this scenario is the specific (last == 0 && first != 1) sanity check in xlog_find_zeroed(). Since the log head/tail are now independently validated via cycle, log record and CRC checks, this highly specific first cycle check is of dubious value. Remove it and rely on the higher level validation to determine whether log content is sane and recoverable. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a21dc61ec09e..1fc9e9042e0e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1570,16 +1570,6 @@ xlog_find_zeroed( if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; - } else if (first_cycle != 1) { - /* - * If the cycle of the last block is zero, the cycle of - * the first block must be 1. If it's not, maybe we're - * not looking at a log... Bail out. - */ - xfs_warn(log->l_mp, - "Log inconsistent or not a log (last==0, first!=1)"); - error = -EINVAL; - goto bp_err; } /* we have a partially zeroed log */ From 561295a32579e2db3b22f74d1f45d5c4c4fd9375 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sat, 29 Sep 2018 13:51:01 +1000 Subject: [PATCH 341/499] iomap: set page dirty after partial delalloc on mkwrite The iomap page fault mechanism currently dirties the associated page after the full block range of the page has been allocated. This leaves the page susceptible to delayed allocations without ever being set dirty on sub-page block sized filesystems. For example, consider a page fault on a page with one preexisting real (non-delalloc) block allocated in the middle of the page. The first iomap_apply() iteration performs delayed allocation on the range up to the preexisting block, the next iteration finds the preexisting block, and the last iteration attempts to perform delayed allocation on the range after the prexisting block to the end of the page. If the first allocation succeeds and the final allocation fails with -ENOSPC, iomap_apply() returns the error and iomap_page_mkwrite() fails to dirty the page having already performed partial delayed allocation. This eventually results in the page being invalidated without ever converting the delayed allocation to real blocks. 
This problem is reliably reproduced by generic/083 on XFS on ppc64 systems (64k page size, 4k block size). It results in leaked delalloc blocks on inode reclaim, which triggers an assert failure in xfs_fs_destroy_inode() and filesystem accounting inconsistency. Move the set_page_dirty() call from iomap_page_mkwrite() to the actor callback, similar to how the buffer head implementation works. The actor callback is called iff ->iomap_begin() returns success, so ensures the page is dirtied as soon as possible after an allocation. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap.c b/fs/iomap.c index 74762b1ec233..ec15cf2ec696 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1051,6 +1051,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, } else { WARN_ON_ONCE(!PageUptodate(page)); iomap_page_create(inode, page); + set_page_dirty(page); } return length; @@ -1090,7 +1091,6 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) length -= ret; } - set_page_dirty(page); wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: From d51aea13dd6753186a2bea7619029c460bdf0c4c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Sep 2018 17:22:21 -0700 Subject: [PATCH 342/499] cpufreq: qcom-kryo: Fix section annotations There is currently a warning when building the Kryo cpufreq driver into the kernel image: WARNING: vmlinux.o(.text+0x8aa424): Section mismatch in reference from the function qcom_cpufreq_kryo_probe() to the function .init.text:qcom_cpufreq_kryo_get_msm_id() The function qcom_cpufreq_kryo_probe() references the function __init qcom_cpufreq_kryo_get_msm_id(). This is often because qcom_cpufreq_kryo_probe lacks a __init annotation or the annotation of qcom_cpufreq_kryo_get_msm_id is wrong. Remove the '__init' annotation from qcom_cpufreq_kryo_get_msm_id so that there is no more mismatch warning. Additionally, Nick noticed that the remove function was marked as '__init' when it should really be marked as '__exit'. Fixes: 46e2856b8e18 (cpufreq: Add Kryo CPU scaling driver) Fixes: 5ad7346b4ae2 (cpufreq: kryo: Add module remove and exit) Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Acked-by: Viresh Kumar Cc: 4.18+ # 4.18+ Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/qcom-cpufreq-kryo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c index a1830fa25fc5..2a3675c24032 100644 --- a/drivers/cpufreq/qcom-cpufreq-kryo.c +++ b/drivers/cpufreq/qcom-cpufreq-kryo.c @@ -44,7 +44,7 @@ enum _msm8996_version { struct platform_device *cpufreq_dt_pdev, *kryo_cpufreq_pdev; -static enum _msm8996_version __init qcom_cpufreq_kryo_get_msm_id(void) +static enum _msm8996_version qcom_cpufreq_kryo_get_msm_id(void) { size_t len; u32 *msm_id; @@ -222,7 +222,7 @@ static int __init qcom_cpufreq_kryo_init(void) } module_init(qcom_cpufreq_kryo_init); -static void __init qcom_cpufreq_kryo_exit(void) +static void __exit qcom_cpufreq_kryo_exit(void) { platform_device_unregister(kryo_cpufreq_pdev); platform_driver_unregister(&qcom_cpufreq_kryo_driver); From c4ce446e33d7a0e978256ac6fea4c80e59d9de5f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:50 -0700 Subject: [PATCH 343/499] asix: Check for supported Wake-on-LAN modes The driver currently silently accepts unsupported Wake-on-LAN modes (other than WAKE_PHY or WAKE_MAGIC) without reporting that to the user, which is confusing. Fixes: 2e55cc7210fe ("[PATCH] USB: usbnet (3/9) module for ASIX Ethernet adapters") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index e95dd12edec4..023b8d0bf175 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -607,6 +607,9 @@ int asix_set_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) struct usbnet *dev = netdev_priv(net); u8 opt = 0; + if (wolinfo->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) + return -EINVAL; + if (wolinfo->wolopts & WAKE_PHY) opt |= AX_MONITOR_LINK; if (wolinfo->wolopts & WAKE_MAGIC) From 5ba6b4aa9a410c5e2c6417df52b5e2118ea9b467 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:51 -0700 Subject: [PATCH 344/499] ax88179_178a: Check for supported Wake-on-LAN modes The driver currently silently accepts unsupported Wake-on-LAN modes (other than WAKE_PHY or WAKE_MAGIC) without reporting that to the user, which is confusing. Fixes: e2ca90c276e1 ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/usb/ax88179_178a.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 9e8ad372f419..2207f7a7d1ff 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -566,6 +566,9 @@ ax88179_set_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) struct usbnet *dev = netdev_priv(net); u8 opt = 0; + if (wolinfo->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) + return -EINVAL; + if (wolinfo->wolopts & WAKE_PHY) opt |= AX_MONITOR_MODE_RWLC; if (wolinfo->wolopts & WAKE_MAGIC) From eb9ad088f96653a26b340f7c447c44cf023d5cdc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:52 -0700 Subject: [PATCH 345/499] lan78xx: Check for supported Wake-on-LAN modes The driver supports a fair amount of Wake-on-LAN modes, but is not checking that the user specified one that is supported. 
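As an illustrative note (not part of the original patch): with a check like this in place the problem becomes visible to userspace instead of being silently ignored, i.e. asking for a wake mode the adapter cannot honour (e.g. via "ethtool -s eth0 wol <flags>") now fails with EINVAL rather than appearing to succeed while the chosen trigger never actually wakes the system.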
Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Florian Fainelli Reviewed-by: Woojung Huh Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index a9991c5f4736..c3c9ba44e2a1 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1401,19 +1401,10 @@ static int lan78xx_set_wol(struct net_device *netdev, if (ret < 0) return ret; - pdata->wol = 0; - if (wol->wolopts & WAKE_UCAST) - pdata->wol |= WAKE_UCAST; - if (wol->wolopts & WAKE_MCAST) - pdata->wol |= WAKE_MCAST; - if (wol->wolopts & WAKE_BCAST) - pdata->wol |= WAKE_BCAST; - if (wol->wolopts & WAKE_MAGIC) - pdata->wol |= WAKE_MAGIC; - if (wol->wolopts & WAKE_PHY) - pdata->wol |= WAKE_PHY; - if (wol->wolopts & WAKE_ARP) - pdata->wol |= WAKE_ARP; + if (wol->wolopts & ~WAKE_ALL) + return -EINVAL; + + pdata->wol = wol->wolopts; device_set_wakeup_enable(&dev->udev->dev, (bool)wol->wolopts); From c5cb93e994ffb43b7b3b1ff10b9f928f54574a36 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:53 -0700 Subject: [PATCH 346/499] sr9800: Check for supported Wake-on-LAN modes The driver currently silently accepts unsupported Wake-on-LAN modes (other than WAKE_PHY or WAKE_MAGIC) without reporting that to the user, which is confusing. Fixes: 19a38d8e0aa3 ("USB2NET : SR9800 : One chip USB2.0 USB2NET SR9800 Device Driver Support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/usb/sr9800.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c index 9277a0f228df..35f39f23d881 100644 --- a/drivers/net/usb/sr9800.c +++ b/drivers/net/usb/sr9800.c @@ -421,6 +421,9 @@ sr_set_wol(struct net_device *net, struct ethtool_wolinfo *wolinfo) struct usbnet *dev = netdev_priv(net); u8 opt = 0; + if (wolinfo->wolopts & ~(WAKE_PHY | WAKE_MAGIC)) + return -EINVAL; + if (wolinfo->wolopts & WAKE_PHY) opt |= SR_MONITOR_LINK; if (wolinfo->wolopts & WAKE_MAGIC) From f2750df1548bd8a2b060eb609fc43ca82811af4c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:54 -0700 Subject: [PATCH 347/499] r8152: Check for supported Wake-on-LAN Modes The driver does not check for Wake-on-LAN modes specified by an user, but will conditionally set the device as wake-up enabled or not based on that, which could be a very confusing user experience. Fixes: 21ff2e8976b1 ("r8152: support WOL") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 2cd71bdb6484..f1b5201cc320 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -4506,6 +4506,9 @@ static int rtl8152_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) if (!rtl_can_wakeup(tp)) return -EOPNOTSUPP; + if (wol->wolopts & ~WAKE_ANY) + return -EINVAL; + ret = usb_autopm_get_interface(tp->intf); if (ret < 0) goto out_set_wol; From 9c734b2769a73eea2e9e9767c0e0bf839ff23679 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:55 -0700 Subject: [PATCH 348/499] smsc75xx: Check for Wake-on-LAN modes The driver does not check for Wake-on-LAN modes specified by an user, but will conditionally set the device as wake-up enabled or not based on that, which could be a very confusing user experience. 
Fixes: 6c636503260d ("smsc75xx: add wol magic packet support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/usb/smsc75xx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 05553d252446..e5a4cbb366dc 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -731,6 +731,9 @@ static int smsc75xx_ethtool_set_wol(struct net_device *net, struct smsc75xx_priv *pdata = (struct smsc75xx_priv *)(dev->data[0]); int ret; + if (wolinfo->wolopts & ~SUPPORTED_WAKE) + return -EINVAL; + pdata->wolopts = wolinfo->wolopts & SUPPORTED_WAKE; ret = device_set_wakeup_enable(&dev->udev->dev, pdata->wolopts); From c530c471ba37bdd9fe1c7185b01455c00ae606fb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Sep 2018 16:18:56 -0700 Subject: [PATCH 349/499] smsc95xx: Check for Wake-on-LAN modes The driver does not check for Wake-on-LAN modes specified by a user, but will conditionally set the device as wake-up enabled or not based on that, which could be a very confusing user experience. Fixes: e0e474a83c18 ("smsc95xx: add wol magic packet support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 06b4d290784d..262e7a3c23cb 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -774,6 +774,9 @@ static int smsc95xx_ethtool_set_wol(struct net_device *net, struct smsc95xx_priv *pdata = (struct smsc95xx_priv *)(dev->data[0]); int ret; + if (wolinfo->wolopts & ~SUPPORTED_WAKE) + return -EINVAL; + pdata->wolopts = wolinfo->wolopts & SUPPORTED_WAKE; ret = device_set_wakeup_enable(&dev->udev->dev, pdata->wolopts); From c140eb166d681f66bd7e99fb121357db1a503e7f Mon Sep 17 00:00:00 2001 From: LUU Duc Canh Date: Wed, 26 Sep 2018 21:00:54 +0200 Subject: [PATCH 350/499] tipc: fix failover problem We see the following scenario: 1) Link endpoint B on node 1 discovers that its peer endpoint is gone. Since there is a second working link, failover procedure is started. 2) Link endpoint A on node 1 sends a FAILOVER message to peer endpoint A on node 2. The node item 1->2 goes to state FAILINGOVER. 3) Link endpoint A/2 receives the failover, and is supposed to take down its parallel link endpoint B/2, while producing a FAILOVER message to send back to A/1. 4) However, B/2 has already been deleted, so no FAILOVER message can be created. 5) Node 1->2 remains in state FAILINGOVER forever, refusing to receive any messages that can bring B/1 up again. We are left with a non-redundant link between node 1 and 2. We fix this by letting endpoint A/2 build a dummy FAILOVER message to send back to A/1, so that the situation can be resolved. Signed-off-by: LUU Duc Canh Signed-off-by: Jon Maloy Signed-off-by: David S.
Miller --- net/tipc/link.c | 35 +++++++++++++++++++++++++++++++++++ net/tipc/link.h | 3 +++ net/tipc/node.c | 11 +++++++++++ 3 files changed, 49 insertions(+) diff --git a/net/tipc/link.c b/net/tipc/link.c index 26cc033ee167..4ed650ce6e61 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -410,6 +410,11 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } +u32 tipc_link_state(struct tipc_link *l) +{ + return l->state; +} + /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -1385,6 +1390,36 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, __skb_queue_tail(xmitq, skb); } +void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + u32 onode = tipc_own_addr(l->net); + struct tipc_msg *hdr, *ihdr; + struct sk_buff_head tnlq; + struct sk_buff *skb; + u32 dnode = l->addr; + + skb_queue_head_init(&tnlq); + skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG, + INT_H_SIZE, BASIC_H_SIZE, + dnode, onode, 0, 0, 0); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); + return; + } + + hdr = buf_msg(skb); + msg_set_msgcnt(hdr, 1); + msg_set_bearer_id(hdr, l->peer_bearer_id); + + ihdr = (struct tipc_msg *)msg_data(hdr); + tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, dnode); + msg_set_errcode(ihdr, TIPC_ERR_NO_PORT); + __skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, xmitq); +} + /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets * with contents of the link's transmit and backlog queues. */ diff --git a/net/tipc/link.h b/net/tipc/link.h index 7bc494a33fdf..90488c538a4e 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -88,6 +88,8 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct tipc_link **link); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); +void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl, + struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_fsm_evt(struct tipc_link *l, int evt); bool tipc_link_is_up(struct tipc_link *l); @@ -107,6 +109,7 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); +u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); int tipc_link_window(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index 68014f1b6976..b0ee25f1f2e6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -111,6 +111,7 @@ struct tipc_node { int action_flags; struct list_head list; int state; + bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; @@ -680,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); + n->failover_sent = false; n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); @@ -1615,6 +1617,15 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } + /* If parallel link was already down, and this happened before + * the tunnel link came up, FAILOVER was never sent. Ensure that + * FAILOVER is sent to get peer out of NODE_FAILINGOVER state. 
+ */ + if (n->state != NODE_FAILINGOVER && !n->failover_sent) { + tipc_link_create_dummy_tnl_msg(l, xmitq); + n->failover_sent = true; + } + /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; From c333fa0c4f220f8f7ea5acd6b0ebf3bf13fd684d Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Wed, 26 Sep 2018 12:41:10 -0700 Subject: [PATCH 351/499] qlcnic: fix Tx descriptor corruption on 82xx devices In regular NIC transmission flow, driver always configures MAC using Tx queue zero descriptor as a part of MAC learning flow. But with multi Tx queue supported NIC, regular transmission can occur on any non-zero Tx queue and from that context it uses Tx queue zero descriptor to configure MAC, at the same time TX queue zero could be used by another CPU for regular transmission which could lead to Tx queue zero descriptor corruption and cause FW abort. This patch fixes this in such a way that driver always configures learned MAC address from the same Tx queue which is used for regular transmission. Fixes: 7e2cf4feba05 ("qlcnic: change driver hardware interface mechanism") Signed-off-by: Shahed Shaikh Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic.h | 8 +++++--- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 3 ++- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h | 3 ++- drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h | 3 ++- drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c | 12 ++++++------ 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h index 81312924df14..0c443ea98479 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -1800,7 +1800,8 @@ struct qlcnic_hardware_ops { int (*config_loopback) (struct qlcnic_adapter *, u8); int (*clear_loopback) (struct qlcnic_adapter *, u8); int (*config_promisc_mode) (struct qlcnic_adapter *, u32); - void (*change_l2_filter) (struct qlcnic_adapter *, u64 *, u16); + void (*change_l2_filter)(struct qlcnic_adapter *adapter, u64 *addr, + u16 vlan, struct qlcnic_host_tx_ring *tx_ring); int (*get_board_info) (struct qlcnic_adapter *); void (*set_mac_filter_count) (struct qlcnic_adapter *); void (*free_mac_list) (struct qlcnic_adapter *); @@ -2064,9 +2065,10 @@ static inline int qlcnic_nic_set_promisc(struct qlcnic_adapter *adapter, } static inline void qlcnic_change_filter(struct qlcnic_adapter *adapter, - u64 *addr, u16 id) + u64 *addr, u16 vlan, + struct qlcnic_host_tx_ring *tx_ring) { - adapter->ahw->hw_ops->change_l2_filter(adapter, addr, id); + adapter->ahw->hw_ops->change_l2_filter(adapter, addr, vlan, tx_ring); } static inline int qlcnic_get_board_info(struct qlcnic_adapter *adapter) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index 569d54ededec..a79d84f99102 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -2135,7 +2135,8 @@ int qlcnic_83xx_sre_macaddr_change(struct qlcnic_adapter *adapter, u8 *addr, } void qlcnic_83xx_change_l2_filter(struct qlcnic_adapter *adapter, u64 *addr, - u16 vlan_id) + u16 vlan_id, + struct qlcnic_host_tx_ring *tx_ring) { u8 mac[ETH_ALEN]; memcpy(&mac, addr, ETH_ALEN); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h index b75a81246856..73fe2f64491d 100644 
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h @@ -550,7 +550,8 @@ int qlcnic_83xx_wrt_reg_indirect(struct qlcnic_adapter *, ulong, u32); int qlcnic_83xx_nic_set_promisc(struct qlcnic_adapter *, u32); int qlcnic_83xx_config_hw_lro(struct qlcnic_adapter *, int); int qlcnic_83xx_config_rss(struct qlcnic_adapter *, int); -void qlcnic_83xx_change_l2_filter(struct qlcnic_adapter *, u64 *, u16); +void qlcnic_83xx_change_l2_filter(struct qlcnic_adapter *adapter, u64 *addr, + u16 vlan, struct qlcnic_host_tx_ring *ring); int qlcnic_83xx_get_pci_info(struct qlcnic_adapter *, struct qlcnic_pci_info *); int qlcnic_83xx_set_nic_info(struct qlcnic_adapter *, struct qlcnic_info *); void qlcnic_83xx_initialize_nic(struct qlcnic_adapter *, int); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h index 4bb33af8e2b3..56a3bd9e37dc 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.h @@ -173,7 +173,8 @@ int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, struct net_device *netdev); void qlcnic_82xx_get_beacon_state(struct qlcnic_adapter *); void qlcnic_82xx_change_filter(struct qlcnic_adapter *adapter, - u64 *uaddr, u16 vlan_id); + u64 *uaddr, u16 vlan_id, + struct qlcnic_host_tx_ring *tx_ring); int qlcnic_82xx_config_intr_coalesce(struct qlcnic_adapter *, struct ethtool_coalesce *); int qlcnic_82xx_set_rx_coalesce(struct qlcnic_adapter *); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index 84dd83031a1b..9647578cbe6a 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -268,13 +268,12 @@ static void qlcnic_add_lb_filter(struct qlcnic_adapter *adapter, } void qlcnic_82xx_change_filter(struct qlcnic_adapter *adapter, u64 *uaddr, - u16 vlan_id) + u16 vlan_id, struct qlcnic_host_tx_ring *tx_ring) { struct cmd_desc_type0 *hwdesc; struct qlcnic_nic_req *req; struct qlcnic_mac_req *mac_req; struct qlcnic_vlan_req *vlan_req; - struct qlcnic_host_tx_ring *tx_ring = adapter->tx_ring; u32 producer; u64 word; @@ -301,7 +300,8 @@ void qlcnic_82xx_change_filter(struct qlcnic_adapter *adapter, u64 *uaddr, static void qlcnic_send_filter(struct qlcnic_adapter *adapter, struct cmd_desc_type0 *first_desc, - struct sk_buff *skb) + struct sk_buff *skb, + struct qlcnic_host_tx_ring *tx_ring) { struct vlan_ethhdr *vh = (struct vlan_ethhdr *)(skb->data); struct ethhdr *phdr = (struct ethhdr *)(skb->data); @@ -335,7 +335,7 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter, tmp_fil->vlan_id == vlan_id) { if (jiffies > (QLCNIC_READD_AGE * HZ + tmp_fil->ftime)) qlcnic_change_filter(adapter, &src_addr, - vlan_id); + vlan_id, tx_ring); tmp_fil->ftime = jiffies; return; } @@ -350,7 +350,7 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter, if (!fil) return; - qlcnic_change_filter(adapter, &src_addr, vlan_id); + qlcnic_change_filter(adapter, &src_addr, vlan_id, tx_ring); fil->ftime = jiffies; fil->vlan_id = vlan_id; memcpy(fil->faddr, &src_addr, ETH_ALEN); @@ -766,7 +766,7 @@ netdev_tx_t qlcnic_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } if (adapter->drv_mac_learn) - qlcnic_send_filter(adapter, first_desc, skb); + qlcnic_send_filter(adapter, first_desc, skb, tx_ring); tx_ring->tx_stats.tx_bytes += skb->len; tx_ring->tx_stats.xmit_called++; From 
ac8bd9e13be22a3d24bfc80972d4688e3e50e457 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 26 Sep 2018 22:12:39 +0200 Subject: [PATCH 352/499] r8169: Disable clk during suspend / resume Disable the clk during suspend to save power. Note that tp->clk may be NULL, the clk core functions handle this without problems. Reviewed-by: Andy Shevchenko Tested-by: Carlo Caione Signed-off-by: Hans de Goede Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index ab30aaeac6d3..d6b53f53909a 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6869,8 +6869,10 @@ static int rtl8169_suspend(struct device *device) { struct pci_dev *pdev = to_pci_dev(device); struct net_device *dev = pci_get_drvdata(pdev); + struct rtl8169_private *tp = netdev_priv(dev); rtl8169_net_suspend(dev); + clk_disable_unprepare(tp->clk); return 0; } @@ -6898,6 +6900,9 @@ static int rtl8169_resume(struct device *device) { struct pci_dev *pdev = to_pci_dev(device); struct net_device *dev = pci_get_drvdata(pdev); + struct rtl8169_private *tp = netdev_priv(dev); + + clk_prepare_enable(tp->clk); if (netif_running(dev)) __rtl8169_resume(dev); From 43955a45dc0b4f3be7f0c3afc0e080ed59bb5280 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 26 Sep 2018 22:19:42 +0200 Subject: [PATCH 353/499] netlink: fix typo in nla_parse_nested() comment Fix a simple typo: attribuets -> attributes Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netlink.h b/include/net/netlink.h index 0c154f98e987..39e1d875d507 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -153,7 +153,7 @@ * nla_find() find attribute in stream of attributes * nla_find_nested() find attribute in nested attributes * nla_parse() parse and validate stream of attrs - * nla_parse_nested() parse nested attribuets + * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes * nla_for_each_nested() loop over the nested attributes *========================================================================= From 03d179a840ce9e694db9d69bb643fdee04cfd28f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 30 Sep 2018 13:50:05 +0200 Subject: [PATCH 354/499] MAINTAINERS: fix reference to moved drivers/{misc => auxdisplay}/panel.c Commit 51c1e9b554c9 ("auxdisplay: Move panel.c to drivers/auxdisplay folder") moved the file, but the MAINTAINERS reference was not updated. 
Link: https://lore.kernel.org/lkml/20180928220131.31075-1-joe@perches.com/ Reported-by: Joe Perches Signed-off-by: Miguel Ojeda --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 02a39617ec82..e6fde62a1d65 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10949,7 +10949,7 @@ M: Willy Tarreau M: Ksenija Stanojevic S: Odd Fixes F: Documentation/auxdisplay/lcd-panel-cgram.txt -F: drivers/misc/panel.c +F: drivers/auxdisplay/panel.c PARALLEL PORT SUBSYSTEM M: Sudip Mukherjee From 17b57b1883c1285f3d0dc2266e8f79286a7bef38 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 30 Sep 2018 07:15:35 -0700 Subject: [PATCH 355/499] Linux 4.19-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0c90c4354979..6c3da3e10f07 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Merciless Moray # *DOCUMENTATION* From 41e270f6898e7502be9fd6920ee0a108ca259d36 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 17 Sep 2018 04:14:54 +0000 Subject: [PATCH 356/499] Drivers: hv: vmbus: Use get/put_cpu() in vmbus_connect() With CONFIG_DEBUG_PREEMPT=y, I always see this warning: BUG: using smp_processor_id() in preemptible [00000000] Fix the false warning by using get/put_cpu(). Here vmbus_connect() sends a message to the host and waits for the host's response. The host will deliver the response message and an interrupt on CPU msg->target_vcpu, and later the interrupt handler will wake up vmbus_connect(). vmbus_connect() doesn't really have to run on the same cpu as CPU msg->target_vcpu, so it's safe to call put_cpu() just here. Signed-off-by: Dexuan Cui Cc: stable@vger.kernel.org Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/connection.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index ced041899456..f4d08c8ac7f8 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -76,6 +76,7 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, __u32 version) { int ret = 0; + unsigned int cur_cpu; struct vmbus_channel_initiate_contact *msg; unsigned long flags; @@ -118,9 +119,10 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, * the CPU attempting to connect may not be CPU 0. */ if (version >= VERSION_WIN8_1) { - msg->target_vcpu = - hv_cpu_number_to_vp_number(smp_processor_id()); - vmbus_connection.connect_cpu = smp_processor_id(); + cur_cpu = get_cpu(); + msg->target_vcpu = hv_cpu_number_to_vp_number(cur_cpu); + vmbus_connection.connect_cpu = cur_cpu; + put_cpu(); } else { msg->target_vcpu = 0; vmbus_connection.connect_cpu = 0; From 34bd283396af1f05c43fa64d51378d38affb1cf1 Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Wed, 12 Sep 2018 09:43:23 -0500 Subject: [PATCH 357/499] fpga: do not access region struct after fpga_region_unregister A couple drivers were accessing the region struct after it had been freed. Save off the pointer to the mgr before the region struct gets freed. 
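The problematic pattern, shown here as a minimal sketch that mirrors the hunks below, was roughly:

	fpga_region_unregister(region);	/* may free the region struct */
	fpga_mgr_put(region->mgr);	/* ...then dereferences it */

so the fix is simply to cache the manager pointer before unregistering:

	struct fpga_manager *mgr = region->mgr;

	fpga_region_unregister(region);
	fpga_mgr_put(mgr);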
Signed-off-by: Alan Tull Acked-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/dfl-fme-region.c | 4 +++- drivers/fpga/of-fpga-region.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/fpga/dfl-fme-region.c b/drivers/fpga/dfl-fme-region.c index 0b7e19c27c6d..51a5ac2293a7 100644 --- a/drivers/fpga/dfl-fme-region.c +++ b/drivers/fpga/dfl-fme-region.c @@ -14,6 +14,7 @@ */ #include +#include #include #include "dfl-fme-pr.h" @@ -66,9 +67,10 @@ static int fme_region_probe(struct platform_device *pdev) static int fme_region_remove(struct platform_device *pdev) { struct fpga_region *region = dev_get_drvdata(&pdev->dev); + struct fpga_manager *mgr = region->mgr; fpga_region_unregister(region); - fpga_mgr_put(region->mgr); + fpga_mgr_put(mgr); return 0; } diff --git a/drivers/fpga/of-fpga-region.c b/drivers/fpga/of-fpga-region.c index 35fabb8083fb..052a1342ab7e 100644 --- a/drivers/fpga/of-fpga-region.c +++ b/drivers/fpga/of-fpga-region.c @@ -437,9 +437,10 @@ static int of_fpga_region_probe(struct platform_device *pdev) static int of_fpga_region_remove(struct platform_device *pdev) { struct fpga_region *region = platform_get_drvdata(pdev); + struct fpga_manager *mgr = region->mgr; fpga_region_unregister(region); - fpga_mgr_put(region->mgr); + fpga_mgr_put(mgr); return 0; } From c2d68afba86d1ff01e7300c68bc16a9234dcd8e9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Sep 2018 04:14:55 +0000 Subject: [PATCH 358/499] tools: hv: fcopy: set 'error' in case an unknown operation was requested 'error' variable is left uninitialized in case we see an unknown operation. As we don't immediately return and proceed to pwrite() we need to set it to something, HV_E_FAIL sounds good enough. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Cc: stable Signed-off-by: Greg Kroah-Hartman --- tools/hv/hv_fcopy_daemon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c index d78aed86af09..8ff8cb1a11f4 100644 --- a/tools/hv/hv_fcopy_daemon.c +++ b/tools/hv/hv_fcopy_daemon.c @@ -234,6 +234,7 @@ int main(int argc, char *argv[]) break; default: + error = HV_E_FAIL; syslog(LOG_ERR, "Unknown operation: %d", buffer.hdr.operation); From b4d9a0e5ca139640a45bcda87bc718f85d8008af Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Wed, 12 Sep 2018 09:43:24 -0500 Subject: [PATCH 359/499] fpga: bridge: fix obvious function documentation error fpga_bridge_dev_match() returns a FPGA bridge struct, not a FPGA manager struct so s/manager/bridge/. Signed-off-by: Alan Tull Acked-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/fpga-bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/fpga/fpga-bridge.c b/drivers/fpga/fpga-bridge.c index 24b8f98b73ec..c983dac97501 100644 --- a/drivers/fpga/fpga-bridge.c +++ b/drivers/fpga/fpga-bridge.c @@ -125,7 +125,7 @@ static int fpga_bridge_dev_match(struct device *dev, const void *data) * * Given a device, get an exclusive reference to a fpga bridge. * - * Return: fpga manager struct or IS_ERR() condition containing error code. + * Return: fpga bridge struct or IS_ERR() condition containing error code. 
*/ struct fpga_bridge *fpga_bridge_get(struct device *dev, struct fpga_image_info *info) From 492ecf6d6598e7c3b0b1372653ed21da50ddfb09 Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Wed, 12 Sep 2018 09:43:25 -0500 Subject: [PATCH 360/499] docs: fpga: document fpga manager flags Add flags #defines to kerneldoc documentation in a useful place. Signed-off-by: Alan Tull Acked-by: Moritz Fischer Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/fpga/fpga-mgr.rst | 5 +++++ include/linux/fpga/fpga-mgr.h | 20 ++++++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/Documentation/driver-api/fpga/fpga-mgr.rst b/Documentation/driver-api/fpga/fpga-mgr.rst index 4b3825da48d9..82b6dbbd31cd 100644 --- a/Documentation/driver-api/fpga/fpga-mgr.rst +++ b/Documentation/driver-api/fpga/fpga-mgr.rst @@ -184,6 +184,11 @@ API for implementing a new FPGA Manager driver API for programming an FPGA --------------------------- +FPGA Manager flags + +.. kernel-doc:: include/linux/fpga/fpga-mgr.h + :doc: FPGA Manager flags + .. kernel-doc:: include/linux/fpga/fpga-mgr.h :functions: fpga_image_info diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index 8942e61f0028..8ab5df769923 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -53,12 +53,20 @@ enum fpga_mgr_states { FPGA_MGR_STATE_OPERATING, }; -/* - * FPGA Manager flags - * FPGA_MGR_PARTIAL_RECONFIG: do partial reconfiguration if supported - * FPGA_MGR_EXTERNAL_CONFIG: FPGA has been configured prior to Linux booting - * FPGA_MGR_BITSTREAM_LSB_FIRST: SPI bitstream bit order is LSB first - * FPGA_MGR_COMPRESSED_BITSTREAM: FPGA bitstream is compressed +/** + * DOC: FPGA Manager flags + * + * Flags used in the &fpga_image_info->flags field + * + * %FPGA_MGR_PARTIAL_RECONFIG: do partial reconfiguration if supported + * + * %FPGA_MGR_EXTERNAL_CONFIG: FPGA has been configured prior to Linux booting + * + * %FPGA_MGR_ENCRYPTED_BITSTREAM: indicates bitstream is encrypted + * + * %FPGA_MGR_BITSTREAM_LSB_FIRST: SPI bitstream bit order is LSB first + * + * %FPGA_MGR_COMPRESSED_BITSTREAM: FPGA bitstream is compressed */ #define FPGA_MGR_PARTIAL_RECONFIG BIT(0) #define FPGA_MGR_EXTERNAL_CONFIG BIT(1) From 7012040576c6ae25a47035659ee48673612c2c27 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 19 Sep 2018 18:09:38 -0700 Subject: [PATCH 361/499] firmware: Always initialize the fw_priv list object When freeing the fw_priv the item is taken off the list. This causes an oops in the FW_OPT_NOCACHE case as the list object is not initialized. Make sure to initialize the list object regardless of this flag. Fixes: 422b3db2a503 ("firmware: Fix security issue with request_firmware_into_buf()") Cc: stable@vger.kernel.org Cc: Rishabh Bhatnagar Signed-off-by: Bjorn Andersson Reviewed-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/base/firmware_loader/main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index b3c0498ee433..8e9213b36e31 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -226,8 +226,11 @@ static int alloc_lookup_fw_priv(const char *fw_name, } tmp = __allocate_fw_priv(fw_name, fwc, dbuf, size); - if (tmp && !(opt_flags & FW_OPT_NOCACHE)) - list_add(&tmp->list, &fwc->head); + if (tmp) { + INIT_LIST_HEAD(&tmp->list); + if (!(opt_flags & FW_OPT_NOCACHE)) + list_add(&tmp->list, &fwc->head); + } spin_unlock(&fwc->lock); *fw_priv = tmp; From bac6f6cda206ad7cbe0c73c35e494377ce9c4749 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Sep 2018 15:17:50 -0700 Subject: [PATCH 362/499] pstore/ram: Fix failure-path memory leak in ramoops_init As reported by nixiaoming, with some minor clarifications: 1) memory leak in ramoops_register_dummy(): dummy_data = kzalloc(sizeof(*dummy_data), GFP_KERNEL); but no kfree() if platform_device_register_data() fails. 2) memory leak in ramoops_init(): Missing platform_device_unregister(dummy) and kfree(dummy_data) if platform_driver_register(&ramoops_driver) fails. I've clarified the purpose of ramoops_register_dummy(), and added a common cleanup routine for all three failure paths to call. Reported-by: nixiaoming Cc: stable@vger.kernel.org Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: Joel Fernandes Cc: Geliang Tang Signed-off-by: Kees Cook --- fs/pstore/ram.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index bbd1e357c23d..f4fd2e72add4 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -898,8 +898,22 @@ static struct platform_driver ramoops_driver = { }, }; -static void ramoops_register_dummy(void) +static inline void ramoops_unregister_dummy(void) { + platform_device_unregister(dummy); + dummy = NULL; + + kfree(dummy_data); + dummy_data = NULL; +} + +static void __init ramoops_register_dummy(void) +{ + /* + * Prepare a dummy platform data structure to carry the module + * parameters. If mem_size isn't set, then there are no module + * parameters, and we can skip this. + */ if (!mem_size) return; @@ -932,21 +946,28 @@ static void ramoops_register_dummy(void) if (IS_ERR(dummy)) { pr_info("could not create platform device: %ld\n", PTR_ERR(dummy)); + dummy = NULL; + ramoops_unregister_dummy(); } } static int __init ramoops_init(void) { + int ret; + ramoops_register_dummy(); - return platform_driver_register(&ramoops_driver); + ret = platform_driver_register(&ramoops_driver); + if (ret != 0) + ramoops_unregister_dummy(); + + return ret; } late_initcall(ramoops_init); static void __exit ramoops_exit(void) { platform_driver_unregister(&ramoops_driver); - platform_device_unregister(dummy); - kfree(dummy_data); + ramoops_unregister_dummy(); } module_exit(ramoops_exit); From e55ec4ddbef9897199c307dfb23167e3801fdaf5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 1 Oct 2018 08:11:07 +1000 Subject: [PATCH 363/499] xfs: fix error handling in xfs_bmap_extents_to_btree Commit 01239d77b9dd ("xfs: fix a null pointer dereference in xfs_bmap_extents_to_btree") attempted to fix a null pointer dreference when a fuzzing corruption of some kind was found. 
This fix was flawed, resulting in assert failures like: XFS: Assertion failed: ifp->if_broot == NULL, file: fs/xfs/libxfs/xfs_bmap.c, line: 715 ..... Call Trace: xfs_bmap_extents_to_btree+0x6b9/0x7b0 __xfs_bunmapi+0xae7/0xf00 ? xfs_log_reserve+0x1c8/0x290 xfs_reflink_remap_extent+0x20b/0x620 xfs_reflink_remap_blocks+0x7e/0x290 xfs_reflink_remap_range+0x311/0x530 vfs_dedupe_file_range_one+0xd7/0xe0 vfs_dedupe_file_range+0x15b/0x1a0 do_vfs_ioctl+0x267/0x6c0 The problem is that the error handling code now asserts that the inode fork is not in btree format before the error handling code undoes the modifications that put the fork back in extent format. Fix this by moving the assert back to after the xfs_iroot_realloc() call that returns the fork to extent format, and clean up the jump labels to be meaningful. Also, returning ENOSPC when xfs_btree_get_bufl() fails to instantiate the buffer that was allocated (the actual fix in the commit mentioned above) is incorrect. This is a fatal error - only an invalid block address or a filesystem shutdown can result in failing to get a buffer here. Hence change this to EFSCORRUPTED so that the higher layer knows this was a corruption related failure and should not treat it as an ENOSPC error. This should result in a shutdown (via cancelling a dirty transaction) which is necessary as we do not attempt to clean up the (invalid) block that we have already allocated. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2760314fdf7f..a47670332326 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); /* - * Make space in the inode incore. + * Make space in the inode incore. This needs to be undone if we fail + * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); ifp->if_flags |= XFS_IFBROOT; @@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree( args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - ASSERT(ifp->if_broot == NULL); - goto err1; - } + error = xfs_alloc_vextent(&args); + if (error) + goto out_root_realloc; if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - ASSERT(ifp->if_broot == NULL); error = -ENOSPC; - goto err1; + goto out_root_realloc; } + /* * Allocation can't fail, the space was reserved. */ @@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); if (!abp) { - error = -ENOSPC; - goto err2; + error = -EFSCORRUPTED; + goto out_unreserve_dquot; } + /* * Fill in the child block. 
*/ @@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree( *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; -err2: +out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); -err1: +out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; From 08d9db00fe0e300d6df976e6c294f974988226dd Mon Sep 17 00:00:00 2001 From: Edgar Cherkasov Date: Thu, 27 Sep 2018 11:56:03 +0300 Subject: [PATCH 364/499] i2c: i2c-scmi: fix for i2c_smbus_write_block_data The i2c-scmi driver crashes when the SMBus Write Block transaction is executed: WARNING: CPU: 9 PID: 2194 at mm/page_alloc.c:3931 __alloc_pages_slowpath+0x9db/0xec0 Call Trace: ? get_page_from_freelist+0x49d/0x11f0 ? alloc_pages_current+0x6a/0xe0 ? new_slab+0x499/0x690 __alloc_pages_nodemask+0x265/0x280 alloc_pages_current+0x6a/0xe0 kmalloc_order+0x18/0x40 kmalloc_order_trace+0x24/0xb0 ? acpi_ut_allocate_object_desc_dbg+0x62/0x10c __kmalloc+0x203/0x220 acpi_os_allocate_zeroed+0x34/0x36 acpi_ut_copy_eobject_to_iobject+0x266/0x31e acpi_evaluate_object+0x166/0x3b2 acpi_smbus_cmi_access+0x144/0x530 [i2c_scmi] i2c_smbus_xfer+0xda/0x370 i2cdev_ioctl_smbus+0x1bd/0x270 i2cdev_ioctl+0xaa/0x250 do_vfs_ioctl+0xa4/0x600 SyS_ioctl+0x79/0x90 do_syscall_64+0x73/0x130 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 ACPI Error: Evaluating _SBW: 4 (20170831/smbus_cmi-185) This problem occurs because the length of ACPI Buffer object is not defined/initialized in the code before a corresponding ACPI method is called. The obvious patch below fixes this issue. Signed-off-by: Edgar Cherkasov Acked-by: Viktor Krasnov Acked-by: Michael Brunner Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-scmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-scmi.c b/drivers/i2c/busses/i2c-scmi.c index a01389b85f13..7e9a2bbf5ddc 100644 --- a/drivers/i2c/busses/i2c-scmi.c +++ b/drivers/i2c/busses/i2c-scmi.c @@ -152,6 +152,7 @@ acpi_smbus_cmi_access(struct i2c_adapter *adap, u16 addr, unsigned short flags, mt_params[3].type = ACPI_TYPE_INTEGER; mt_params[3].integer.value = len; mt_params[4].type = ACPI_TYPE_BUFFER; + mt_params[4].buffer.length = len; mt_params[4].buffer.pointer = data->block + 1; } break; From 1feda5eb77fc279a4c4eb93b9a0ac3bf7254b57a Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 28 Sep 2018 18:09:23 +0200 Subject: [PATCH 365/499] drm/exynos: Use selected dma_dev default iommu domain instead of a fake one Instead of allocating a fake IOMMU domain for all Exynos DRM components, simply reuse the default IOMMU domain of the already selected DMA device. This allows some design changes in IOMMU framework without breaking IOMMU support in Exynos DRM. 
Signed-off-by: Marek Szyprowski Reviewed-by: Robin Murphy Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_iommu.h | 34 ++++------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_iommu.h b/drivers/gpu/drm/exynos/exynos_drm_iommu.h index 87f6b5672e11..797d9ee5f15a 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_iommu.h +++ b/drivers/gpu/drm/exynos/exynos_drm_iommu.h @@ -55,37 +55,12 @@ static inline void __exynos_iommu_detach(struct exynos_drm_private *priv, static inline int __exynos_iommu_create_mapping(struct exynos_drm_private *priv, unsigned long start, unsigned long size) { - struct iommu_domain *domain; - int ret; - - domain = iommu_domain_alloc(priv->dma_dev->bus); - if (!domain) - return -ENOMEM; - - ret = iommu_get_dma_cookie(domain); - if (ret) - goto free_domain; - - ret = iommu_dma_init_domain(domain, start, size, NULL); - if (ret) - goto put_cookie; - - priv->mapping = domain; + priv->mapping = iommu_get_domain_for_dev(priv->dma_dev); return 0; - -put_cookie: - iommu_put_dma_cookie(domain); -free_domain: - iommu_domain_free(domain); - return ret; } static inline void __exynos_iommu_release_mapping(struct exynos_drm_private *priv) { - struct iommu_domain *domain = priv->mapping; - - iommu_put_dma_cookie(domain); - iommu_domain_free(domain); priv->mapping = NULL; } @@ -94,7 +69,9 @@ static inline int __exynos_iommu_attach(struct exynos_drm_private *priv, { struct iommu_domain *domain = priv->mapping; - return iommu_attach_device(domain, dev); + if (dev != priv->dma_dev) + return iommu_attach_device(domain, dev); + return 0; } static inline void __exynos_iommu_detach(struct exynos_drm_private *priv, @@ -102,7 +79,8 @@ static inline void __exynos_iommu_detach(struct exynos_drm_private *priv, { struct iommu_domain *domain = priv->mapping; - iommu_detach_device(domain, dev); + if (dev != priv->dma_dev) + iommu_detach_device(domain, dev); } #else #error Unsupported architecture and IOMMU/DMA-mapping glue code From 848e616e66d4592fe9afc40743d3504deb7632b4 Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Sun, 30 Sep 2018 12:53:00 +0200 Subject: [PATCH 366/499] cfg80211: fix wext-compat memory leak cfg80211_wext_giwrate and sinfo.pertid might allocate sinfo.pertid via rdev_get_station(), but never release it. Fix that. Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info") Signed-off-by: Stefan Seyfried [johannes: fix error path, use cfg80211_sinfo_release_content(), add Fixes] Signed-off-by: Johannes Berg --- net/wireless/wext-compat.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 167f7025ac98..06943d9c9835 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1278,12 +1278,16 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) - return -EOPNOTSUPP; + if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { + err = -EOPNOTSUPP; + goto free; + } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); - return 0; +free: + cfg80211_sinfo_release_content(&sinfo); + return err; } /* Get wireless statistics. 
Called by /proc/net/wireless and by SIOCGIWSTATS */ @@ -1293,7 +1297,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; - static struct station_info sinfo; + static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) @@ -1352,6 +1356,8 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; + cfg80211_sinfo_release_content(&sinfo); + return &wstats; } From 211710ca74adf790b46ab3867fcce8047b573cd1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 29 Sep 2018 16:01:58 +0200 Subject: [PATCH 367/499] mac80211: fix setting IEEE80211_KEY_FLAG_RX_MGMT for AP mode keys key->sta is only valid after ieee80211_key_link, which is called later in this function. Because of that, the IEEE80211_KEY_FLAG_RX_MGMT is never set when management frame protection is enabled. Fixes: e548c49e6dc6b ("mac80211: add key flag for management keys") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d25da0e66da1..5d22eda8a6b1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -427,7 +427,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: /* Keys without a station are used for TX only */ - if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP)) + if (sta && test_sta_flag(sta, WLAN_STA_MFP)) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; break; case NL80211_IFTYPE_ADHOC: From 1db58529454742f67ebd96e3588315e880b72837 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 27 Sep 2018 17:05:04 -0600 Subject: [PATCH 368/499] cfg80211: fix use-after-free in reg_process_hint() reg_process_hint_country_ie() can free regulatory_request and return REG_REQ_ALREADY_SET. We shouldn't use regulatory_request after it's called. KASAN error was observed when this happens. BUG: KASAN: use-after-free in reg_process_hint+0x839/0x8aa [cfg80211] Read of size 4 at addr ffff8800c430d434 by task kworker/1:3/89 Workqueue: events reg_todo [cfg80211] Call Trace: dump_stack+0xc1/0x10c ? _atomic_dec_and_lock+0x1ad/0x1ad ? _raw_spin_lock_irqsave+0xa0/0xd2 print_address_description+0x86/0x26f ? reg_process_hint+0x839/0x8aa [cfg80211] kasan_report+0x241/0x29b reg_process_hint+0x839/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 ? worker_detach_from_pool+0x1b5/0x1b5 ? _raw_spin_unlock_irq+0x65/0xdd ? _raw_spin_unlock_irqrestore+0xf3/0xf3 worker_thread+0x5dd/0x841 ? kthread_parkme+0x1d/0x1d kthread+0x270/0x285 ? pr_cont_work+0xe3/0xe3 ? 
rcu_read_unlock_sched_notrace+0xca/0xca ret_from_fork+0x22/0x40 Allocated by task 2718: set_track+0x63/0xfa __kmalloc+0x119/0x1ac regulatory_hint_country_ie+0x38/0x329 [cfg80211] __cfg80211_connect_result+0x854/0xadd [cfg80211] cfg80211_rx_assoc_resp+0x3bc/0x4f0 [cfg80211] smsc95xx v1.0.6 ieee80211_sta_rx_queued_mgmt+0x1803/0x7ed5 [mac80211] ieee80211_iface_work+0x411/0x696 [mac80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Freed by task 89: set_track+0x63/0xfa kasan_slab_free+0x6a/0x87 kfree+0xdc/0x470 reg_process_hint+0x31e/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Signed-off-by: Yu Zhao Signed-off-by: Johannes Berg --- net/wireless/reg.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 765dedb12361..24cfa2776f50 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2661,11 +2661,12 @@ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum reg_request_treatment treatment; + enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - switch (reg_request->initiator) { + switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; @@ -2683,7 +2684,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: - WARN(1, "invalid initiator %d\n", reg_request->initiator); + WARN(1, "invalid initiator %d\n", initiator); goto out_free; } @@ -2698,7 +2699,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - wiphy_update_regulatory(wiphy, reg_request->initiator); + wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } From d26c25a9d19b5976b319af528886f89cf455692d Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 27 Sep 2018 16:53:21 +0100 Subject: [PATCH 369/499] arm64: KVM: Tighten guest core register access from userspace We currently allow userspace to access the core register file in about any possible way, including straddling multiple registers and doing unaligned accesses. This is not the expected use of the ABI, and nobody is actually using it that way. Let's tighten it by explicitly checking the size and alignment for each field of the register file. 
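For context, userspace reaches these accessors through KVM_GET_ONE_REG/KVM_SET_ONE_REG with core register ids. A minimal sketch of a well-formed access (illustrative only; uses the uapi macros, error handling omitted, and vcpu_fd is assumed to be an open vCPU file descriptor):

	__u64 pstate;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE |
			KVM_REG_ARM_CORE_REG(regs.pstate),
		.addr = (__u64)&pstate,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);

Ids that straddle registers, or whose KVM_REG_SIZE does not match the field being addressed, are now rejected with -EINVAL instead of being serviced as raw offsets into struct kvm_regs.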
Cc: Fixes: 2f4a07c5f9fe ("arm64: KVM: guest one-reg interface") Reviewed-by: Christoffer Dall Reviewed-by: Mark Rutland Signed-off-by: Dave Martin [maz: rewrote Dave's initial patch to be more easily backported] Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kvm/guest.c | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 07256b08226c..3088463bafc1 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -57,6 +57,45 @@ static u64 core_reg_offset_from_id(u64 id) return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } +static int validate_core_offset(const struct kvm_one_reg *reg) +{ + u64 off = core_reg_offset_from_id(reg->id); + int size; + + switch (off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + case KVM_REG_ARM_CORE_REG(regs.sp): + case KVM_REG_ARM_CORE_REG(regs.pc): + case KVM_REG_ARM_CORE_REG(regs.pstate): + case KVM_REG_ARM_CORE_REG(sp_el1): + case KVM_REG_ARM_CORE_REG(elr_el1): + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + size = sizeof(__u64); + break; + + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + size = sizeof(__uint128_t); + break; + + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + size = sizeof(__u32); + break; + + default: + return -EINVAL; + } + + if (KVM_REG_SIZE(reg->id) == size && + IS_ALIGNED(off, size / sizeof(__u32))) + return 0; + + return -EINVAL; +} + static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* @@ -76,6 +115,9 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; + if (validate_core_offset(reg)) + return -EINVAL; + if (copy_to_user(uaddr, ((u32 *)regs) + off, KVM_REG_SIZE(reg->id))) return -EFAULT; @@ -98,6 +140,9 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs) return -ENOENT; + if (validate_core_offset(reg)) + return -EINVAL; + if (KVM_REG_SIZE(reg->id) > sizeof(tmp)) return -EINVAL; From 2a3f93459d689d990b3ecfbe782fec89b97d3279 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 27 Sep 2018 16:53:22 +0100 Subject: [PATCH 370/499] arm64: KVM: Sanitize PSTATE.M when being set from userspace Not all execution modes are valid for a guest, and some of them depend on what the HW actually supports. Let's verify that what userspace provides is compatible with both the VM settings and the HW capabilities. 
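Concretely (illustrative examples, not from the original commit message): writing regs.pstate with PSR_AA32_MODE_SVC on a vCPU that was not initialised as 32-bit now fails with -EINVAL, as does PSR_MODE_EL1h on a 32-bit vCPU, or PSR_AA32_MODE_USR on a system without 32-bit EL0 support.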
Cc: Fixes: 0d854a60b1d7 ("arm64: KVM: enable initialization of a 32bit vcpu") Reviewed-by: Christoffer Dall Reviewed-by: Mark Rutland Reviewed-by: Dave Martin Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kvm/guest.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3088463bafc1..a6c9fbaeaefc 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -152,17 +152,25 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) { - u32 mode = (*(u32 *)valp) & PSR_AA32_MODE_MASK; + u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK; switch (mode) { case PSR_AA32_MODE_USR: + if (!system_supports_32bit_el0()) + return -EINVAL; + break; case PSR_AA32_MODE_FIQ: case PSR_AA32_MODE_IRQ: case PSR_AA32_MODE_SVC: case PSR_AA32_MODE_ABT: case PSR_AA32_MODE_UND: + if (!vcpu_el1_is_32bit(vcpu)) + return -EINVAL; + break; case PSR_MODE_EL0t: case PSR_MODE_EL1t: case PSR_MODE_EL1h: + if (vcpu_el1_is_32bit(vcpu)) + return -EINVAL; break; default: err = -EINVAL; From 5f76f6f5ff96587af5acd5930f7d9fea81e0d1a8 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Fri, 14 Sep 2018 03:25:52 +0300 Subject: [PATCH 371/499] KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled Before this commit, KVM exposes MPX VMX controls to L1 guest only based on if KVM and host processor supports MPX virtualization. However, these controls should be exposed to guest only in case guest vCPU supports MPX. Without this change, a L1 guest running with kernel which don't have commit 691bd4340bef ("kvm: vmx: allow host to access guest MSR_IA32_BNDCFGS") asserts in QEMU on the following: qemu-kvm: error: failed to set MSR 0xd90 to 0x0 qemu-kvm: .../qemu-2.10.0/target/i386/kvm.c:1801 kvm_put_msrs: Assertion 'ret == cpu->kvm_msr_buf->nmsrs failed' This is because L1 KVM kvm_init_msr_list() will see that vmx_mpx_supported() (As it only checks MPX VMX controls support) and therefore KVM_GET_MSR_INDEX_LIST IOCTL will include MSR_IA32_BNDCFGS. However, later when L1 will attempt to set this MSR via KVM_SET_MSRS IOCTL, it will fail because !guest_cpuid_has_mpx(vcpu). Therefore, fix the issue by exposing MPX VMX controls to L1 guest only when vCPU supports MPX. Fixes: 36be0b9deb23 ("KVM: x86: Add nested virtualization support for MPX") Reported-by: Eyal Moscovici Reviewed-by: Nikita Leshchenko Reviewed-by: Darren Kenny Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8b066480224b..d32e9c5e743b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3525,9 +3525,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (kvm_mpx_supported()) - msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; @@ -3544,8 +3541,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_ENTRY_LOAD_IA32_PAT; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (kvm_mpx_supported()) - msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. 
*/ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -11237,6 +11232,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (kvm_mpx_supported()) { + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); + + if (mpx_enabled) { + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + } else { + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; + } + } +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -11253,8 +11265,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (nested_vmx_allowed(vcpu)) + if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); + nested_vmx_entry_exit_ctls_update(vcpu); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) From 503234b3fdcaa578395c07e393ea3e5d13958824 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Fri, 14 Sep 2018 03:25:53 +0300 Subject: [PATCH 372/499] KVM: x86: Do not use kvm_x86_ops->mpx_supported() directly Commit a87036add092 ("KVM: x86: disable MPX if host did not enable MPX XSAVE features") introduced kvm_mpx_supported() to return true iff MPX is enabled in the host. However, that commit seems to have missed replacing some calls to kvm_x86_ops->mpx_supported() to kvm_mpx_supported(). Complete original commit by replacing remaining calls to kvm_mpx_supported(). Fixes: a87036add092 ("KVM: x86: disable MPX if host did not enable MPX XSAVE features") Suggested-by: Sean Christopherson Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d32e9c5e743b..5bb2acd8a52c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12036,7 +12036,7 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); if (enable_vpid) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index edbf00ec56b3..ca717737347e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4698,7 +4698,7 @@ static void kvm_init_msr_list(void) */ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: - if (!kvm_x86_ops->mpx_supported()) + if (!kvm_mpx_supported()) continue; break; case MSR_TSC_AUX: From 62cf9bd8118c4009f02c477ef78c723f49e53e16 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Fri, 14 Sep 2018 03:25:54 +0300 Subject: [PATCH 373/499] KVM: nVMX: Fix emulation of VM_ENTRY_LOAD_BNDCFGS L2 IA32_BNDCFGS should be updated with vmcs12->guest_bndcfgs only when VM_ENTRY_LOAD_BNDCFGS is specified in vmcs12->vm_entry_controls. Otherwise, L2 IA32_BNDCFGS should be set to vmcs01->guest_bndcfgs which is L1 IA32_BNDCFGS. 
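In other words (sketch only, not the kernel's code), the value that should end up in L2's GUEST_BNDCFGS on nested VM-entry is:

/* Illustrative helper: which BNDCFGS value L2 should start with. */
static u64 l2_bndcfgs_on_entry(const struct vmcs12 *vmcs12, u64 vmcs01_guest_bndcfgs)
{
	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
		return vmcs12->guest_bndcfgs;	/* L1 asked to load a value */

	return vmcs01_guest_bndcfgs;		/* otherwise keep L1's own BNDCFGS */
}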
Reviewed-by: Nikita Leshchenko Reviewed-by: Darren Kenny Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5bb2acd8a52c..b16b2664cfe1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -856,6 +856,7 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; u16 vpid02; u16 last_vpid; @@ -12036,8 +12037,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - if (kvm_mpx_supported()) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -12591,6 +12597,9 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); From fe804cd6774938814242aa67bb7e7cbc934b1203 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Tue, 25 Sep 2018 09:19:36 +0200 Subject: [PATCH 374/499] tools/kvm_stat: cut down decimal places in update interval dialog We currently display the default number of decimal places for floats in _show_set_update_interval(), which is quite pointless. Cutting down to a single decimal place. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 439b8a27488d..195ba486640f 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1325,7 +1325,7 @@ class Tui(object): msg = '' while True: self.screen.erase() - self.screen.addstr(0, 0, 'Set update interval (defaults to %fs).' % + self.screen.addstr(0, 0, 'Set update interval (defaults to %.1fs).' % DELAY_DEFAULT, curses.A_BOLD) self.screen.addstr(4, 0, msg) self.screen.addstr(2, 0, 'Change delay from %.1fs to ' % From daa07cbc9ae3da2d61b7ce900c0b9107d134f2c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 25 Sep 2018 13:20:00 -0700 Subject: [PATCH 375/499] KVM: x86: fix L1TF's MMIO GFN calculation One defense against L1TF in KVM is to always set the upper five bits of the *legal* physical address in the SPTEs for non-present and reserved SPTEs, e.g. MMIO SPTEs. In the MMIO case, the GFN of the MMIO SPTE may overlap with the upper five bits that are being usurped to defend against L1TF. To preserve the GFN, the bits of the GFN that overlap with the repurposed bits are shifted left into the reserved bits, i.e. the GFN in the SPTE will be split into high and low parts. When retrieving the GFN from the MMIO SPTE, e.g. to check for an MMIO access, get_mmio_spte_gfn() unshifts the affected bits and restores the original GFN for comparison. 
Unfortunately, get_mmio_spte_gfn() neglects to mask off the reserved bits in the SPTE that were used to store the upper chunk of the GFN. As a result, KVM fails to detect MMIO accesses whose GPA overlaps the repurprosed bits, which in turn causes guest panics and hangs. Fix the bug by generating a mask that covers the lower chunk of the GFN, i.e. the bits that aren't shifted by the L1TF mitigation. The alternative approach would be to explicitly zero the five reserved bits that are used to store the upper chunk of the GFN, but that requires additional run-time computation and makes an already-ugly bit of code even more inscrutable. I considered adding a WARN_ON_ONCE(low_phys_bits-1 <= PAGE_SHIFT) to warn if GENMASK_ULL() generated a nonsensical value, but that seemed silly since that would mean a system that supports VMX has less than 18 bits of physical address space... Reported-by: Sakari Ailus Fixes: d9b47449c1a1 ("kvm: x86: Set highest physical address bits in non-present/reserved SPTEs") Cc: Junaid Shahid Cc: Jim Mattson Cc: stable@vger.kernel.org Reviewed-by: Junaid Shahid Tested-by: Sakari Ailus Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d7e9bce6ff61..51b953ad9d4e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -249,6 +249,17 @@ static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; */ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + + static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -357,9 +368,7 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | - shadow_nonpresent_or_rsvd_mask; - u64 gpa = spte & ~mask; + u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) & shadow_nonpresent_or_rsvd_mask; @@ -423,6 +432,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static void kvm_mmu_reset_all_pte_masks(void) { + u8 low_phys_bits; + shadow_user_mask = 0; shadow_accessed_mask = 0; shadow_dirty_mask = 0; @@ -437,12 +448,17 @@ static void kvm_mmu_reset_all_pte_masks(void) * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. 
*/ + low_phys_bits = boot_cpu_data.x86_phys_bits; if (boot_cpu_data.x86_phys_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) + 52 - shadow_nonpresent_or_rsvd_mask_len) { shadow_nonpresent_or_rsvd_mask = rsvd_bits(boot_cpu_data.x86_phys_bits - shadow_nonpresent_or_rsvd_mask_len, boot_cpu_data.x86_phys_bits - 1); + low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; + } + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } static int is_cpuid_PSE36(void) From bda6b1c95751bbf1779562b8eca3c20b270c0f53 Mon Sep 17 00:00:00 2001 From: Anusha Srivatsa Date: Fri, 17 Aug 2018 10:33:30 -0700 Subject: [PATCH 376/499] drm/i915: Do not redefine the has_csr parameter. Let us reuse the already defined has_csr check and not redefine it. The main difference is that in effect this will flip .has_csr to 1 (via GEN9_FEATURES which GEN11_FEATURES pulls in). Suggested-by: Imre Deak Cc: Imre Deak Cc: Rodrigo Vivi Signed-off-by: Anusha Srivatsa Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=107382 Reviewed-by: Imre Deak Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/1534527210-16841-1-git-send-email-anusha.srivatsa@intel.com (cherry picked from commit da4468a1aa75457e6134127b19761b7ba62ce945) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 6a4d1388ad2d..1df3ce134cd0 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -592,7 +592,6 @@ static const struct intel_device_info intel_cannonlake_info = { GEN10_FEATURES, \ GEN(11), \ .ddb_size = 2048, \ - .has_csr = 0, \ .has_logical_ring_elsq = 1 static const struct intel_device_info intel_icelake_11_info = { From 4ca8ca9fe7dc792000c3762de5081a4d6dc33667 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 26 Sep 2018 11:47:18 +0100 Subject: [PATCH 377/499] drm/i915: Avoid compiler warning for maybe unused gu_misc_iir /kisskb/src/drivers/gpu/drm/i915/i915_irq.c: warning: 'gu_misc_iir' may be used uninitialized in this function [-Wuninitialized]: => 3120:10 Silence the compiler warning by ensuring that the local variable is initialised and removing the guard that is confusing the older gcc. 
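Reduced to a generic sketch (the bit and the register value below are placeholders, not i915's), the shape of the change is:

/* Old shape: the out-parameter is written only when the IRQ bit is set,
 * so older gcc assumes the caller's local may be read uninitialized. */
static void ack_old(u32 master_ctl, u32 *iir)
{
	if (!(master_ctl & BIT(0)))
		return;
	*iir = 0x1234;		/* stand-in for the IIR register read */
}

/* New shape: always produce a value, 0 when nothing is pending, so the
 * caller's variable is always initialized and the guard can go away. */
static u32 ack_new(u32 master_ctl)
{
	if (!(master_ctl & BIT(0)))
		return 0;
	return 0x1234;		/* stand-in for the IIR register read */
}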
Reported-by: Geert Uytterhoeven Fixes: df0d28c185ad ("drm/i915/icl: GSE interrupt moves from DE_MISC to GU_MISC") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Paulo Zanoni Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20180926104718.17462-1-chris@chris-wilson.co.uk (cherry picked from commit 7a90938332d80faf973fbcffdf6e674e7b8f0914) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 90628a47ae17..29877969310d 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -3091,36 +3091,27 @@ gen11_gt_irq_handler(struct drm_i915_private * const i915, spin_unlock(&i915->irq_lock); } -static void -gen11_gu_misc_irq_ack(struct drm_i915_private *dev_priv, const u32 master_ctl, - u32 *iir) +static u32 +gen11_gu_misc_irq_ack(struct drm_i915_private *dev_priv, const u32 master_ctl) { void __iomem * const regs = dev_priv->regs; + u32 iir; if (!(master_ctl & GEN11_GU_MISC_IRQ)) - return; + return 0; - *iir = raw_reg_read(regs, GEN11_GU_MISC_IIR); - if (likely(*iir)) - raw_reg_write(regs, GEN11_GU_MISC_IIR, *iir); + iir = raw_reg_read(regs, GEN11_GU_MISC_IIR); + if (likely(iir)) + raw_reg_write(regs, GEN11_GU_MISC_IIR, iir); + + return iir; } static void -gen11_gu_misc_irq_handler(struct drm_i915_private *dev_priv, - const u32 master_ctl, const u32 iir) +gen11_gu_misc_irq_handler(struct drm_i915_private *dev_priv, const u32 iir) { - if (!(master_ctl & GEN11_GU_MISC_IRQ)) - return; - - if (unlikely(!iir)) { - DRM_ERROR("GU_MISC iir blank!\n"); - return; - } - if (iir & GEN11_GU_MISC_GSE) intel_opregion_asle_intr(dev_priv); - else - DRM_ERROR("Unexpected GU_MISC interrupt 0x%x\n", iir); } static irqreturn_t gen11_irq_handler(int irq, void *arg) @@ -3157,12 +3148,12 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg) enable_rpm_wakeref_asserts(i915); } - gen11_gu_misc_irq_ack(i915, master_ctl, &gu_misc_iir); + gu_misc_iir = gen11_gu_misc_irq_ack(i915, master_ctl); /* Acknowledge and enable interrupts. */ raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, GEN11_MASTER_IRQ | master_ctl); - gen11_gu_misc_irq_handler(i915, master_ctl, gu_misc_iir); + gen11_gu_misc_irq_handler(i915, gu_misc_iir); return IRQ_HANDLED; } From 4d8fcf216c90bc25e34ae2200aa8985ee3158898 Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Wed, 5 Sep 2018 11:43:23 +0300 Subject: [PATCH 378/499] net/mlx5e: Avoid unbounded peer devices when unpairing TC hairpin rules If the peer device was already unbound, then do not attempt to modify it's resources, otherwise we will crash on dereferencing non-existing device. 
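The mechanism the fix builds on, as a generic sketch (the callback name and the dead-peer bookkeeping are illustrative):

#include <linux/netdevice.h>
#include <linux/notifier.h>

/* Sketch: watch for peer netdevs going away so their hardware resources
 * are never touched after unregistration. */
static int peer_netdev_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	netdev_dbg(ndev, "peer is unregistering\n");
	/* walk private state that still references ndev and flag it here */
	return NOTIFY_DONE;
}

static struct notifier_block peer_nb = {
	.notifier_call = peer_netdev_event,
};

/* register_netdevice_notifier(&peer_nb) at init,
 * unregister_netdevice_notifier(&peer_nb) on teardown. */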
Fixes: 5c65c564c962 ("net/mlx5e: Support offloading TC NIC hairpin flows") Signed-off-by: Alaa Hleihel Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../net/ethernet/mellanox/mlx5/core/en/fs.h | 2 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 62 ++++++++++++++++++- .../ethernet/mellanox/mlx5/core/transobj.c | 5 +- include/linux/mlx5/transobj.h | 2 + 6 files changed, 71 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index db2cfcd21d43..0f189f873859 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -54,6 +54,7 @@ #include "en_stats.h" #include "en/fs.h" +extern const struct net_device_ops mlx5e_netdev_ops; struct page_pool; #define MLX5E_METADATA_ETHER_TYPE (0x8CE4) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index bbf69e859b78..1431232c9a09 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -16,6 +16,8 @@ struct mlx5e_tc_table { DECLARE_HASHTABLE(mod_hdr_tbl, 8); DECLARE_HASHTABLE(hairpin_tbl, 8); + + struct notifier_block netdevice_nb; }; struct mlx5e_flow_table { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 54118b77dc1f..f291d1bf1558 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4315,7 +4315,7 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static const struct net_device_ops mlx5e_netdev_ops = { +const struct net_device_ops mlx5e_netdev_ops = { .ndo_open = mlx5e_open, .ndo_stop = mlx5e_close, .ndo_start_xmit = mlx5e_xmit, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9fed54017659..52e05f3ece50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -2946,14 +2946,71 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv, return 0; } +static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv, + struct mlx5e_priv *peer_priv) +{ + struct mlx5_core_dev *peer_mdev = peer_priv->mdev; + struct mlx5e_hairpin_entry *hpe; + u16 peer_vhca_id; + int bkt; + + if (!same_hw_devs(priv, peer_priv)) + return; + + peer_vhca_id = MLX5_CAP_GEN(peer_mdev, vhca_id); + + hash_for_each(priv->fs.tc.hairpin_tbl, bkt, hpe, hairpin_hlist) { + if (hpe->peer_vhca_id == peer_vhca_id) + hpe->hp->pair->peer_gone = true; + } +} + +static int mlx5e_tc_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct mlx5e_flow_steering *fs; + struct mlx5e_priv *peer_priv; + struct mlx5e_tc_table *tc; + struct mlx5e_priv *priv; + + if (ndev->netdev_ops != &mlx5e_netdev_ops || + event != NETDEV_UNREGISTER || + ndev->reg_state == NETREG_REGISTERED) + return NOTIFY_DONE; + + tc = container_of(this, struct mlx5e_tc_table, netdevice_nb); + fs = container_of(tc, struct mlx5e_flow_steering, tc); + priv = container_of(fs, struct mlx5e_priv, fs); + peer_priv = netdev_priv(ndev); + if (priv == peer_priv || + !(priv->netdev->features & NETIF_F_HW_TC)) + return NOTIFY_DONE; + + mlx5e_tc_hairpin_update_dead_peer(priv, peer_priv); + + return 
NOTIFY_DONE; +} + int mlx5e_tc_nic_init(struct mlx5e_priv *priv) { struct mlx5e_tc_table *tc = &priv->fs.tc; + int err; hash_init(tc->mod_hdr_tbl); hash_init(tc->hairpin_tbl); - return rhashtable_init(&tc->ht, &tc_ht_params); + err = rhashtable_init(&tc->ht, &tc_ht_params); + if (err) + return err; + + tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event; + if (register_netdevice_notifier(&tc->netdevice_nb)) { + tc->netdevice_nb.notifier_call = NULL; + mlx5_core_warn(priv->mdev, "Failed to register netdev notifier\n"); + } + + return err; } static void _mlx5e_tc_del_flow(void *ptr, void *arg) @@ -2969,6 +3026,9 @@ void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) { struct mlx5e_tc_table *tc = &priv->fs.tc; + if (tc->netdevice_nb.notifier_call) + unregister_netdevice_notifier(&tc->netdevice_nb); + rhashtable_free_and_destroy(&tc->ht, _mlx5e_tc_del_flow, NULL); if (!IS_ERR_OR_NULL(tc->t)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index d2f76070ea7c..a1ee9a8a769e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -475,7 +475,8 @@ static void mlx5_hairpin_destroy_queues(struct mlx5_hairpin *hp) for (i = 0; i < hp->num_channels; i++) { mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[i]); - mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]); + if (!hp->peer_gone) + mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]); } } @@ -567,6 +568,8 @@ static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp) MLX5_RQC_STATE_RST, 0, 0); /* unset peer SQs */ + if (hp->peer_gone) + return; for (i = 0; i < hp->num_channels; i++) mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_RST, 0, 0); diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index 83a33a1873a6..7f5ca2cd3a32 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -90,6 +90,8 @@ struct mlx5_hairpin { u32 *rqn; u32 *sqn; + + bool peer_gone; }; struct mlx5_hairpin * From 11aa5800ed66ed0415b7509f02881c76417d212a Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 16 Sep 2018 14:45:27 +0300 Subject: [PATCH 379/499] net/mlx5: E-Switch, Fix out of bound access when setting vport rate The code that deals with eswitch vport bw guarantee was going beyond the eswitch vport array limit, fix that. This was pointed out by the kernel address sanitizer (KASAN). 
The error from KASAN log: [2018-09-15 15:04:45] BUG: KASAN: slab-out-of-bounds in mlx5_eswitch_set_vport_rate+0x8c1/0xae0 [mlx5_core] Fixes: c9497c98901c ("net/mlx5: Add support for setting VF min rate") Signed-off-by: Eran Ben Elisha Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 2b252cde5cc2..ea7dedc2d5ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2000,7 +2000,7 @@ static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw) u32 max_guarantee = 0; int i; - for (i = 0; i <= esw->total_vports; i++) { + for (i = 0; i < esw->total_vports; i++) { evport = &esw->vports[i]; if (!evport->enabled || evport->info.min_rate < max_guarantee) continue; @@ -2020,7 +2020,7 @@ static int normalize_vports_min_rate(struct mlx5_eswitch *esw, u32 divider) int err; int i; - for (i = 0; i <= esw->total_vports; i++) { + for (i = 0; i < esw->total_vports; i++) { evport = &esw->vports[i]; if (!evport->enabled) continue; From cee26487620bc9bc3c7db21b6984d91f7bae12ae Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Sat, 25 Aug 2018 03:29:58 +0000 Subject: [PATCH 380/499] net/mlx5e: Set vlan masks for all offloaded TC rules In flow steering, if asked to, the hardware matches on the first ethertype which is not vlan. It's possible to set a rule as follows, which is meant to match on untagged packet, but will match on a vlan packet: tc filter add dev eth0 parent ffff: protocol ip flower ... To avoid this for packets with single tag, we set vlan masks to tell hardware to check the tags for every matched packet. Fixes: 095b6cfd69ce ('net/mlx5e: Add TC vlan match parsing') Signed-off-by: Jianbo Liu Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 52e05f3ece50..85796727093e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1368,6 +1368,9 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, *match_level = MLX5_MATCH_L2; } + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, svlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1); } if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CVLAN)) { From 95375f2ab2960c135484d83ea9f8f357cb1be26a Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Mon, 1 Oct 2018 14:49:34 +0200 Subject: [PATCH 381/499] PCI: mvebu: Fix PCI I/O mapping creation sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ee1604381a371 ("PCI: mvebu: Only remap I/O space if configured") had the side effect that the PCI I/O mapping was created much earlier than before, at a point where the probe() of the driver could still fail. This is for example a problem if one gets an -EPROBE_DEFER at some point during probe(), after pci_ioremap_io() has been called. Indeed, there is currently no function to undo what pci_ioremap_io() did, and switching to pci_remap_iospace() is not an option in pci-mvebu due to the need for special memory attributes on Armada 38x. 
Reverting ee1604381a371 ("PCI: mvebu: Only remap I/O space if configured") would be a possibility, but it would require also reverting 42342073e38b5 ("PCI: mvebu: Convert to use pci_host_bridge directly"). So instead, we use an open-coded version of pci_host_probe() that creates the PCI I/O mapping at a point where we are guaranteed not to fail anymore. Fixes: ee1604381a371 ("PCI: mvebu: Only remap I/O space if configured") Reported-by: Jan Kundrát Tested-by: Jan Kundrát Signed-off-by: Thomas Petazzoni Signed-off-by: Bjorn Helgaas Acked-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-mvebu.c | 54 +++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index 50eb0729385b..a41d79b8d46a 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -1145,7 +1145,6 @@ static int mvebu_pcie_parse_request_resources(struct mvebu_pcie *pcie) { struct device *dev = &pcie->pdev->dev; struct device_node *np = dev->of_node; - unsigned int i; int ret; INIT_LIST_HEAD(&pcie->resources); @@ -1179,15 +1178,60 @@ static int mvebu_pcie_parse_request_resources(struct mvebu_pcie *pcie) resource_size(&pcie->io) - 1); pcie->realio.name = "PCI I/O"; - for (i = 0; i < resource_size(&pcie->realio); i += SZ_64K) - pci_ioremap_io(i, pcie->io.start + i); - pci_add_resource(&pcie->resources, &pcie->realio); } return devm_request_pci_bus_resources(dev, &pcie->resources); } +/* + * This is a copy of pci_host_probe(), except that it does the I/O + * remap as the last step, once we are sure we won't fail. + * + * It should be removed once the I/O remap error handling issue has + * been sorted out. + */ +static int mvebu_pci_host_probe(struct pci_host_bridge *bridge) +{ + struct mvebu_pcie *pcie; + struct pci_bus *bus, *child; + int ret; + + ret = pci_scan_root_bus_bridge(bridge); + if (ret < 0) { + dev_err(bridge->dev.parent, "Scanning root bridge failed"); + return ret; + } + + pcie = pci_host_bridge_priv(bridge); + if (resource_size(&pcie->io) != 0) { + unsigned int i; + + for (i = 0; i < resource_size(&pcie->realio); i += SZ_64K) + pci_ioremap_io(i, pcie->io.start + i); + } + + bus = bridge->bus; + + /* + * We insert PCI resources into the iomem_resource and + * ioport_resource trees in either pci_bus_claim_resources() + * or pci_bus_assign_resources(). + */ + if (pci_has_flag(PCI_PROBE_ONLY)) { + pci_bus_claim_resources(bus); + } else { + pci_bus_size_bridges(bus); + pci_bus_assign_resources(bus); + + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); + } + + pci_bus_add_devices(bus); + return 0; +} + static int mvebu_pcie_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -1268,7 +1312,7 @@ static int mvebu_pcie_probe(struct platform_device *pdev) bridge->align_resource = mvebu_pcie_align_resource; bridge->msi = pcie->msi; - return pci_host_probe(bridge); + return mvebu_pci_host_probe(bridge); } static const struct of_device_id mvebu_pcie_of_match_table[] = { From 1ad98e9d1bdf4724c0a8532fabd84bf3c457c2bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Oct 2018 15:02:26 -0700 Subject: [PATCH 382/499] tcp/dccp: fix lockdep issue when SYN is backlogged In normal SYN processing, packets are handled without listener lock and in RCU protected ingress path. But syzkaller is known to be able to trick us and SYN packets might be processed in process context, after being queued into socket backlog. 
In commit 06f877d613be ("tcp/dccp: fix other lockdep splats accessing ireq_opt") I made a very stupid fix, that happened to work mostly because of the regular path being RCU protected. Really the thing protecting ireq->ireq_opt is RCU read lock, and the pseudo request refcnt is not relevant. This patch extends what I did in commit 449809a66c1d ("tcp/dccp: block BH for SYN processing") by adding an extra rcu_read_{lock|unlock} pair in the paths that might be taken when processing SYN from socket backlog (thus possibly in process context) Fixes: 06f877d613be ("tcp/dccp: fix other lockdep splats accessing ireq_opt") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/inet_sock.h | 3 +-- net/dccp/input.c | 4 +++- net/ipv4/tcp_input.c | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e03b93360f33..a8cd5cf9ff5b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -132,8 +132,7 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) { - return rcu_dereference_check(ireq->ireq_opt, - refcount_read(&ireq->req.rsk_refcnt) > 0); + return rcu_dereference(ireq->ireq_opt); } struct inet_cork { diff --git a/net/dccp/input.c b/net/dccp/input.c index d28d46bff6ab..85d6c879383d 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -606,11 +606,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; consume_skb(skb); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cf2f7bb2802..47e08c1b5bc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6009,11 +6009,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (th->fin) goto discard; /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH right there. + * so we need to make sure to disable BH and RCU right there. */ + rcu_read_lock(); local_bh_disable(); acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); + rcu_read_unlock(); if (!acceptable) return 1; From 242cdad873a75652f97c35aad61270581e0f3a2e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Fri, 21 Sep 2018 12:24:31 +0930 Subject: [PATCH 383/499] lib/xz: Put CRC32_POLY_LE in xz_private.h This fixes a regression introduced by faa16bc404d72a5 ("lib: Use existing define with polynomial"). The cleanup added a dependency on include/linux, which broke the PowerPC boot wrapper/decompresser when KERNEL_XZ is enabled: BOOTCC arch/powerpc/boot/decompress.o In file included from arch/powerpc/boot/../../../lib/decompress_unxz.c:233, from arch/powerpc/boot/decompress.c:42: arch/powerpc/boot/../../../lib/xz/xz_crc32.c:18:10: fatal error: linux/crc32poly.h: No such file or directory #include ^~~~~~~~~~~~~~~~~~~ The powerpc decompresser is a hairy corner of the kernel. Even while building a 64-bit kernel it needs to build a 32-bit binary and therefore avoid including files from include/linux. 
This allows users of the xz library to avoid including headers from 'include/linux/' while still achieving the cleanup of the magic number. Fixes: faa16bc404d72a5 ("lib: Use existing define with polynomial") Reported-by: Meelis Roos Reported-by: kbuild test robot Suggested-by: Christophe LEROY Signed-off-by: Joel Stanley Tested-by: Meelis Roos Signed-off-by: Michael Ellerman --- lib/xz/xz_crc32.c | 1 - lib/xz/xz_private.h | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c index 25a5d87e2e4c..912aae5fa09e 100644 --- a/lib/xz/xz_crc32.c +++ b/lib/xz/xz_crc32.c @@ -15,7 +15,6 @@ * but they are bigger and use more memory for the lookup table. */ -#include #include "xz_private.h" /* diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 482b90f363fe..09360ebb510e 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -102,6 +102,10 @@ # endif #endif +#ifndef CRC32_POLY_LE +#define CRC32_POLY_LE 0xedb88320 +#endif + /* * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used * before calling xz_dec_lzma2_run(). From f2924d4b16ae138c2de6a0e73f526fb638330858 Mon Sep 17 00:00:00 2001 From: Romain Izard Date: Thu, 20 Sep 2018 16:49:04 +0200 Subject: [PATCH 384/499] usb: cdc_acm: Do not leak URB buffers When the ACM TTY port is disconnected, the URBs it uses must be killed, and then the buffers must be freed. Unfortunately a previous refactor removed the code freeing the buffers because it looked extremely similar to the code killing the URBs. As a result, there were many new leaks for each plug/unplug cycle of a CDC-ACM device, that were detected by kmemleak. Restore the missing code, and the memory leak is removed. Fixes: ba8c931ded8d ("cdc-acm: refactor killing urbs") Signed-off-by: Romain Izard Acked-by: Oliver Neukum Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index f9b40a9dc4d3..bc03b0a690b4 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1514,6 +1514,7 @@ static void acm_disconnect(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); struct tty_struct *tty; + int i; /* sibling interface is already cleaning up */ if (!acm) @@ -1544,6 +1545,11 @@ static void acm_disconnect(struct usb_interface *intf) tty_unregister_device(acm_tty_driver, acm->minor); + usb_free_urb(acm->ctrlurb); + for (i = 0; i < ACM_NW; i++) + usb_free_urb(acm->wb[i].urb); + for (i = 0; i < acm->rx_buflimit; i++) + usb_free_urb(acm->read_urbs[i]); acm_write_buffers_free(acm); usb_free_coherent(acm->dev, acm->ctrlsize, acm->ctrl_buffer, acm->ctrl_dma); acm_read_buffers_free(acm); From ffe84e01bb1b38c7eb9c6b6da127a6c136d251df Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 1 Oct 2018 18:36:07 +0300 Subject: [PATCH 385/499] xhci: Add missing CAS workaround for Intel Sunrise Point xHCI The workaround for missing CAS bit is also needed for xHC on Intel sunrisepoint PCH. 
For more details see: Intel 100/c230 series PCH specification update Doc #332692-006 Errata #8 Cc: Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 6372edf339d9..722860eb5a91 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -185,6 +185,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) } if (pdev->vendor == PCI_VENDOR_ID_INTEL && (pdev->device == PCI_DEVICE_ID_INTEL_CHERRYVIEW_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_APL_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI)) xhci->quirks |= XHCI_MISSING_CAS; From 555df5820e733cded7eb8d0bf78b2a791be51d75 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Mon, 1 Oct 2018 18:36:08 +0300 Subject: [PATCH 386/499] usb: xhci-mtk: resume USB3 roothub first Give USB3 devices a better chance to enumerate at USB3 speeds if they are connected to a suspended host. Porting from "671ffdff5b13 xhci: resume USB 3 roothub first" Cc: Signed-off-by: Chunfeng Yun Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mtk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci-mtk.c b/drivers/usb/host/xhci-mtk.c index 7334da9e9779..71d0d33c3286 100644 --- a/drivers/usb/host/xhci-mtk.c +++ b/drivers/usb/host/xhci-mtk.c @@ -642,10 +642,10 @@ static int __maybe_unused xhci_mtk_resume(struct device *dev) xhci_mtk_host_enable(mtk); xhci_dbg(xhci, "%s: restart port polling\n", __func__); - set_bit(HCD_FLAG_POLL_RH, &hcd->flags); - usb_hcd_poll_rh_status(hcd); set_bit(HCD_FLAG_POLL_RH, &xhci->shared_hcd->flags); usb_hcd_poll_rh_status(xhci->shared_hcd); + set_bit(HCD_FLAG_POLL_RH, &hcd->flags); + usb_hcd_poll_rh_status(hcd); return 0; } From aeadd93f2b0a609f603ac33e574b97a9832d1b90 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 22 Sep 2018 16:46:48 +0300 Subject: [PATCH 387/499] net: sched: act_ipt: check for underflow in __tcf_ipt_init() If "td->u.target_size" is larger than sizeof(struct xt_entry_target) we return -EINVAL. But we don't check whether it's smaller than sizeof(struct xt_entry_target) and that could lead to an out of bounds read. Fixes: 7ba699c604ab ("[NET_SCHED]: Convert actions from rtnetlink to new netlink API") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 23273b5303fd..8525de811616 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -135,7 +135,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); - if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { + if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) { if (exists) tcf_idr_release(*a, bind); else From d949cfedbcbab4e91590576cbace2671924ad69c Mon Sep 17 00:00:00 2001 From: LUU Duc Canh Date: Wed, 26 Sep 2018 22:28:52 +0200 Subject: [PATCH 388/499] tipc: ignore STATE_MSG on wrong link session The initial session number when a link is created is based on a random value, taken from struct tipc_net->random. It is then incremented for each link reset to avoid mixing protocol messages from different link sessions. 
However, when a bearer is reset all its links are deleted, and will later be re-created using the same random value as the first time. This means that if the link never went down between creation and deletion we will still sometimes have two subsequent sessions with the same session number. In virtual environments with potentially long transmission times this has turned out to be a real problem. We now fix this by randomizing the session number each time a link is created. With a session number size of 16 bits this gives a risk of session collision of 1/64k. To reduce this further, we also introduce a sanity check on the very first STATE message arriving at a link. If this has an acknowledge value differing from 0, which is logically impossible, we ignore the message. The final risk for session collision is hence reduced to 1/4G, which should be sufficient. Signed-off-by: LUU Duc Canh Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ net/tipc/node.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 4ed650ce6e61..fb886b525d95 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1516,6 +1516,9 @@ bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) return false; if (session != curr_session) return false; + /* Extra sanity check */ + if (!link_is_up(l) && msg_ack(hdr)) + return false; if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) return true; /* Accept only STATE with new sequence number */ diff --git a/net/tipc/node.c b/net/tipc/node.c index b0ee25f1f2e6..2afc4f8c37a7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -913,6 +913,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool reset = true; char *if_name; unsigned long intv; + u16 session; *dupl_addr = false; *respond = false; @@ -999,9 +1000,10 @@ void tipc_node_check_dest(struct net *net, u32 addr, goto exit; if_name = strchr(b->name, ':') + 1; + get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, mod(tipc_net(net)->random), + b->window, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -1625,7 +1627,6 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, tipc_link_create_dummy_tnl_msg(l, xmitq); n->failover_sent = true; } - /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; From 7f6d6558ae44bc193eb28df3617c364d3bb6df39 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Fri, 28 Sep 2018 14:55:34 -0300 Subject: [PATCH 389/499] Revert "openvswitch: Fix template leak in error cases." This reverts commit 90c7afc96cbbd77f44094b5b651261968e97de67. When the commit was merged, the code used nf_ct_put() to free the entry, but later on commit 76644232e612 ("openvswitch: Free tmpl with tmpl_free.") replaced that with nf_ct_tmpl_free which is a more appropriate. Now the original problem is removed. Then 44d6e2f27328 ("net: Replace NF_CT_ASSERT() with WARN_ON().") replaced a debug assert with a WARN_ON() which is trigged now. Signed-off-by: Flavio Leitner Acked-by: Joe Stringer Signed-off-by: David S. 
Miller --- net/openvswitch/conntrack.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 86a75105af1a..0aeb34c6389d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1624,10 +1624,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } - - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); - if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -1639,6 +1635,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); From 893626d6a353d1356528f94e081246ecf233d77a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Sep 2018 12:28:41 -0700 Subject: [PATCH 390/499] rtnetlink: Fail dump if target netnsid is invalid Link dumps can return results from a target namespace. If the namespace id is invalid, then the dump request should fail if get_target_net fails rather than continuing with a dump of the current namespace. Fixes: 79e1ad148c844 ("rtnetlink: use netnsid to query interface") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 63ce2283a456..7f37fe9c65a5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1898,10 +1898,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (tb[IFLA_IF_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); tgt_net = get_target_net(skb->sk, netnsid); - if (IS_ERR(tgt_net)) { - tgt_net = net; - netnsid = -1; - } + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) From 6fe9487892b32cb1c8b8b0d552ed7222a527fe30 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 28 Sep 2018 16:26:08 -0400 Subject: [PATCH 391/499] bond: take rcu lock in netpoll_send_skb_on_dev The bonding driver lacks the rcu lock when it calls down into netdev_lower_get_next_private_rcu from bond_poll_controller, which results in a trace like: WARNING: CPU: 2 PID: 179 at net/core/dev.c:6567 netdev_lower_get_next_private_rcu+0x34/0x40 CPU: 2 PID: 179 Comm: kworker/u16:15 Not tainted 4.19.0-rc5-backup+ #1 Workqueue: bond0 bond_mii_monitor RIP: 0010:netdev_lower_get_next_private_rcu+0x34/0x40 Code: 48 89 fb e8 fe 29 63 ff 85 c0 74 1e 48 8b 45 00 48 81 c3 c0 00 00 00 48 8b 00 48 39 d8 74 0f 48 89 45 00 48 8b 40 f8 5b 5d c3 <0f> 0b eb de 31 c0 eb f5 0f 1f 40 00 0f 1f 44 00 00 48 8> RSP: 0018:ffffc9000087fa68 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff880429614560 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 00000000ffffffff RDI: ffffffffa184ada0 RBP: ffffc9000087fa80 R08: 0000000000000001 R09: 0000000000000000 R10: ffffc9000087f9f0 R11: ffff880429798040 R12: ffff8804289d5980 R13: ffffffffa1511f60 R14: 00000000000000c8 R15: 00000000ffffffff FS: 0000000000000000(0000) GS:ffff88042f880000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4b78fce180 CR3: 000000018180f006 CR4: 00000000001606e0 Call Trace: bond_poll_controller+0x52/0x170 netpoll_poll_dev+0x79/0x290 netpoll_send_skb_on_dev+0x158/0x2c0 netpoll_send_udp+0x2d5/0x430 write_ext_msg+0x1e0/0x210 
console_unlock+0x3c4/0x630 vprintk_emit+0xfa/0x2f0 printk+0x52/0x6e ? __netdev_printk+0x12b/0x220 netdev_info+0x64/0x80 ? bond_3ad_set_carrier+0xe9/0x180 bond_select_active_slave+0x1fc/0x310 bond_mii_monitor+0x709/0x9b0 process_one_work+0x221/0x5e0 worker_thread+0x4f/0x3b0 kthread+0x100/0x140 ? process_one_work+0x5e0/0x5e0 ? kthread_delayed_work_timer_fn+0x90/0x90 ret_from_fork+0x24/0x30 We're also doing rcu dereferences a layer up in netpoll_send_skb_on_dev before we call down into netpoll_poll_dev, so just take the lock there. Suggested-by: Cong Wang Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3ae899805f8b..de1d1ba92f2d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -312,6 +312,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; + rcu_read_lock_bh(); lockdep_assert_irqs_disabled(); npinfo = rcu_dereference_bh(np->dev->npinfo); @@ -356,6 +357,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + rcu_read_unlock_bh(); } EXPORT_SYMBOL(netpoll_send_skb_on_dev); From 06e55addd3f40b5294e448c2cb7605ca4f28c2e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 28 Sep 2018 14:51:47 -0700 Subject: [PATCH 392/499] tun: remove unused parameters tun_napi_disable() and tun_napi_del() do not need a pointer to the tun_struct Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e2648b5a3861..71d10fb59849 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -324,13 +324,13 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, } } -static void tun_napi_disable(struct tun_struct *tun, struct tun_file *tfile) +static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } -static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) +static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); @@ -690,8 +690,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); if (tun && clean) { - tun_napi_disable(tun, tfile); - tun_napi_del(tun, tfile); + tun_napi_disable(tfile); + tun_napi_del(tfile); } if (tun && !tfile->detached) { @@ -758,7 +758,7 @@ static void tun_detach_all(struct net_device *dev) for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); - tun_napi_disable(tun, tfile); + tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); @@ -774,7 +774,7 @@ static void tun_detach_all(struct net_device *dev) synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); - tun_napi_del(tun, tfile); + tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); From c7256f579f8302ce2c038181c30060d0b40017b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 28 Sep 2018 14:51:48 -0700 Subject: [PATCH 393/499] tun: initialize napi_mutex unconditionally This is the first part to fix following syzbot report : console output: 
https://syzkaller.appspot.com/x/log.txt?x=145378e6400000 kernel config: https://syzkaller.appspot.com/x/.config?x=443816db871edd66 dashboard link: https://syzkaller.appspot.com/bug?extid=e662df0ac1d753b57e80 Following patch is fixing the race condition, but it seems safer to initialize this mutex at tfile creation anyway. Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Eric Dumazet Reported-by: syzbot+e662df0ac1d753b57e80@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 71d10fb59849..729686babbf3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -320,7 +320,6 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); napi_enable(&tfile->napi); - mutex_init(&tfile->napi_mutex); } } @@ -3199,6 +3198,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) return -ENOMEM; } + mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; From af3fb24eecb2c59246e03c99386037fd5ad84ffd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 28 Sep 2018 14:51:49 -0700 Subject: [PATCH 394/499] tun: napi flags belong to tfile Since tun->flags might be shared by multiple tfile structures, it is better to make sure tun_get_user() is using the flags for the current tfile. Presence of the READ_ONCE() in tun_napi_frags_enabled() gave a hint of what could happen, but we need something stronger to please syzbot. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 13647 Comm: syz-executor5 Not tainted 4.19.0-rc5+ #59 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:dev_gro_receive+0x132/0x2720 net/core/dev.c:5427 Code: 48 c1 ea 03 80 3c 02 00 0f 85 6e 20 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b 6e 10 49 8d bd d0 00 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 59 20 00 00 4d 8b a5 d0 00 00 00 31 ff 41 81 e4 RSP: 0018:ffff8801c400f410 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8618d325 RDX: 000000000000001a RSI: ffffffff86189f97 RDI: 00000000000000d0 RBP: ffff8801c400f608 R08: ffff8801c8fb4300 R09: 0000000000000000 R10: ffffed0038801ed7 R11: 0000000000000003 R12: ffff8801d327d358 R13: 0000000000000000 R14: ffff8801c16dd8c0 R15: 0000000000000004 FS: 00007fe003615700(0000) GS:ffff8801dac00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe1f3c43db8 CR3: 00000001bebb2000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: napi_gro_frags+0x3f4/0xc90 net/core/dev.c:5715 tun_get_user+0x31d5/0x42a0 drivers/net/tun.c:1922 tun_chr_write_iter+0xb9/0x154 drivers/net/tun.c:1967 call_write_iter include/linux/fs.h:1808 [inline] new_sync_write fs/read_write.c:474 [inline] __vfs_write+0x6b8/0x9f0 fs/read_write.c:487 vfs_write+0x1fc/0x560 fs/read_write.c:549 ksys_write+0x101/0x260 fs/read_write.c:598 __do_sys_write fs/read_write.c:610 [inline] __se_sys_write fs/read_write.c:607 [inline] __x64_sys_write+0x73/0xb0 fs/read_write.c:607 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457579 
Code: 1d b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fe003614c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457579 RDX: 0000000000000012 RSI: 0000000020000000 RDI: 000000000000000a RBP: 000000000072c040 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fe0036156d4 R13: 00000000004c5574 R14: 00000000004d8e98 R15: 00000000ffffffff Modules linked in: RIP: 0010:dev_gro_receive+0x132/0x2720 net/core/dev.c:5427 Code: 48 c1 ea 03 80 3c 02 00 0f 85 6e 20 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8b 6e 10 49 8d bd d0 00 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 59 20 00 00 4d 8b a5 d0 00 00 00 31 ff 41 81 e4 RSP: 0018:ffff8801c400f410 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8618d325 RDX: 000000000000001a RSI: ffffffff86189f97 RDI: 00000000000000d0 RBP: ffff8801c400f608 R08: ffff8801c8fb4300 R09: 0000000000000000 R10: ffffed0038801ed7 R11: 0000000000000003 R12: ffff8801d327d358 R13: 0000000000000000 R14: ffff8801c16dd8c0 R15: 0000000000000004 FS: 00007fe003615700(0000) GS:ffff8801dac00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe1f3c43db8 CR3: 00000001bebb2000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- drivers/net/tun.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 729686babbf3..50e9cc19023a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -181,6 +181,7 @@ struct tun_file { }; struct napi_struct napi; bool napi_enabled; + bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; @@ -313,9 +314,10 @@ static int tun_napi_poll(struct napi_struct *napi, int budget) } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, - bool napi_en) + bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; + tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); @@ -335,9 +337,9 @@ static void tun_napi_del(struct tun_file *tfile) netif_napi_del(&tfile->napi); } -static bool tun_napi_frags_enabled(const struct tun_struct *tun) +static bool tun_napi_frags_enabled(const struct tun_file *tfile) { - return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; + return tfile->napi_frags_enabled; } #ifdef CONFIG_TUN_VNET_CROSS_LE @@ -792,7 +794,7 @@ static void tun_detach_all(struct net_device *dev) } static int tun_attach(struct tun_struct *tun, struct file *file, - bool skip_filter, bool napi) + bool skip_filter, bool napi, bool napi_frags) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; @@ -865,7 +867,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tun_enable_queue(tfile); } else { sock_hold(&tfile->sk); - tun_napi_init(tun, tfile, napi); + tun_napi_init(tun, tfile, napi, napi_frags); } tun_set_real_num_queues(tun); @@ -1708,7 +1710,7 @@ static ssize_t tun_get_user(struct tun_struct 
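For readability, here is the corrected 64-bit clock_gettime fallback pulled out of the patch below: the "=m" output tells gcc that *ts is written, and rcx/r11 are listed because the syscall instruction clobbers them.

static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
	long ret;

	asm ("syscall"
	     : "=a" (ret), "=m" (*ts)
	     : "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
	     : "memory", "rcx", "r11");
	return ret;
}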
*tun, struct tun_file *tfile, int err; u32 rxhash = 0; int skb_xdp = 1; - bool frags = tun_napi_frags_enabled(tun); + bool frags = tun_napi_frags_enabled(tfile); if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -2533,7 +2535,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, - ifr->ifr_flags & IFF_NAPI); + ifr->ifr_flags & IFF_NAPI, + ifr->ifr_flags & IFF_NAPI_FRAGS); if (err < 0) return err; @@ -2631,7 +2634,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI); + err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI, + ifr->ifr_flags & IFF_NAPI_FRAGS); if (err < 0) goto err_free_flow; @@ -2780,7 +2784,8 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; - ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI); + ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, + tun->flags & IFF_NAPI_FRAGS); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) From 715bd9d12f84d8f5cc8ad21d888f9bc304a8eb0b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 1 Oct 2018 12:52:15 -0700 Subject: [PATCH 395/499] x86/vdso: Fix asm constraints on vDSO syscall fallbacks The syscall fallbacks in the vDSO have incorrect asm constraints. They are not marked as writing to their outputs -- instead, they are marked as clobbering "memory", which is useless. In particular, gcc is smart enough to know that the timespec parameter hasn't escaped, so a memory clobber doesn't clobber it. And passing a pointer as an asm *input* does not tell gcc that the pointed-to value is changed. Add in the fact that the asm instructions weren't volatile, and gcc was free to omit them entirely unless their sole output (the return value) is used. Which it is (phew!), but that stops happening with some upcoming patches. As a trivial example, the following code: void test_fallback(struct timespec *ts) { vdso_fallback_gettime(CLOCK_MONOTONIC, ts); } compiles to: 00000000000000c0 : c0: c3 retq To add insult to injury, the RCX and R11 clobbers on 64-bit builds were missing. The "memory" clobber is also unnecessary -- no ordering with respect to other memory operations is needed, but that's going to be fixed in a separate not-for-stable patch. 
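As a rough user-space illustration of the difference (not part of this patch; the raw syscall number and the function names below are only for demonstration), the two constraint styles look like this:

	#include <time.h>

	static long fragile_clock_gettime(long clock, struct timespec *ts)
	{
		long ret;

		/* Old-style constraints: *ts is not an output and the asm is
		 * not volatile, so gcc may treat *ts as unchanged and may drop
		 * the asm entirely when ret is unused. */
		asm("syscall" : "=a" (ret) :
		    "0" (228 /* __NR_clock_gettime on x86-64 */), "D" (clock), "S" (ts) :
		    "memory");
		return ret;
	}

	static long robust_clock_gettime(long clock, struct timespec *ts)
	{
		long ret;

		/* New-style constraints: *ts is a declared output, and the
		 * registers the syscall instruction itself clobbers (rcx, r11)
		 * are listed. */
		asm("syscall" : "=a" (ret), "=m" (*ts) :
		    "0" (228), "D" (clock), "S" (ts) :
		    "memory", "rcx", "r11");
		return ret;
	}

Compiling the first variant with optimizations, from a caller that ignores the return value, should reproduce the empty-function disassembly shown above; the second variant is kept because the store to *ts is visible to the compiler.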
Fixes: 2aae950b21e4 ("x86_64: Add vDSO for x86-64 with gettimeofday/clock_gettime/getcpu") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/2c0231690551989d2fafa60ed0e7b5cc8b403908.1538422295.git.luto@kernel.org --- arch/x86/entry/vdso/vclock_gettime.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index f19856d95c60..134e2d2e8add 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -43,8 +43,9 @@ extern u8 hvclock_page notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*ts) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : + "memory", "rcx", "r11"); return ret; } @@ -52,8 +53,9 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : + "memory", "rcx", "r11"); return ret; } @@ -64,12 +66,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) + : "=a" (ret), "=m" (*ts) : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) : "memory", "edx"); return ret; @@ -79,12 +81,12 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) + : "=a" (ret), "=m" (*tv), "=m" (*tz) : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) : "memory", "edx"); return ret; From ad5f97faff4231e72b96bd96adbe1b6e977a9b86 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 28 Sep 2018 23:51:54 +0200 Subject: [PATCH 396/499] r8169: fix network stalls due to missing bit TXCFG_AUTO_FIFO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the chip-specific hw_start functions set bit TXCFG_AUTO_FIFO in register TxConfig. The original patch changed the order of some calls resulting in these changes being overwritten by rtl_set_tx_config_registers() in rtl_hw_start(). This eventually resulted in network stalls especially under high load. Analyzing the chip-specific hw_start functions all chip version from 34, with the exception of version 39, need this bit set. This patch moves setting this bit to rtl_set_tx_config_registers(). Fixes: 4fd48c4ac0a0 ("r8169: move common initializations to tp->hw_start") Reported-by: Ortwin Glück Reported-by: David Arendt Root-caused-by: Maciej S. Szmigiero Tested-by: Tony Atkinson Tested-by: David Arendt Tested-by: Ortwin Glück Signed-off-by: Heiner Kallweit Signed-off-by: David S. 
Miller --- drivers/net/ethernet/realtek/r8169.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index d6b53f53909a..a94b874982dc 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4536,9 +4536,14 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp) static void rtl_set_tx_config_registers(struct rtl8169_private *tp) { - /* Set DMA burst size and Interframe Gap Time */ - RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) | - (InterFrameGap << TxInterFrameGapShift)); + u32 val = TX_DMA_BURST << TxDMAShift | + InterFrameGap << TxInterFrameGapShift; + + if (tp->mac_version >= RTL_GIGA_MAC_VER_34 && + tp->mac_version != RTL_GIGA_MAC_VER_39) + val |= TXCFG_AUTO_FIFO; + + RTL_W32(tp, TxConfig, val); } static void rtl_set_rx_max_size(struct rtl8169_private *tp) @@ -5033,7 +5038,6 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp) rtl_disable_clock_request(tp); - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); RTL_W8(tp, MCU, RTL_R8(tp, MCU) & ~NOW_IS_OOB); /* Adjust EEE LED frequency */ @@ -5067,7 +5071,6 @@ static void rtl_hw_start_8168f(struct rtl8169_private *tp) rtl_disable_clock_request(tp); - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); RTL_W8(tp, MCU, RTL_R8(tp, MCU) & ~NOW_IS_OOB); RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) | PFM_EN); RTL_W32(tp, MISC, RTL_R32(tp, MISC) | PWM_EN); @@ -5112,8 +5115,6 @@ static void rtl_hw_start_8411(struct rtl8169_private *tp) static void rtl_hw_start_8168g(struct rtl8169_private *tp) { - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); - rtl_eri_write(tp, 0xc8, ERIAR_MASK_0101, 0x080002, ERIAR_EXGMAC); rtl_eri_write(tp, 0xcc, ERIAR_MASK_0001, 0x38, ERIAR_EXGMAC); rtl_eri_write(tp, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC); @@ -5211,8 +5212,6 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private *tp) rtl_hw_aspm_clkreq_enable(tp, false); rtl_ephy_init(tp, e_info_8168h_1, ARRAY_SIZE(e_info_8168h_1)); - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); - rtl_eri_write(tp, 0xc8, ERIAR_MASK_0101, 0x00080002, ERIAR_EXGMAC); rtl_eri_write(tp, 0xcc, ERIAR_MASK_0001, 0x38, ERIAR_EXGMAC); rtl_eri_write(tp, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC); @@ -5295,8 +5294,6 @@ static void rtl_hw_start_8168ep(struct rtl8169_private *tp) { rtl8168ep_stop_cmac(tp); - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); - rtl_eri_write(tp, 0xc8, ERIAR_MASK_0101, 0x00080002, ERIAR_EXGMAC); rtl_eri_write(tp, 0xcc, ERIAR_MASK_0001, 0x2f, ERIAR_EXGMAC); rtl_eri_write(tp, 0xd0, ERIAR_MASK_0001, 0x5f, ERIAR_EXGMAC); @@ -5618,7 +5615,6 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp) /* Force LAN exit from ASPM if Rx/Tx are not idle */ RTL_W32(tp, FuncEvent, RTL_R32(tp, FuncEvent) | 0x002800); - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); RTL_W8(tp, MCU, RTL_R8(tp, MCU) & ~NOW_IS_OOB); rtl_ephy_init(tp, e_info_8402, ARRAY_SIZE(e_info_8402)); From 7c03e7035ac1cf2a6165754e4f3a49c2f1977838 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 1 Oct 2018 12:52:16 -0700 Subject: [PATCH 397/499] selftests/x86: Add clock_gettime() tests to test_vdso Now that the vDSO implementation of clock_gettime() is getting reworked, add a selftest for it. This tests that its output is consistent with the syscall version. 
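Concretely, the consistency property being checked is a "sandwich" ordering between syscall and vDSO readings of the same clock. In outline (using the test's own helpers, error handling omitted):

	struct timespec start, vdso, end;

	sys_clock_gettime(clock, &start);
	vdso_clock_gettime(clock, &vdso);
	sys_clock_gettime(clock, &end);
	/* expect start <= vdso <= end, compared on (tv_sec, tv_nsec) */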
This is marked for stable to serve as a test for commit 715bd9d12f84 ("x86/vdso: Fix asm constraints on vDSO syscall fallbacks") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/082399674de2619b2befd8c0dde49b260605b126.1538422295.git.luto@kernel.org --- tools/testing/selftests/x86/test_vdso.c | 99 +++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c index 235259011704..49f7294fb382 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/x86/test_vdso.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifndef SYS_getcpu # ifdef __x86_64__ @@ -31,6 +32,10 @@ int nerrs = 0; +typedef int (*vgettime_t)(clockid_t, struct timespec *); + +vgettime_t vdso_clock_gettime; + typedef long (*getcpu_t)(unsigned *, unsigned *, void *); getcpu_t vgetcpu; @@ -95,6 +100,10 @@ static void fill_function_pointers() printf("Warning: failed to find getcpu in vDSO\n"); vgetcpu = (getcpu_t) vsyscall_getcpu(); + + vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); + if (!vdso_clock_gettime) + printf("Warning: failed to find clock_gettime in vDSO\n"); } static long sys_getcpu(unsigned * cpu, unsigned * node, @@ -103,6 +112,11 @@ static long sys_getcpu(unsigned * cpu, unsigned * node, return syscall(__NR_getcpu, cpu, node, cache); } +static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) +{ + return syscall(__NR_clock_gettime, id, ts); +} + static void test_getcpu(void) { printf("[RUN]\tTesting getcpu...\n"); @@ -155,10 +169,95 @@ static void test_getcpu(void) } } +static bool ts_leq(const struct timespec *a, const struct timespec *b) +{ + if (a->tv_sec != b->tv_sec) + return a->tv_sec < b->tv_sec; + else + return a->tv_nsec <= b->tv_nsec; +} + +static char const * const clocknames[] = { + [0] = "CLOCK_REALTIME", + [1] = "CLOCK_MONOTONIC", + [2] = "CLOCK_PROCESS_CPUTIME_ID", + [3] = "CLOCK_THREAD_CPUTIME_ID", + [4] = "CLOCK_MONOTONIC_RAW", + [5] = "CLOCK_REALTIME_COARSE", + [6] = "CLOCK_MONOTONIC_COARSE", + [7] = "CLOCK_BOOTTIME", + [8] = "CLOCK_REALTIME_ALARM", + [9] = "CLOCK_BOOTTIME_ALARM", + [10] = "CLOCK_SGI_CYCLE", + [11] = "CLOCK_TAI", +}; + +static void test_one_clock_gettime(int clock, const char *name) +{ + struct timespec start, vdso, end; + int vdso_ret, end_ret; + + printf("[RUN]\tTesting clock_gettime for clock %s (%d)...\n", name, clock); + + if (sys_clock_gettime(clock, &start) < 0) { + if (errno == EINVAL) { + vdso_ret = vdso_clock_gettime(clock, &vdso); + if (vdso_ret == -EINVAL) { + printf("[OK]\tNo such clock.\n"); + } else { + printf("[FAIL]\tNo such clock, but __vdso_clock_gettime returned %d\n", vdso_ret); + nerrs++; + } + } else { + printf("[WARN]\t clock_gettime(%d) syscall returned error %d\n", clock, errno); + } + return; + } + + vdso_ret = vdso_clock_gettime(clock, &vdso); + end_ret = sys_clock_gettime(clock, &end); + + if (vdso_ret != 0 || end_ret != 0) { + printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n", + vdso_ret, errno); + nerrs++; + return; + } + + printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n", + (unsigned long long)start.tv_sec, start.tv_nsec, + (unsigned long long)vdso.tv_sec, vdso.tv_nsec, + (unsigned long long)end.tv_sec, end.tv_nsec); + + if (!ts_leq(&start, &vdso) || !ts_leq(&vdso, &end)) { + printf("[FAIL]\tTimes are out of sequence\n"); + nerrs++; + } +} + +static void test_clock_gettime(void) +{ + for (int clock = 0; clock < 
sizeof(clocknames) / sizeof(clocknames[0]); + clock++) { + test_one_clock_gettime(clock, clocknames[clock]); + } + + /* Also test some invalid clock ids */ + test_one_clock_gettime(-1, "invalid"); + test_one_clock_gettime(INT_MIN, "invalid"); + test_one_clock_gettime(INT_MAX, "invalid"); +} + int main(int argc, char **argv) { fill_function_pointers(); + test_clock_gettime(); + + /* + * Test getcpu() last so that, if something goes wrong setting affinity, + * we still run the other tests. + */ test_getcpu(); return nerrs ? 1 : 0; From a9f9772114c8b07ae75bcb3654bd017461248095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Sep 2018 17:58:35 +0200 Subject: [PATCH 398/499] perf/core: Fix perf_pmu_unregister() locking When we unregister a PMU, we fail to serialize the @pmu_idr properly. Fix that by doing the entire thing under pmu_lock. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 2e80a82a49c4 ("perf: Dynamic pmu types") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index dcb093e7b377..dfb1d951789e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9431,9 +9431,7 @@ static void free_pmu_context(struct pmu *pmu) if (pmu->task_ctx_nr > perf_invalid_context) return; - mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); } /* @@ -9689,12 +9687,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - int remove_device; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -9706,13 +9700,14 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (remove_device) { + if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); + mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); From 6265adb9726098b7f4f7ca70bc51992b25fdd9d6 Mon Sep 17 00:00:00 2001 From: Masayoshi Mizuma Date: Mon, 10 Sep 2018 10:47:50 -0400 Subject: [PATCH 399/499] perf/x86/intel/uncore: Use boot_cpu_data.phys_proc_id instead of hardcorded physical package ID 0 Physical package id 0 doesn't always exist, we should use boot_cpu_data.phys_proc_id here. Signed-off-by: Masayoshi Mizuma Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: H. 
Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masayoshi Mizuma Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20180910144750.6782-1-msys.mizuma@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 51d7c117e3c7..53b981dcdb42 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3061,7 +3061,7 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; From cd6fb677ce7e460c25bdd66f689734102ec7d642 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 23 Sep 2018 18:13:43 +0200 Subject: [PATCH 400/499] perf/ring_buffer: Prevent concurrent ring buffer access Some of the scheduling tracepoints allow the perf_tp_event code to write to the ring buffer on a different CPU than the one the code is running on. This results in corrupted ring buffer data, as demonstrated by the following perf commands: # perf record -e 'sched:sched_switch,sched:sched_wakeup' perf bench sched messaging # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 0.383 [sec] [ perf record: Woken up 8 times to write data ] 0x42b890 [0]: failed to process type: -1765585640 [ perf record: Captured and wrote 4.825 MB perf.data (29669 samples) ] # perf report --stdio 0x42b890 [0]: failed to process type: -1765585640 The reason for the corruption is that some of the scheduling tracepoints have __perf_task defined and thus allow storing data to another CPU's ring buffer: sched_waking sched_wakeup sched_wakeup_new sched_stat_wait sched_stat_sleep sched_stat_iowait sched_stat_blocked The perf_tp_event function first stores samples for the current-CPU events defined for the tracepoint: hlist_for_each_entry_rcu(event, head, hlist_entry) perf_swevent_event(event, count, &data, regs); It then iterates the 'task' events and stores the sample for any of the task's events that pass the tracepoint checks: ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) continue; perf_swevent_event(event, count, &data, regs); } The above code can race with the same code running on another CPU, ending up with two CPUs trying to store into the same ring buffer, which is specifically not allowed. This patch prevents the problem by allowing only events bound to the current CPU to receive the event. NOTE: this requires the use of (per-task-)per-cpu buffers for this feature to work; perf-record does this.
Signed-off-by: Jiri Olsa [peterz: small edits to Changelog] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andrew Vagin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: e6dab5ffab59 ("perf/trace: Add ability to set a target task for events") Link: http://lkml.kernel.org/r/20180923161343.GB15054@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index dfb1d951789e..5a97f34bc14c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8314,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) From 9d92cfeaf5215158d26d2991be7f7ff865cb98f3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 21 Sep 2018 07:07:06 -0700 Subject: [PATCH 401/499] perf/x86/intel/uncore: Fix PCI BDF address of M3UPI on SKX The counters on M3UPI Link 0 and Link 3 don't count properly, and writing 0 to these counters may causes system crash on some machines. The PCI BDF addresses of the M3UPI in the current code are incorrect. The correct addresses should be: D18:F1 0x204D D18:F2 0x204E D18:F5 0x204D Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Link: http://lkml.kernel.org/r/1537538826-55489-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 53b981dcdb42..c07bee31abe8 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3931,16 +3931,16 @@ static const struct pci_device_id skx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3), }, { /* M3UPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0), }, { /* M3UPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1), }, { /* M3UPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2), }, { /* end: all zeroes */ } }; From d7cbbe49a9304520181fb8c9272d1327deec8453 Mon Sep 17 00:00:00 2001 From: "Natarajan, Janakarajan" Date: Thu, 27 Sep 2018 15:51:55 +0000 Subject: [PATCH 402/499] perf/x86/amd/uncore: Set ThreadMask and SliceMask for L3 Cache perf events In Family 17h, some L3 Cache Performance events require the ThreadMask and 
SliceMask to be set. For other events, these fields do not affect the count either way. Set ThreadMask and SliceMask to 0xFF and 0xF respectively. Signed-off-by: Janakarajan Natarajan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H . Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Suravee Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/Message-ID: Signed-off-by: Ingo Molnar --- arch/x86/events/amd/uncore.c | 10 ++++++++++ arch/x86/include/asm/perf_event.h | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 981ba5e8241b..8671de126eac 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -36,6 +36,7 @@ static int num_counters_llc; static int num_counters_nb; +static bool l3_mask; static HLIST_HEAD(uncore_unused_list); @@ -209,6 +210,13 @@ static int amd_uncore_event_init(struct perf_event *event) hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; hwc->idx = -1; + /* + * SliceMask and ThreadMask need to be set for certain L3 events in + * Family 17h. For other events, the two fields do not affect the count. + */ + if (l3_mask) + hwc->config |= (AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK); + if (event->cpu < 0) return -EINVAL; @@ -525,6 +533,7 @@ static int __init amd_uncore_init(void) amd_llc_pmu.name = "amd_l3"; format_attr_event_df.show = &event_show_df; format_attr_event_l3.show = &event_show_l3; + l3_mask = true; } else { num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L2; @@ -532,6 +541,7 @@ static int __init amd_uncore_init(void) amd_llc_pmu.name = "amd_l2"; format_attr_event_df = format_attr_event; format_attr_event_l3 = format_attr_event; + l3_mask = false; } amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 12f54082f4c8..78241b736f2a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -46,6 +46,14 @@ #define INTEL_ARCH_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT) +#define AMD64_L3_SLICE_SHIFT 48 +#define AMD64_L3_SLICE_MASK \ + ((0xFULL) << AMD64_L3_SLICE_SHIFT) + +#define AMD64_L3_THREAD_SHIFT 56 +#define AMD64_L3_THREAD_MASK \ + ((0xFFULL) << AMD64_L3_THREAD_SHIFT) + #define X86_RAW_EVENT_MASK \ (ARCH_PERFMON_EVENTSEL_EVENT | \ ARCH_PERFMON_EVENTSEL_UMASK | \ From a4739eca4456e3d140cc656c5331d42b7465f91d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:18:56 +0530 Subject: [PATCH 403/499] sched/numa: Stop multiple tasks from moving to the CPU at the same time Task migration under NUMA balancing can happen in parallel. More than one task might choose to migrate to the same CPU at the same time. This can result in: - During task swap, choosing a task that was not part of the evaluation. - During task swap, task which just got moved into its preferred node, moving to a completely different node. - During task swap, task failing to move to the preferred node, will have to wait an extra interval for the next migrate opportunity. - During task movement, multiple task movements can cause load imbalance. This problem is more likely if there are more cores per node or more nodes in the system. Use a per run-queue variable to check if NUMA-balance is active on the run-queue. 
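In outline, the diff below wraps the NUMA migration decision in a per-runqueue claim/release (a condensed sketch of the pattern only; the real code also hands the claim over to a new runqueue when a better destination CPU is found):

	/* claim: only one NUMA-balancing decision may target this rq at a time */
	if (xchg(&rq->numa_migrate_on, 1))
		return;		/* rq already claimed by another task, back off */

	/* ... evaluate and perform the task move or swap ... */

	/* release the claim once the migration attempt has been resolved */
	WRITE_ONCE(rq->numa_migrate_on, 0);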
Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 200194 203353 1.57797 1 311331 328205 5.41995 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 197654 214384 8.46429 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 192605 188553 -2.10379 1 213402 196273 -8.02664 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 52227.1 57581.2 10.2516 1 102529 103468 0.915838 There is a regression on power 9 box. If we look at the details, that box has a sudden jump in cache-misses with this patch. All other parameters seem to be pointing towards NUMA consolidation. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,345,784 13,941,377 migrations 1,127,820 1,157,323 faults 374,736 382,175 cache-misses 55,132,054,603 54,993,823,500 sched:sched_move_numa 1,923 2,005 sched:sched_stick_numa 52 14 sched:sched_swap_numa 595 529 migrate:mm_migrate_pages 1,932 1,573 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 60605 67099 numa_hint_faults_local 51804 58456 numa_hit 239945 240416 numa_huge_pte_updates 14 18 numa_interleave 60 65 numa_local 239865 240339 numa_other 80 77 numa_pages_migrated 1931 1574 numa_pte_updates 67823 77182 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,016,467 3,176,453 migrations 37,326 30,238 faults 115,342 87,869 cache-misses 11,692,155,554 12,544,479,391 sched:sched_move_numa 965 23 sched:sched_stick_numa 8 0 sched:sched_swap_numa 35 6 migrate:mm_migrate_pages 1,168 10 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 16286 236 numa_hint_faults_local 11863 201 numa_hit 112482 72293 numa_huge_pte_updates 33 0 numa_interleave 20 26 numa_local 112419 72233 numa_other 63 60 numa_pages_migrated 1144 8 numa_pte_updates 32859 0 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,629,724 8,478,820 migrations 221,052 171,323 faults 308,661 307,499 cache-misses 135,574,913 240,353,599 sched:sched_move_numa 147 214 sched:sched_stick_numa 0 0 sched:sched_swap_numa 2 4 migrate:mm_migrate_pages 64 89 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 11481 5301 numa_hint_faults_local 10968 4745 numa_hit 89773 92943 numa_huge_pte_updates 0 0 numa_interleave 1116 899 numa_local 89220 92345 numa_other 553 598 numa_pages_migrated 62 88 numa_pte_updates 11694 5505 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,272,887 2,066,172 migrations 12,206 11,076 faults 163,704 149,544 cache-misses 4,801,186 10,398,067 sched:sched_move_numa 44 43 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 17 6 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 2261 3552 numa_hint_faults_local 1993 3347 numa_hit 25726 25611 numa_huge_pte_updates 0 0 numa_interleave 239 213 numa_local 25498 25583 numa_other 228 28 numa_pages_migrated 17 6 numa_pte_updates 2266 3535 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 117,980,962 99,358,136 migrations 3,950,220 4,041,607 faults 736,979 749,653 cache-misses 224,976,072,879 225,562,543,251 sched:sched_move_numa 504 771 sched:sched_stick_numa 50 14 sched:sched_swap_numa 239 204 migrate:mm_migrate_pages 1,260 1,180 vmstat 8th warehouse 
Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 18293 27409 numa_hint_faults_local 11969 20677 numa_hit 240854 239988 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 240851 239983 numa_other 3 5 numa_pages_migrated 1190 1016 numa_pte_updates 18106 27916 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 61,053,158 60,899,307 migrations 551,586 544,668 faults 244,174 270,834 cache-misses 74,326,766,973 74,543,455,635 sched:sched_move_numa 344 735 sched:sched_stick_numa 24 25 sched:sched_swap_numa 140 174 migrate:mm_migrate_pages 568 816 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 6461 11059 numa_hint_faults_local 2283 4733 numa_hit 35661 41384 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 35661 41383 numa_other 0 1 numa_pages_migrated 568 815 numa_pte_updates 6518 11323 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 22 ++++++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..3b0b75de1141 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1514,6 +1514,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. 
+ */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1569,6 +1584,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1710,6 +1728,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1811,14 +1830,17 @@ static int task_numa_migrate(struct task_struct *p) */ p->numa_scan_period = task_scan_start(p); + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..0b9161241bda 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -783,6 +783,7 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; + unsigned int numa_migrate_on; #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; From 1327237a5978b00bcc665c33046c9bae75da1154 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:18:57 +0530 Subject: [PATCH 404/499] sched/numa: Pass destination CPU as a parameter to migrate_task_rq This additional parameter (new_cpu) is used later for identifying if task migration is across nodes. No functional change. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 203353 200668 -1.32036 1 328205 321791 -1.95427 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 214384 204848 -4.44809 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 188553 188098 -0.241311 1 196273 200351 2.07772 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 57581.2 58145.9 0.980702 1 103468 103798 0.318939 Brings out the variance between different specjbb2005 runs. Some events stats before and after applying the patch. 
perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,941,377 13,912,183 migrations 1,157,323 1,155,931 faults 382,175 367,139 cache-misses 54,993,823,500 54,240,196,814 sched:sched_move_numa 2,005 1,571 sched:sched_stick_numa 14 9 sched:sched_swap_numa 529 463 migrate:mm_migrate_pages 1,573 703 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 67099 50155 numa_hint_faults_local 58456 45264 numa_hit 240416 239652 numa_huge_pte_updates 18 36 numa_interleave 65 68 numa_local 240339 239576 numa_other 77 76 numa_pages_migrated 1574 680 numa_pte_updates 77182 71146 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,176,453 3,156,720 migrations 30,238 30,354 faults 87,869 97,261 cache-misses 12,544,479,391 12,400,026,826 sched:sched_move_numa 23 4 sched:sched_stick_numa 0 0 sched:sched_swap_numa 6 1 migrate:mm_migrate_pages 10 20 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 236 272 numa_hint_faults_local 201 186 numa_hit 72293 71362 numa_huge_pte_updates 0 0 numa_interleave 26 23 numa_local 72233 71299 numa_other 60 63 numa_pages_migrated 8 2 numa_pte_updates 0 0 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,478,820 8,606,824 migrations 171,323 155,352 faults 307,499 301,409 cache-misses 240,353,599 157,759,224 sched:sched_move_numa 214 168 sched:sched_stick_numa 0 0 sched:sched_swap_numa 4 3 migrate:mm_migrate_pages 89 125 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 5301 4650 numa_hint_faults_local 4745 3946 numa_hit 92943 90489 numa_huge_pte_updates 0 0 numa_interleave 899 892 numa_local 92345 90034 numa_other 598 455 numa_pages_migrated 88 124 numa_pte_updates 5505 4818 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,066,172 2,113,167 migrations 11,076 10,533 faults 149,544 142,727 cache-misses 10,398,067 5,594,192 sched:sched_move_numa 43 10 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 6 6 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 3552 744 numa_hint_faults_local 3347 584 numa_hit 25611 25551 numa_huge_pte_updates 0 0 numa_interleave 213 263 numa_local 25583 25302 numa_other 28 249 numa_pages_migrated 6 6 numa_pte_updates 3535 744 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 99,358,136 101,227,352 migrations 4,041,607 4,151,829 faults 749,653 745,233 cache-misses 225,562,543,251 224,669,561,766 sched:sched_move_numa 771 617 sched:sched_stick_numa 14 2 sched:sched_swap_numa 204 187 migrate:mm_migrate_pages 1,180 316 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 27409 24195 numa_hint_faults_local 20677 21639 numa_hit 239988 238331 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 239983 238331 numa_other 5 0 numa_pages_migrated 1016 204 numa_pte_updates 27916 24561 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 60,899,307 62,738,978 migrations 544,668 562,702 faults 270,834 228,465 cache-misses 74,543,455,635 75,778,067,952 sched:sched_move_numa 735 648 sched:sched_stick_numa 25 13 sched:sched_swap_numa 174 137 migrate:mm_migrate_pages 816 733 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event 
Before After numa_hint_faults 11059 10281 numa_hint_faults_local 4733 3242 numa_hit 41384 36338 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 41383 36338 numa_other 1 0 numa_pages_migrated 815 706 numa_pte_updates 11323 10176 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..ad97f3ba5ec5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p); + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 997ea7b839fa..91e4202b0634 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1607,7 +1607,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) return cpu; } -static void migrate_task_rq_dl(struct task_struct *p) +static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b0b75de1141..bc768156239f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6297,7 +6297,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unused) { /* * As blocked tasks retain absolute vruntime the migration needs to diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0b9161241bda..455fa330de04 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1524,7 +1524,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); From 3f9672baaa70fc62765857f13f007feb01f9ad33 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:18:58 +0530 Subject: [PATCH 405/499] sched/numa: Reset scan rate whenever task moves across nodes Currently task scan rate is reset when NUMA balancer migrates the task to a different node. If NUMA balancer initiates a swap, reset is only applicable to the task that initiates the swap. Similarly no scan rate reset is done if the task is migrated across nodes by traditional load balancer. Instead move the scan reset to the migrate_task_rq. This ensures the task moved out of its preferred node, either gets back to its preferred node quickly or finds a new preferred node. Doing so, would be fair to all tasks migrating across nodes. 
Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 200668 203370 1.3465 1 321791 328431 2.06345 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 204848 206070 0.59654 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 188098 188386 0.153112 1 200351 201566 0.606436 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 58145.9 59157.4 1.73959 1 103798 105495 1.63491 Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,912,183 13,825,492 migrations 1,155,931 1,152,509 faults 367,139 371,948 cache-misses 54,240,196,814 55,654,206,041 sched:sched_move_numa 1,571 1,856 sched:sched_stick_numa 9 4 sched:sched_swap_numa 463 428 migrate:mm_migrate_pages 703 898 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 50155 57146 numa_hint_faults_local 45264 51612 numa_hit 239652 238164 numa_huge_pte_updates 36 16 numa_interleave 68 63 numa_local 239576 238085 numa_other 76 79 numa_pages_migrated 680 883 numa_pte_updates 71146 67540 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,156,720 3,288,525 migrations 30,354 38,652 faults 97,261 111,678 cache-misses 12,400,026,826 12,111,197,376 sched:sched_move_numa 4 900 sched:sched_stick_numa 0 0 sched:sched_swap_numa 1 5 migrate:mm_migrate_pages 20 714 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 272 18572 numa_hint_faults_local 186 14850 numa_hit 71362 73197 numa_huge_pte_updates 0 11 numa_interleave 23 25 numa_local 71299 73138 numa_other 63 59 numa_pages_migrated 2 712 numa_pte_updates 0 24021 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,606,824 8,451,543 migrations 155,352 202,804 faults 301,409 310,024 cache-misses 157,759,224 253,522,507 sched:sched_move_numa 168 213 sched:sched_stick_numa 0 0 sched:sched_swap_numa 3 2 migrate:mm_migrate_pages 125 88 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 4650 11830 numa_hint_faults_local 3946 11301 numa_hit 90489 90038 numa_huge_pte_updates 0 0 numa_interleave 892 855 numa_local 90034 89796 numa_other 455 242 numa_pages_migrated 124 88 numa_pte_updates 4818 12039 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,113,167 2,049,153 migrations 10,533 11,405 faults 142,727 162,309 cache-misses 5,594,192 7,203,343 sched:sched_move_numa 10 22 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 6 1 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 744 1693 numa_hint_faults_local 584 1669 numa_hit 25551 25177 numa_huge_pte_updates 0 0 numa_interleave 263 194 numa_local 25302 24993 numa_other 249 184 numa_pages_migrated 6 1 numa_pte_updates 744 1577 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 101,227,352 94,515,937 migrations 4,151,829 4,203,554 faults 745,233 832,697 cache-misses 224,669,561,766 226,248,698,331 sched:sched_move_numa 617 1,730 sched:sched_stick_numa 2 14 sched:sched_swap_numa 187 432 migrate:mm_migrate_pages 316 1,398 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 24195 80079 numa_hint_faults_local 21639 68620 numa_hit 238331 241187 
numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 238331 241186 numa_other 0 1 numa_pages_migrated 204 1347 numa_pte_updates 24561 80729 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 62,738,978 63,704,961 migrations 562,702 573,404 faults 228,465 230,878 cache-misses 75,778,067,952 76,568,222,781 sched:sched_move_numa 648 509 sched:sched_stick_numa 13 31 sched:sched_swap_numa 137 182 migrate:mm_migrate_pages 733 541 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 10281 8501 numa_hint_faults_local 3242 2960 numa_hit 36338 35526 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 36338 35526 numa_other 0 0 numa_pages_migrated 706 539 numa_pte_updates 10176 8433 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc768156239f..5cbfb3068bc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1824,12 +1824,6 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -2618,6 +2612,18 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid != dst_nid) + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2631,6 +2637,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -6297,7 +6307,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unused) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6350,6 +6360,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unus /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) From 05cbdf4f5c191ff378c47bbf66d7230beb725bdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 21 Sep 2018 23:18:59 +0530 Subject: [PATCH 406/499] sched/numa: Limit the conditions where scan period is reset migrate_task_rq_fair() resets the scan rate for NUMA balancing on every cross-node migration. 
In the event of excessive load balancing due to saturation, this may result in the scan rate being pegged at maximum and further overloading the machine. This patch only resets the scan if NUMA balancing is active, a preferred node has been selected and the task is being migrated from the preferred node as these are the most harmful. For example, a migration to the preferred node does not justify a faster scan rate. Similarly, a migration between two nodes that are not preferred is probably bouncing due to over-saturation of the machine. In that case, scanning faster and trapping more NUMA faults will further overload the machine. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 203370 205332 0.964744 1 328431 319785 -2.63252 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 206070 206585 0.249915 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 188386 189162 0.41192 1 201566 213760 6.04963 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 59157.4 58736.8 -0.710985 1 105495 105419 -0.0720413 Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,825,492 14,285,708 migrations 1,152,509 1,180,621 faults 371,948 339,114 cache-misses 55,654,206,041 55,205,631,894 sched:sched_move_numa 1,856 843 sched:sched_stick_numa 4 6 sched:sched_swap_numa 428 219 migrate:mm_migrate_pages 898 365 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 57146 26907 numa_hint_faults_local 51612 24279 numa_hit 238164 239771 numa_huge_pte_updates 16 0 numa_interleave 63 68 numa_local 238085 239688 numa_other 79 83 numa_pages_migrated 883 363 numa_pte_updates 67540 27415 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,288,525 3,202,779 migrations 38,652 37,186 faults 111,678 106,076 cache-misses 12,111,197,376 12,024,873,744 sched:sched_move_numa 900 931 sched:sched_stick_numa 0 0 sched:sched_swap_numa 5 1 migrate:mm_migrate_pages 714 637 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 18572 17409 numa_hint_faults_local 14850 14367 numa_hit 73197 73953 numa_huge_pte_updates 11 20 numa_interleave 25 25 numa_local 73138 73892 numa_other 59 61 numa_pages_migrated 712 668 numa_pte_updates 24021 27276 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,451,543 8,474,013 migrations 202,804 254,934 faults 310,024 320,506 cache-misses 253,522,507 110,580,458 sched:sched_move_numa 213 725 sched:sched_stick_numa 0 0 sched:sched_swap_numa 2 7 migrate:mm_migrate_pages 88 145 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 11830 22797 numa_hint_faults_local 11301 21539 numa_hit 90038 89308 numa_huge_pte_updates 0 0 numa_interleave 855 865 numa_local 89796 88955 numa_other 242 353 numa_pages_migrated 88 149 numa_pte_updates 12039 22930 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,049,153 2,195,628 migrations 11,405 11,179 faults 162,309 149,656 cache-misses 7,203,343 8,117,515 sched:sched_move_numa 22 49 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 1 5 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 1693 3577 numa_hint_faults_local 1669 3476 numa_hit 25177 26142 
numa_huge_pte_updates 0 0 numa_interleave 194 358 numa_local 24993 26042 numa_other 184 100 numa_pages_migrated 1 5 numa_pte_updates 1577 3587 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 94,515,937 100,602,296 migrations 4,203,554 4,135,630 faults 832,697 789,256 cache-misses 226,248,698,331 226,160,621,058 sched:sched_move_numa 1,730 1,366 sched:sched_stick_numa 14 16 sched:sched_swap_numa 432 374 migrate:mm_migrate_pages 1,398 1,350 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 80079 47857 numa_hint_faults_local 68620 39768 numa_hit 241187 240165 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 241186 240165 numa_other 1 0 numa_pages_migrated 1347 1224 numa_pte_updates 80729 48354 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 63,704,961 58,515,496 migrations 573,404 564,845 faults 230,878 245,807 cache-misses 76,568,222,781 73,603,757,976 sched:sched_move_numa 509 996 sched:sched_stick_numa 31 10 sched:sched_swap_numa 182 193 migrate:mm_migrate_pages 541 646 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 8501 13422 numa_hint_faults_local 2960 5619 numa_hit 35526 36118 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 35526 36116 numa_other 0 2 numa_pages_migrated 539 616 numa_pte_updates 8433 13374 Signed-off-by: Mel Gorman Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-5-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cbfb3068bc6..3529bf61826b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2617,11 +2617,32 @@ static void update_scan_period(struct task_struct *p, int new_cpu) int src_nid = cpu_to_node(task_cpu(p)); int dst_nid = cpu_to_node(new_cpu); + if (!static_branch_likely(&sched_numa_balancing)) + return; + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) return; - if (src_nid != dst_nid) - p->numa_scan_period = task_scan_start(p); + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); } #else From 7534612123e0f5d020aba1076a6bb505db0e6bfe Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:19:00 +0530 Subject: [PATCH 407/499] mm/migrate: Use spin_trylock() while resetting rate limit Since this spinlock will only serialize the migrate rate limiting, convert the spin_lock() to a spin_trylock(). If another thread is updating, this task can move on. 
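In outline, the window update becomes opportunistic rather than blocking (condensed from the hunk in the diff below; the real code advances next_window by whole intervals until it passes jiffies):

	if (time_after(jiffies, next_window) &&
	    spin_trylock(&pgdat->numabalancing_migrate_lock)) {
		/* the trylock winner resets the counter and advances the window */
		pgdat->numabalancing_migrate_nr_pages = 0;
		WRITE_ONCE(pgdat->numabalancing_migrate_next_window,
			   next_window + interval);
		spin_unlock(&pgdat->numabalancing_migrate_lock);
	}
	/* a task that loses the trylock skips the update and carries on */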
Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 205332 198512 -3.32145 1 319785 313559 -1.94693 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 8 74912 74761.9 -0.200368 1 206585 214874 4.01239 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 189162 180536 -4.56011 1 213760 210281 -1.62753 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 58736.8 56511.4 -3.78877 1 105419 104899 -0.49327 Avoiding stretching of window intervals may be the reason for the regression. Also code now uses READ_ONCE/WRITE_ONCE. That may also be hurting performance to some extent. Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 14,285,708 13,818,546 migrations 1,180,621 1,149,960 faults 339,114 385,583 cache-misses 55,205,631,894 55,259,546,768 sched:sched_move_numa 843 2,257 sched:sched_stick_numa 6 9 sched:sched_swap_numa 219 512 migrate:mm_migrate_pages 365 2,225 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 26907 72692 numa_hint_faults_local 24279 62270 numa_hit 239771 238762 numa_huge_pte_updates 0 48 numa_interleave 68 75 numa_local 239688 238676 numa_other 83 86 numa_pages_migrated 363 2225 numa_pte_updates 27415 98557 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,202,779 3,173,490 migrations 37,186 36,966 faults 106,076 108,776 cache-misses 12,024,873,744 12,200,075,320 sched:sched_move_numa 931 1,264 sched:sched_stick_numa 0 0 sched:sched_swap_numa 1 0 migrate:mm_migrate_pages 637 899 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 17409 21109 numa_hint_faults_local 14367 17120 numa_hit 73953 72934 numa_huge_pte_updates 20 42 numa_interleave 25 33 numa_local 73892 72866 numa_other 61 68 numa_pages_migrated 668 915 numa_pte_updates 27276 42326 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,474,013 8,312,022 migrations 254,934 231,705 faults 320,506 310,242 cache-misses 110,580,458 402,324,573 sched:sched_move_numa 725 193 sched:sched_stick_numa 0 0 sched:sched_swap_numa 7 3 migrate:mm_migrate_pages 145 93 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 22797 11838 numa_hint_faults_local 21539 11216 numa_hit 89308 90689 numa_huge_pte_updates 0 0 numa_interleave 865 1579 numa_local 88955 89634 numa_other 353 1055 numa_pages_migrated 149 92 numa_pte_updates 22930 12109 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,195,628 2,170,481 migrations 11,179 10,126 faults 149,656 160,962 cache-misses 8,117,515 10,834,845 sched:sched_move_numa 49 10 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 5 2 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 3577 403 numa_hint_faults_local 3476 358 numa_hit 26142 25898 numa_huge_pte_updates 0 0 numa_interleave 358 207 numa_local 26042 25860 numa_other 100 38 numa_pages_migrated 5 2 numa_pte_updates 3587 400 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 100,602,296 110,339,633 migrations 4,135,630 4,139,812 faults 789,256 863,622 cache-misses 226,160,621,058 231,838,045,660 sched:sched_move_numa 1,366 2,196 sched:sched_stick_numa 16 33 
sched:sched_swap_numa 374 544 migrate:mm_migrate_pages 1,350 2,469 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 47857 85748 numa_hint_faults_local 39768 66831 numa_hit 240165 242213 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 240165 242211 numa_other 0 2 numa_pages_migrated 1224 2376 numa_pte_updates 48354 86233 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 58,515,496 59,331,057 migrations 564,845 552,019 faults 245,807 266,586 cache-misses 73,603,757,976 73,796,312,990 sched:sched_move_numa 996 981 sched:sched_stick_numa 10 54 sched:sched_swap_numa 193 286 migrate:mm_migrate_pages 646 713 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 13422 14807 numa_hint_faults_local 5619 5738 numa_hit 36118 36230 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 36116 36228 numa_other 2 2 numa_pages_migrated 616 703 numa_pte_updates 13374 14742 Suggested-by: Peter Zijlstra Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-6-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- mm/migrate.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d6a2e89b086a..4f1d894835b5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1867,16 +1867,24 @@ static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) { + unsigned long next_window, interval; + + next_window = READ_ONCE(pgdat->numabalancing_migrate_next_window); + interval = msecs_to_jiffies(migrate_interval_millisecs); + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! */ - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { - spin_lock(&pgdat->numabalancing_migrate_lock); + if (time_after(jiffies, next_window) && + spin_trylock(&pgdat->numabalancing_migrate_lock)) { pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies + - msecs_to_jiffies(migrate_interval_millisecs); + do { + next_window += interval; + } while (unlikely(time_after(jiffies, next_window))); + + WRITE_ONCE(pgdat->numabalancing_migrate_next_window, next_window); spin_unlock(&pgdat->numabalancing_migrate_lock); } if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { From 6fd98e775f24fd41520928d345f5db3ff52bb35d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:19:01 +0530 Subject: [PATCH 408/499] sched/numa: Avoid task migration for small NUMA improvement If NUMA improvement from the task migration is going to be very minimal, then avoid task migration. 
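The threshold check that implements this is small; restated in isolation it looks like the sketch below (the SMALLIMP value and the comparison mirror the hunk further down in this patch, the helper name is made up):

#define SMALLIMP	30	/* roughly 1998/64, where 1998 (2*999) is the maximum NUMA importance */

static int improvement_worth_migrating(long imp, long best_imp)
{
	/* Reject tiny gains, and gains that only marginally beat the current best. */
	if (imp < SMALLIMP || imp <= best_imp + SMALLIMP / 2)
		return 0;
	return 1;
}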
Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 198512 205910 3.72673 1 313559 318491 1.57291 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 8 74761.9 74935.9 0.232739 1 214874 226796 5.54837 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 180536 189780 5.12031 1 210281 205695 -2.18089 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 56511.4 60370 6.828 1 104899 108100 3.05151 1/7 cases is regressing, if we look at events migrate_pages seem to vary the most especially in the regressing case. Also some amount of variance is expected between different runs of Specjbb2005. Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,818,546 13,801,554 migrations 1,149,960 1,151,541 faults 385,583 433,246 cache-misses 55,259,546,768 55,168,691,835 sched:sched_move_numa 2,257 2,551 sched:sched_stick_numa 9 24 sched:sched_swap_numa 512 904 migrate:mm_migrate_pages 2,225 1,571 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 72692 113682 numa_hint_faults_local 62270 102163 numa_hit 238762 240181 numa_huge_pte_updates 48 36 numa_interleave 75 64 numa_local 238676 240103 numa_other 86 78 numa_pages_migrated 2225 1564 numa_pte_updates 98557 134080 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,173,490 3,079,150 migrations 36,966 31,455 faults 108,776 99,081 cache-misses 12,200,075,320 11,588,126,740 sched:sched_move_numa 1,264 1 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 899 36 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 21109 430 numa_hint_faults_local 17120 77 numa_hit 72934 71277 numa_huge_pte_updates 42 0 numa_interleave 33 22 numa_local 72866 71218 numa_other 68 59 numa_pages_migrated 915 23 numa_pte_updates 42326 0 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,312,022 8,707,565 migrations 231,705 171,342 faults 310,242 310,820 cache-misses 402,324,573 136,115,400 sched:sched_move_numa 193 215 sched:sched_stick_numa 0 6 sched:sched_swap_numa 3 24 migrate:mm_migrate_pages 93 162 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 11838 8985 numa_hint_faults_local 11216 8154 numa_hit 90689 93819 numa_huge_pte_updates 0 0 numa_interleave 1579 882 numa_local 89634 93496 numa_other 1055 323 numa_pages_migrated 92 169 numa_pte_updates 12109 9217 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,170,481 2,152,072 migrations 10,126 10,704 faults 160,962 164,376 cache-misses 10,834,845 3,818,437 sched:sched_move_numa 10 16 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 7 migrate:mm_migrate_pages 2 199 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 403 2248 numa_hint_faults_local 358 1666 numa_hit 25898 25704 numa_huge_pte_updates 0 0 numa_interleave 207 200 numa_local 25860 25679 numa_other 38 25 numa_pages_migrated 2 197 numa_pte_updates 400 2234 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 110,339,633 93,330,595 migrations 4,139,812 4,122,061 faults 863,622 865,979 cache-misses 231,838,045,660 225,395,083,479 sched:sched_move_numa 2,196 2,372 sched:sched_stick_numa 33 24 
sched:sched_swap_numa 544 769 migrate:mm_migrate_pages 2,469 1,677 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 85748 91638 numa_hint_faults_local 66831 78096 numa_hit 242213 242225 numa_huge_pte_updates 0 0 numa_interleave 0 2 numa_local 242211 242219 numa_other 2 6 numa_pages_migrated 2376 1515 numa_pte_updates 86233 92274 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 59,331,057 51,487,271 migrations 552,019 537,170 faults 266,586 256,921 cache-misses 73,796,312,990 70,073,831,187 sched:sched_move_numa 981 576 sched:sched_stick_numa 54 24 sched:sched_swap_numa 286 327 migrate:mm_migrate_pages 713 726 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 14807 12000 numa_hint_faults_local 5738 5024 numa_hit 36230 36470 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 36228 36465 numa_other 2 5 numa_pages_migrated 703 726 numa_pte_updates 14742 11930 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-7-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3529bf61826b..25c7c7e09cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1567,6 +1567,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, return (imb > old_imb); } +/* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1600,7 +1607,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1643,15 +1650,21 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } + /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + /* * In the overloaded case, try and keep the load balanced. */ From 12d43deb1ee639d01a2a8d2a7a4cc8ad31224475 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 1 Oct 2018 17:31:17 +0200 Subject: [PATCH 409/499] drm: fix use-after-free read in drm_mode_create_lease_ioctl() fd_install() moves the reference given to it into the file descriptor table of the current process. If the current process is multithreaded, then immediately after fd_install(), another thread can close() the file descriptor and cause the file's resources to be cleaned up. Since the reference to "lessee" is held by the file, we must not access "lessee" after the fd_install() call. As far as I can tell, to reach this codepath, the caller must have an open file descriptor to a DRI device in master mode. 
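The rule being applied here can be shown with a tiny userspace analog (hypothetical struct and names, memory-ordering details ignored): copy out whatever is still needed before an object is published to other threads, and never touch it afterwards.

struct lease {
	int lessee_id;
};

static void publish_lease(struct lease *lessee, struct lease **shared_slot,
			  int *out_id)
{
	*out_id = lessee->lessee_id;	/* read the object first ... */
	*shared_slot = lessee;		/* ... then publish it (the fd_install() step) */
	/* no dereference of 'lessee' past this point */
}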
I'm not sure what the requirements for that are. Signed-off-by: Jann Horn Fixes: 62884cd386b8 ("drm: Add four ioctls for managing drm mode object leases [v7]") Cc: stable@vger.kernel.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20181001153117.216923-1-jannh@google.com --- drivers/gpu/drm/drm_lease.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c index b54fb78a283c..b82da96ded5c 100644 --- a/drivers/gpu/drm/drm_lease.c +++ b/drivers/gpu/drm/drm_lease.c @@ -566,14 +566,14 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, lessee_priv->is_master = 1; lessee_priv->authenticated = 1; - /* Hook up the fd */ - fd_install(fd, lessee_file); - /* Pass fd back to userspace */ DRM_DEBUG_LEASE("Returning fd %d id %d\n", fd, lessee->lessee_id); cl->fd = fd; cl->lessee_id = lessee->lessee_id; + /* Hook up the fd */ + fd_install(fd, lessee_file); + DRM_DEBUG_LEASE("drm_mode_create_lease_ioctl succeeded\n"); return 0; From 6d348925b306ab0cc9757a09a8cea6bf288018e4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 1 Oct 2018 15:28:56 +0100 Subject: [PATCH 410/499] MAINTAINERS: Remove dead path from LOCKING PRIMITIVES entry Since 890658b7ab48 ("locking/mutex: Kill arch specific code"), there are no mutex header files under arch/, so we can remove the redundant entry from MAINTAINERS. Reported-by: Joe Perches Signed-off-by: Will Deacon Cc: Jason Low Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181001142856.GC9716@arm.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a255240d1452..3bd8913b2d78 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8598,7 +8598,6 @@ F: include/linux/spinlock*.h F: arch/*/include/asm/spinlock*.h F: include/linux/rwlock*.h F: include/linux/mutex*.h -F: arch/*/include/asm/mutex*.h F: include/linux/rwsem*.h F: arch/*/include/asm/rwsem.h F: include/linux/seqlock.h From efaffc5e40aeced0bcb497ed7a0a5b8c14abfcdf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 1 Oct 2018 11:05:24 +0100 Subject: [PATCH 411/499] mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration Rate limiting of page migrations due to automatic NUMA balancing was introduced to mitigate the worst-case scenario of migrating at high frequency due to false sharing or slowly ping-ponging between nodes. Since then, a lot of effort was spent on correctly identifying these pages and avoiding unnecessary migrations and the safety net may no longer be required. Jirka Hladky reported a regression in 4.17 due to a scheduler patch that avoids spreading STREAM tasks wide prematurely. However, once the task was properly placed, it delayed migrating the memory due to rate limiting. Increasing the limit fixed the problem for him. Currently, the limit is hard-coded and does not account for the real capabilities of the hardware. Even if an estimate was attempted, it would not properly account for the number of memory controllers and it could not account for the amount of bandwidth used for normal accesses. Rather than fudging, this patch simply eliminates the rate limiting. However, Jirka reports that a STREAM configuration using multiple processes achieved similar performance to 4.16. In local tests, this patch improved performance of STREAM relative to the baseline but it is somewhat machine-dependent. 
Most workloads show little or no performance difference implying that there is not a heavy reliance on the throttling mechanism and it is safe to remove. STREAM on 2-socket machine 4.19.0-rc5 4.19.0-rc5 numab-v1r1 noratelimit-v1r1 MB/sec copy 43298.52 ( 0.00%) 44673.38 ( 3.18%) MB/sec scale 30115.06 ( 0.00%) 31293.06 ( 3.91%) MB/sec add 32825.12 ( 0.00%) 34883.62 ( 6.27%) MB/sec triad 32549.52 ( 0.00%) 34906.60 ( 7.24%) Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: Jirka Hladky Cc: Linus Torvalds Cc: Linux-MM Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181001100525.29789-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- include/linux/mmzone.h | 6 ---- include/trace/events/migrate.h | 27 -------------- mm/migrate.c | 65 ---------------------------------- mm/page_alloc.c | 2 -- 4 files changed, 100 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e22d96734e0..3f4c0b167333 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -671,12 +671,6 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA_BALANCING /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; - - /* Rate limiting time interval */ - unsigned long numabalancing_migrate_next_window; - - /* Number of pages migrated during the rate limiting time interval */ - unsigned long numabalancing_migrate_nr_pages; #endif /* * This is a per-node reserve of pages that are not available diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 711372845945..705b33d1e395 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -70,33 +70,6 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); - -TRACE_EVENT(mm_numa_migrate_ratelimit, - - TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), - - TP_ARGS(p, dst_nid, nr_pages), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN) - __field( pid_t, pid) - __field( int, dst_nid) - __field( unsigned long, nr_pages) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->dst_nid = dst_nid; - __entry->nr_pages = nr_pages; - ), - - TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", - __entry->comm, - __entry->pid, - __entry->dst_nid, - __entry->nr_pages) -); #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/migrate.c b/mm/migrate.c index 4f1d894835b5..5e285c1249a0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1855,54 +1855,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, return newpage; } -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if the node is migrate rate-limited after the update */ -static bool numamigrate_update_ratelimit(pg_data_t *pgdat, - unsigned long nr_pages) - { - unsigned long next_window, interval; - - next_window = READ_ONCE(pgdat->numabalancing_migrate_next_window); - interval = msecs_to_jiffies(migrate_interval_millisecs); - - /* - * Rate-limit the amount of data that is being migrated to a node.
- * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (time_after(jiffies, next_window) && - spin_trylock(&pgdat->numabalancing_migrate_lock)) { - pgdat->numabalancing_migrate_nr_pages = 0; - do { - next_window += interval; - } while (unlikely(time_after(jiffies, next_window))); - - WRITE_ONCE(pgdat->numabalancing_migrate_next_window, next_window); - spin_unlock(&pgdat->numabalancing_migrate_lock); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, - nr_pages); - return true; - } - - /* - * This is an unlocked non-atomic update so errors are possible. - * The consequences are failing to migrate when we potentiall should - * have which is not severe enough to warrant locking. If it is ever - * a problem, it can be converted to a per-cpu counter. - */ - pgdat->numabalancing_migrate_nr_pages += nr_pages; - return false; -} - static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; @@ -1975,14 +1927,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (page_is_file_cache(page) && PageDirty(page)) goto out; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, 1)) - goto out; - isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; @@ -2029,14 +1973,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2133,7 +2069,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); -out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d2a2ab3fe6..706a738c0aee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6197,8 +6197,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, static void pgdat_init_numabalancing(struct pglist_data *pgdat) { spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; } #else static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} From 37355bdc5a129899f6b245900a8eb944a092f7fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 1 Oct 2018 11:05:25 +0100 Subject: [PATCH 412/499] sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task Automatic NUMA Balancing uses a multi-stage pass to decide whether a page should migrate to a local node. This filter avoids excessive ping-ponging if a page is shared or used by threads that migrate cross-node frequently. Threads inherit both page tables and the preferred node ID from the parent. This means that threads can trigger hinting faults earlier than a new task which delays scanning for a number of seconds. 
As it can be load balanced very early in its lifetime there can be an unnecessary delay before it starts migrating thread-local data. This patch migrates private pages faster early in the lifetime of a thread using the sequence counter as an identifier of new tasks. With this patch applied, STREAM performance is the same as 4.17 even though processes are not spread cross-node prematurely. Other workloads showed a mix of minor gains and losses. This is somewhat expected most workloads are not very sensitive to the starting conditions of a process. 4.19.0-rc5 4.19.0-rc5 4.17.0 numab-v1r1 fastmigrate-v1r1 vanilla MB/sec copy 43298.52 ( 0.00%) 47335.46 ( 9.32%) 47219.24 ( 9.06%) MB/sec scale 30115.06 ( 0.00%) 32568.12 ( 8.15%) 32527.56 ( 8.01%) MB/sec add 32825.12 ( 0.00%) 36078.94 ( 9.91%) 35928.02 ( 9.45%) MB/sec triad 32549.52 ( 0.00%) 35935.94 ( 10.40%) 35969.88 ( 10.51%) Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: Jirka Hladky Cc: Linus Torvalds Cc: Linux-MM Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181001100525.29789-3-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25c7c7e09cbd..7fc4a371bdd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; From 4d4c2d89913e2d891bd6a34b12050a2576e60525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noralf=20Tr=C3=B8nnes?= Date: Mon, 1 Oct 2018 21:45:36 +0200 Subject: [PATCH 413/499] drm/cma-helper: Fix crash in fbdev error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sergey Suloev reported a crash happening in drm_client_dev_hotplug() when fbdev had failed to register. [ 9.124598] vc4_hdmi 3f902000.hdmi: ASoC: Failed to create component debugfs directory [ 9.147667] vc4_hdmi 3f902000.hdmi: vc4-hdmi-hifi <-> 3f902000.hdmi mapping ok [ 9.155184] vc4_hdmi 3f902000.hdmi: ASoC: no DMI vendor name! 
[ 9.166544] vc4-drm soc:gpu: bound 3f902000.hdmi (ops vc4_hdmi_ops [vc4]) [ 9.173840] vc4-drm soc:gpu: bound 3f806000.vec (ops vc4_vec_ops [vc4]) [ 9.181029] vc4-drm soc:gpu: bound 3f004000.txp (ops vc4_txp_ops [vc4]) [ 9.188519] vc4-drm soc:gpu: bound 3f400000.hvs (ops vc4_hvs_ops [vc4]) [ 9.195690] vc4-drm soc:gpu: bound 3f206000.pixelvalve (ops vc4_crtc_ops [vc4]) [ 9.203523] vc4-drm soc:gpu: bound 3f207000.pixelvalve (ops vc4_crtc_ops [vc4]) [ 9.215032] vc4-drm soc:gpu: bound 3f807000.pixelvalve (ops vc4_crtc_ops [vc4]) [ 9.274785] vc4-drm soc:gpu: bound 3fc00000.v3d (ops vc4_v3d_ops [vc4]) [ 9.290246] [drm] Initialized vc4 0.0.0 20140616 for soc:gpu on minor 0 [ 9.297464] [drm] Supports vblank timestamp caching Rev 2 (21.10.2013). [ 9.304600] [drm] Driver supports precise vblank timestamp query. [ 9.382856] vc4-drm soc:gpu: [drm:drm_fb_helper_fbdev_setup [drm_kms_helper]] *ERROR* Failed to set fbdev configuration [ 10.404937] Unable to handle kernel paging request at virtual address 00330a656369768a [ 10.441620] [00330a656369768a] address between user and kernel address ranges [ 10.449087] Internal error: Oops: 96000004 [#1] PREEMPT SMP [ 10.454762] Modules linked in: brcmfmac vc4 drm_kms_helper cfg80211 drm rfkill smsc95xx brcmutil usbnet drm_panel_orientation_quirks raspberrypi_hwmon bcm2835_dma crc32_ce pwm_bcm2835 bcm2835_rng virt_dma rng_core i2c_bcm2835 ip_tables x_tables ipv6 [ 10.477296] CPU: 2 PID: 45 Comm: kworker/2:1 Not tainted 4.19.0-rc5 #3 [ 10.483934] Hardware name: Raspberry Pi 3 Model B Rev 1.2 (DT) [ 10.489966] Workqueue: events output_poll_execute [drm_kms_helper] [ 10.596515] Process kworker/2:1 (pid: 45, stack limit = 0x000000007e8924dc) [ 10.603590] Call trace: [ 10.606259] drm_client_dev_hotplug+0x5c/0xb0 [drm] [ 10.611303] drm_kms_helper_hotplug_event+0x30/0x40 [drm_kms_helper] [ 10.617849] output_poll_execute+0xc4/0x1e0 [drm_kms_helper] [ 10.623616] process_one_work+0x1c8/0x318 [ 10.627695] worker_thread+0x48/0x428 [ 10.631420] kthread+0xf8/0x128 [ 10.634615] ret_from_fork+0x10/0x18 [ 10.638255] Code: 54000220 f9401261 aa1303e0 b4000141 (f9400c21) [ 10.644456] ---[ end trace c75b4a4b0e141908 ]--- The reason for this is that drm_fbdev_cma_init() removes the drm_client when fbdev registration fails, but it doesn't remove the client from the drm_device client list. So the client list now has a pointer that points into the unknown and we have a 'use after free' situation. Split drm_client_new() into drm_client_init() and drm_client_add() to fix removal in the error path. 
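The split can be illustrated with a minimal, lock-free userspace sketch (hypothetical names): everything that can fail lives in init(), publication to the global list is a separate infallible add(), so an error path can still release the half-built object without ever leaving a dangling list entry.

struct client {
	struct client *next;
};

static struct client *client_list;

static int client_init(struct client *c)
{
	c->next = NULL;
	return 0;			/* acquire resources here; may fail */
}

static void client_add(struct client *c)
{
	c->next = client_list;		/* publish only after everything succeeded */
	client_list = c;
}

static void client_release(struct client *c)
{
	(void)c;			/* undo client_init() only */
}

static int client_setup(struct client *c, int later_step_fails)
{
	int ret = client_init(c);

	if (ret)
		return ret;
	if (later_step_fails) {
		client_release(c);	/* safe: c was never put on the list */
		return -1;
	}
	client_add(c);			/* nothing is allowed to fail after this */
	return 0;
}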
Fixes: 894a677f4b3e ("drm/cma-helper: Use the generic fbdev emulation") Reported-by: Sergey Suloev Cc: Stefan Wahren Cc: Eric Anholt Cc: Daniel Vetter Signed-off-by: Noralf Trønnes Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20181001194536.57756-1-noralf@tronnes.org --- drivers/gpu/drm/drm_client.c | 35 +++++++++++++++++++++-------- drivers/gpu/drm/drm_fb_cma_helper.c | 4 +++- drivers/gpu/drm/drm_fb_helper.c | 4 +++- include/drm/drm_client.h | 5 +++-- 4 files changed, 35 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index baff50a4c234..df31c3815092 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -63,20 +63,21 @@ static void drm_client_close(struct drm_client_dev *client) EXPORT_SYMBOL(drm_client_close); /** - * drm_client_new - Create a DRM client + * drm_client_init - Initialise a DRM client * @dev: DRM device * @client: DRM client * @name: Client name * @funcs: DRM client functions (optional) * + * This initialises the client and opens a &drm_file. Use drm_client_add() to complete the process. * The caller needs to hold a reference on @dev before calling this function. * The client is freed when the &drm_device is unregistered. See drm_client_release(). * * Returns: * Zero on success or negative error code on failure. */ -int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, - const char *name, const struct drm_client_funcs *funcs) +int drm_client_init(struct drm_device *dev, struct drm_client_dev *client, + const char *name, const struct drm_client_funcs *funcs) { int ret; @@ -95,10 +96,6 @@ int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, if (ret) goto err_put_module; - mutex_lock(&dev->clientlist_mutex); - list_add(&client->list, &dev->clientlist); - mutex_unlock(&dev->clientlist_mutex); - drm_dev_get(dev); return 0; @@ -109,13 +106,33 @@ int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, return ret; } -EXPORT_SYMBOL(drm_client_new); +EXPORT_SYMBOL(drm_client_init); + +/** + * drm_client_add - Add client to the device list + * @client: DRM client + * + * Add the client to the &drm_device client list to activate its callbacks. + * @client must be initialized by a call to drm_client_init(). After + * drm_client_add() it is no longer permissible to call drm_client_release() + * directly (outside the unregister callback), instead cleanup will happen + * automatically on driver unload. + */ +void drm_client_add(struct drm_client_dev *client) +{ + struct drm_device *dev = client->dev; + + mutex_lock(&dev->clientlist_mutex); + list_add(&client->list, &dev->clientlist); + mutex_unlock(&dev->clientlist_mutex); +} +EXPORT_SYMBOL(drm_client_add); /** * drm_client_release - Release DRM client resources * @client: DRM client * - * Releases resources by closing the &drm_file that was opened by drm_client_new(). + * Releases resources by closing the &drm_file that was opened by drm_client_init(). * It is called automatically if the &drm_client_funcs.unregister callback is _not_ set. * * This function should only be called from the unregister callback. 
An exception diff --git a/drivers/gpu/drm/drm_fb_cma_helper.c b/drivers/gpu/drm/drm_fb_cma_helper.c index 9da36a6271d3..9ac1f2e0f064 100644 --- a/drivers/gpu/drm/drm_fb_cma_helper.c +++ b/drivers/gpu/drm/drm_fb_cma_helper.c @@ -160,7 +160,7 @@ struct drm_fbdev_cma *drm_fbdev_cma_init(struct drm_device *dev, fb_helper = &fbdev_cma->fb_helper; - ret = drm_client_new(dev, &fb_helper->client, "fbdev", NULL); + ret = drm_client_init(dev, &fb_helper->client, "fbdev", NULL); if (ret) goto err_free; @@ -169,6 +169,8 @@ struct drm_fbdev_cma *drm_fbdev_cma_init(struct drm_device *dev, if (ret) goto err_client_put; + drm_client_add(&fb_helper->client); + return fbdev_cma; err_client_put: diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 16ec93b75dbf..515a7aec57ac 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -3218,12 +3218,14 @@ int drm_fbdev_generic_setup(struct drm_device *dev, unsigned int preferred_bpp) if (!fb_helper) return -ENOMEM; - ret = drm_client_new(dev, &fb_helper->client, "fbdev", &drm_fbdev_client_funcs); + ret = drm_client_init(dev, &fb_helper->client, "fbdev", &drm_fbdev_client_funcs); if (ret) { kfree(fb_helper); return ret; } + drm_client_add(&fb_helper->client); + fb_helper->preferred_bpp = preferred_bpp; drm_fbdev_client_hotplug(&fb_helper->client); diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h index 989f8e52864d..971bb7853776 100644 --- a/include/drm/drm_client.h +++ b/include/drm/drm_client.h @@ -87,9 +87,10 @@ struct drm_client_dev { struct drm_file *file; }; -int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, - const char *name, const struct drm_client_funcs *funcs); +int drm_client_init(struct drm_device *dev, struct drm_client_dev *client, + const char *name, const struct drm_client_funcs *funcs); void drm_client_release(struct drm_client_dev *client); +void drm_client_add(struct drm_client_dev *client); void drm_client_dev_unregister(struct drm_device *dev); void drm_client_dev_hotplug(struct drm_device *dev); From b0584ea66d73919cbf5878a3420a837f06ab8396 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 2 Oct 2018 02:41:53 +0000 Subject: [PATCH 414/499] bpf: don't accept cgroup local storage with zero value size Explicitly forbid creating cgroup local storage maps with zero value size, as it makes no sense and might even cause a panic. Reported-by: syzbot+18628320d3b14a5c459c@syzkaller.appspotmail.com Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 94126cbffc88..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); From b45ba4a51cde29b2939365ef0c07ad34c8321789 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 1 Oct 2018 12:21:10 +0000 Subject: [PATCH 415/499] powerpc/lib: fix book3s/32 boot failure due to code patching Commit 51c3c62b58b3 ("powerpc: Avoid code patching freed init sections") accesses 'init_mem_is_free' flag too early, before the kernel is relocated. 
This provokes early boot failure (before the console is active). As it is not necessary to do this verification that early, this patch moves the test into patch_instruction() instead of __patch_instruction(). This modification also has the advantage of avoiding unnecessary remappings. Fixes: 51c3c62b58b3 ("powerpc: Avoid code patching freed init sections") Cc: stable@vger.kernel.org # 4.13+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/lib/code-patching.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 6ae2777c220d..5ffee298745f 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -28,12 +28,6 @@ static int __patch_instruction(unsigned int *exec_addr, unsigned int instr, { int err; - /* Make sure we aren't patching a freed init section */ - if (init_mem_is_free && init_section_contains(exec_addr, 4)) { - pr_debug("Skipping init section patching addr: 0x%px\n", exec_addr); - return 0; - } - __put_user_size(instr, patch_addr, 4, err); if (err) return err; @@ -148,7 +142,7 @@ static inline int unmap_patch_area(unsigned long addr) return 0; } -int patch_instruction(unsigned int *addr, unsigned int instr) +static int do_patch_instruction(unsigned int *addr, unsigned int instr) { int err; unsigned int *patch_addr = NULL; @@ -188,12 +182,22 @@ int patch_instruction(unsigned int *addr, unsigned int instr) } #else /* !CONFIG_STRICT_KERNEL_RWX */ -int patch_instruction(unsigned int *addr, unsigned int instr) +static int do_patch_instruction(unsigned int *addr, unsigned int instr) { return raw_patch_instruction(addr, instr); } #endif /* CONFIG_STRICT_KERNEL_RWX */ + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + /* Make sure we aren't patching a freed init section */ + if (init_mem_is_free && init_section_contains(addr, 4)) { + pr_debug("Skipping init section patching addr: 0x%px\n", addr); + return 0; + } + return do_patch_instruction(addr, instr); +} NOKPROBE_SYMBOL(patch_instruction); int patch_branch(unsigned int *addr, unsigned long target, int flags) From 86da809dda64a63fc27e05a215475325c3aaae92 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 24 Sep 2018 13:20:44 +0300 Subject: [PATCH 416/499] thunderbolt: Do not handle ICM events after domain is stopped If there is a long chain of devices connected when the driver is loaded ICM sends device connected event for each and those are put to tb->wq for later processing. Now if the driver gets unloaded in the middle, so that the work queue is not yet empty it gets flushed by tb_domain_stop(). However, by that time the root switch is already removed so the driver crashes when it tries to dereference it in ICM event handling callbacks. Fix this by checking whether the root switch is already removed. If it is we know that the domain is stopped and we should merely skip handling the event. 
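A compressed userspace sketch of that guard (hypothetical names, a pthread mutex standing in for the bus lock): queued events are dispatched only while the shared object still exists, and anything flushed after teardown is treated as canceled.

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
static void *root_switch;		/* set to NULL when the domain is stopped */

static void handle_queued_event(int code)
{
	(void)code;
	pthread_mutex_lock(&domain_lock);
	if (root_switch) {
		/* dispatch on 'code' here */
	}
	/* else: domain already stopped, drop the event */
	pthread_mutex_unlock(&domain_lock);
}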
Signed-off-by: Mika Westerberg Signed-off-by: Greg Kroah-Hartman --- drivers/thunderbolt/icm.c | 49 ++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index e1e264a9a4c7..28fc4ce75edb 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -738,14 +738,6 @@ icm_fr_xdomain_connected(struct tb *tb, const struct icm_pkg_header *hdr) u8 link, depth; u64 route; - /* - * After NVM upgrade adding root switch device fails because we - * initiated reset. During that time ICM might still send - * XDomain connected message which we ignore here. - */ - if (!tb->root_switch) - return; - link = pkg->link_info & ICM_LINK_INFO_LINK_MASK; depth = (pkg->link_info & ICM_LINK_INFO_DEPTH_MASK) >> ICM_LINK_INFO_DEPTH_SHIFT; @@ -1037,14 +1029,6 @@ icm_tr_device_connected(struct tb *tb, const struct icm_pkg_header *hdr) if (pkg->hdr.packet_id) return; - /* - * After NVM upgrade adding root switch device fails because we - * initiated reset. During that time ICM might still send device - * connected message which we ignore here. - */ - if (!tb->root_switch) - return; - route = get_route(pkg->route_hi, pkg->route_lo); authorized = pkg->link_info & ICM_LINK_INFO_APPROVED; security_level = (pkg->hdr.flags & ICM_FLAGS_SLEVEL_MASK) >> @@ -1408,19 +1392,26 @@ static void icm_handle_notification(struct work_struct *work) mutex_lock(&tb->lock); - switch (n->pkg->code) { - case ICM_EVENT_DEVICE_CONNECTED: - icm->device_connected(tb, n->pkg); - break; - case ICM_EVENT_DEVICE_DISCONNECTED: - icm->device_disconnected(tb, n->pkg); - break; - case ICM_EVENT_XDOMAIN_CONNECTED: - icm->xdomain_connected(tb, n->pkg); - break; - case ICM_EVENT_XDOMAIN_DISCONNECTED: - icm->xdomain_disconnected(tb, n->pkg); - break; + /* + * When the domain is stopped we flush its workqueue but before + * that the root switch is removed. In that case we should treat + * the queued events as being canceled. + */ + if (tb->root_switch) { + switch (n->pkg->code) { + case ICM_EVENT_DEVICE_CONNECTED: + icm->device_connected(tb, n->pkg); + break; + case ICM_EVENT_DEVICE_DISCONNECTED: + icm->device_disconnected(tb, n->pkg); + break; + case ICM_EVENT_XDOMAIN_CONNECTED: + icm->xdomain_connected(tb, n->pkg); + break; + case ICM_EVENT_XDOMAIN_DISCONNECTED: + icm->xdomain_disconnected(tb, n->pkg); + break; + } } mutex_unlock(&tb->lock); From eafa717bc145963c944bb0a64d16add683861b35 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 24 Sep 2018 13:20:45 +0300 Subject: [PATCH 417/499] thunderbolt: Initialize after IOMMUs If IOMMU is enabled and Thunderbolt driver is built into the kernel image, it will be probed before IOMMUs are attached to the PCI bus. Because of this DMA mappings the driver does will not go through IOMMU and start failing right after IOMMUs are enabled. For this reason move the Thunderbolt driver initialization happen at rootfs level. 
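Initcall levels run in a fixed order during boot (core, postcore, arch, subsys, fs, rootfs, device, late), so moving the entry point from fs_initcall() to rootfs_initcall() pushes the probe one level later, past the point where, per the description above, the IOMMUs have been attached. A rough sketch of the idiom, not buildable on its own:

static int __init example_driver_init(void)
{
	return 0;	/* driver/bus registration would go here */
}
rootfs_initcall(example_driver_init);	/* was fs_initcall(example_driver_init) */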
Signed-off-by: Mika Westerberg Signed-off-by: Greg Kroah-Hartman --- drivers/thunderbolt/nhi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 88cff05a1808..5cd6bdfa068f 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1191,5 +1191,5 @@ static void __exit nhi_unload(void) tb_domain_exit(); } -fs_initcall(nhi_init); +rootfs_initcall(nhi_init); module_exit(nhi_unload); From 657ade07df72847f591ccdb36bd9b91ed0edbac3 Mon Sep 17 00:00:00 2001 From: Rickard x Andersson Date: Tue, 2 Oct 2018 14:49:32 +0200 Subject: [PATCH 418/499] net: fec: fix rare tx timeout During certain heavy network loads TX could time out with TX ring dump. TX is sometimes never restarted after reaching "tx_stop_threshold" because function "fec_enet_tx_queue" only tests the first queue. In addition the TX timeout callback function failed to recover because it also operated only on the first queue. Signed-off-by: Rickard x Andersson Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 2708297e7795..bf9b9fd6d2a0 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1158,7 +1158,7 @@ static void fec_enet_timeout_work(struct work_struct *work) napi_disable(&fep->napi); netif_tx_lock_bh(ndev); fec_restart(ndev); - netif_wake_queue(ndev); + netif_tx_wake_all_queues(ndev); netif_tx_unlock_bh(ndev); napi_enable(&fep->napi); } @@ -1273,7 +1273,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id) /* Since we have freed up a buffer, the ring is no longer full */ - if (netif_queue_stopped(ndev)) { + if (netif_tx_queue_stopped(nq)) { entries_free = fec_enet_get_free_txdesc_num(txq); if (entries_free >= txq->tx_wake_threshold) netif_tx_wake_queue(nq); @@ -1746,7 +1746,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) napi_disable(&fep->napi); netif_tx_lock_bh(ndev); fec_restart(ndev); - netif_wake_queue(ndev); + netif_tx_wake_all_queues(ndev); netif_tx_unlock_bh(ndev); napi_enable(&fep->napi); } @@ -2247,7 +2247,7 @@ static int fec_enet_set_pauseparam(struct net_device *ndev, napi_disable(&fep->napi); netif_tx_lock_bh(ndev); fec_restart(ndev); - netif_wake_queue(ndev); + netif_tx_wake_all_queues(ndev); netif_tx_unlock_bh(ndev); napi_enable(&fep->napi); } From fe3a83af6a50199bf250fa331e94216912f79395 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 2 Oct 2018 14:23:45 +0100 Subject: [PATCH 419/499] declance: Fix continuation with the adapter identification message Fix a commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") regression with the `declance' driver, which caused the adapter identification message to be split between two lines, e.g.: declance.c: v0.011 by Linux MIPS DECstation task force tc6: PMAD-AA , addr = 08:00:2b:1b:2a:6a, irq = 14 tc6: registered as eth0. Address that properly, by printing identification with a single call, making the messages now look like: declance.c: v0.011 by Linux MIPS DECstation task force tc6: PMAD-AA, addr = 08:00:2b:1b:2a:6a, irq = 14 tc6: registered as eth0. Signed-off-by: Maciej W. Rozycki Fixes: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/declance.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 116997a8b593..00332a1ea84b 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -1031,6 +1031,7 @@ static int dec_lance_probe(struct device *bdev, const int type) int i, ret; unsigned long esar_base; unsigned char *esar; + const char *desc; if (dec_lance_debug && version_printed++ == 0) printk(version); @@ -1216,19 +1217,20 @@ static int dec_lance_probe(struct device *bdev, const int type) */ switch (type) { case ASIC_LANCE: - printk("%s: IOASIC onboard LANCE", name); + desc = "IOASIC onboard LANCE"; break; case PMAD_LANCE: - printk("%s: PMAD-AA", name); + desc = "PMAD-AA"; break; case PMAX_LANCE: - printk("%s: PMAX onboard LANCE", name); + desc = "PMAX onboard LANCE"; break; } for (i = 0; i < 6; i++) dev->dev_addr[i] = esar[i * 4]; - printk(", addr = %pM, irq = %d\n", dev->dev_addr, dev->irq); + printk("%s: %s, addr = %pM, irq = %d\n", + name, desc, dev->dev_addr, dev->irq); dev->netdev_ops = &lance_netdev_ops; dev->watchdog_timeo = 5*HZ; From ff58e2df62ce29d0552278c290ae494b30fe0c6f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Oct 2018 10:10:14 -0700 Subject: [PATCH 420/499] nfp: avoid soft lockups under control message storm When FW floods the driver with control messages try to exit the cmsg processing loop every now and then to avoid soft lockups. Cmsg processing is generally very lightweight so 512 seems like a reasonable budget, which should not be exceeded under normal conditions. Fixes: 77ece8d5f196 ("nfp: add control vNIC datapath") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Tested-by: Pieter Jansen van Vuuren Signed-off-by: David S. 
Miller --- .../net/ethernet/netronome/nfp/nfp_net_common.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 8ed38fd5a852..c6d29fdbb880 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2077,14 +2077,17 @@ nfp_ctrl_rx_one(struct nfp_net *nn, struct nfp_net_dp *dp, return true; } -static void nfp_ctrl_rx(struct nfp_net_r_vector *r_vec) +static bool nfp_ctrl_rx(struct nfp_net_r_vector *r_vec) { struct nfp_net_rx_ring *rx_ring = r_vec->rx_ring; struct nfp_net *nn = r_vec->nfp_net; struct nfp_net_dp *dp = &nn->dp; + unsigned int budget = 512; - while (nfp_ctrl_rx_one(nn, dp, r_vec, rx_ring)) + while (nfp_ctrl_rx_one(nn, dp, r_vec, rx_ring) && budget--) continue; + + return budget; } static void nfp_ctrl_poll(unsigned long arg) @@ -2096,9 +2099,13 @@ static void nfp_ctrl_poll(unsigned long arg) __nfp_ctrl_tx_queued(r_vec); spin_unlock_bh(&r_vec->lock); - nfp_ctrl_rx(r_vec); - - nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry); + if (nfp_ctrl_rx(r_vec)) { + nfp_net_irq_unmask(r_vec->nfp_net, r_vec->irq_entry); + } else { + tasklet_schedule(&r_vec->tasklet); + nn_dp_warn(&r_vec->nfp_net->dp, + "control message budget exceeded!\n"); + } } /* Setup and Configuration From 20a8378aa9dd108a01cb0e695599f5257a885c4b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 2 Oct 2018 13:01:45 -0500 Subject: [PATCH 421/499] x86/platform/uv: Provide is_early_uv_system() Introduce is_early_uv_system() which uses efi.uv_systab to decide early in the boot process whether the kernel runs on a UV system. This is needed to skip other early setup/init code that might break the UV platform if done too early such as before necessary ACPI tables parsing takes place. Suggested-by: Hedi Berriche Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Russ Anderson Reviewed-by: Dimitri Sivanich Cc: "H. 
Peter Anvin" Cc: Russ Anderson Cc: Dimitri Sivanich Cc: Borislav Petkov Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Len Brown Cc: Dou Liyang Cc: Xiaoming Gao Cc: Rajvi Jingar Link: https://lkml.kernel.org/r/20181002180144.801700401@stormcage.americas.sgi.com --- arch/x86/include/asm/uv/uv.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index a80c0673798f..e60c45fd3679 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -10,8 +10,13 @@ struct cpumask; struct mm_struct; #ifdef CONFIG_X86_UV +#include extern enum uv_system_type get_uv_system_type(void); +static inline bool is_early_uv_system(void) +{ + return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab); +} extern int is_uv_system(void); extern int is_uv_hubless(void); extern void uv_cpu_init(void); @@ -23,6 +28,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, #else /* X86_UV */ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } +static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubless(void) { return 0; } static inline void uv_cpu_init(void) { } From 2647c43c7f3ba4b752bfce261d53b16e2f5bc9e3 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 2 Oct 2018 13:01:46 -0500 Subject: [PATCH 422/499] x86/tsc: Fix UV TSC initialization The recent rework of the TSC calibration code introduced a regression on UV systems as it added a call to tsc_early_init() which initializes the TSC ADJUST values before acpi_boot_table_init(). In the case of UV systems, that is a necessary step that calls uv_system_init(). This informs tsc_sanitize_first_cpu() that the kernel runs on a platform with async TSC resets as documented in commit 341102c3ef29 ("x86/tsc: Add option that TSC on Socket 0 being non-zero is valid") Fix it by skipping the early tsc initialization on UV systems and let TSC init tests take place later in tsc_init(). Fixes: cf7a63ef4e02 ("x86/tsc: Calibrate tsc only once") Suggested-by: Hedi Berriche Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Reviewed-by: Russ Anderson Reviewed-by: Dimitri Sivanich Cc: "H. 
Peter Anvin" Cc: Russ Anderson Cc: Dimitri Sivanich Cc: Borislav Petkov Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Len Brown Cc: Dou Liyang Cc: Xiaoming Gao Cc: Rajvi Jingar Link: https://lkml.kernel.org/r/20181002180144.923579706@stormcage.americas.sgi.com --- arch/x86/kernel/tsc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6490f618e096..b52bd2b6cdb4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -26,6 +26,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1433,6 +1434,9 @@ void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; + /* Don't change UV TSC multi-chassis synchronization */ + if (is_early_uv_system()) + return; if (!determine_cpu_tsc_frequencies(true)) return; loops_per_jiffy = get_loops_per_jiffy(); From ef1f2258748b675422ca0107e5bfb9ceeac675de Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 11 Sep 2018 11:30:18 -0700 Subject: [PATCH 423/499] RISCV: Fix end PFN for low memory Use memblock_end_of_DRAM which provides correct last low memory PFN. Without that, DMA32 region becomes empty resulting in zero pages being allocated for DMA32. This patch is based on earlier patch from palmer which never merged into 4.19. I just edited the commit text to make more sense. Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index aee603123030..b2d26d9d8489 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -186,7 +186,7 @@ static void __init setup_bootmem(void) BUG_ON(mem_size == 0); set_max_mapnr(PFN_DOWN(mem_size)); - max_low_pfn = pfn_base + PFN_DOWN(mem_size); + max_low_pfn = memblock_end_of_DRAM(); #ifdef CONFIG_BLK_DEV_INITRD setup_initrd(); From beeeac43b6fae5f5eaf707b6fcc2bf1e09deb785 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 1 Oct 2018 21:42:37 -0700 Subject: [PATCH 424/499] Revert "serial: 8250_dw: Fix runtime PM handling" This reverts commit d76c74387e1c978b6c5524a146ab0f3f72206f98. While commit d76c74387e1c ("serial: 8250_dw: Fix runtime PM handling") fixes runtime PM handling when using kgdb, it introduces a traceback for everyone else. 
BUG: sleeping function called from invalid context at /mnt/host/source/src/third_party/kernel/next/drivers/base/power/runtime.c:1034 in_atomic(): 1, irqs_disabled(): 1, pid: 1, name: swapper/0 7 locks held by swapper/0/1: #0: 000000005ec5bc72 (&dev->mutex){....}, at: __driver_attach+0xb5/0x12b #1: 000000005d5fa9e5 (&dev->mutex){....}, at: __device_attach+0x3e/0x15b #2: 0000000047e93286 (serial_mutex){+.+.}, at: serial8250_register_8250_port+0x51/0x8bb #3: 000000003b328f07 (port_mutex){+.+.}, at: uart_add_one_port+0xab/0x8b0 #4: 00000000fa313d4d (&port->mutex){+.+.}, at: uart_add_one_port+0xcc/0x8b0 #5: 00000000090983ca (console_lock){+.+.}, at: vprintk_emit+0xdb/0x217 #6: 00000000c743e583 (console_owner){-...}, at: console_unlock+0x211/0x60f irq event stamp: 735222 __down_trylock_console_sem+0x4a/0x84 console_unlock+0x338/0x60f __do_softirq+0x4a4/0x50d irq_exit+0x64/0xe2 CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc5 #6 Hardware name: Google Caroline/Caroline, BIOS Google_Caroline.7820.286.0 03/15/2017 Call Trace: dump_stack+0x7d/0xbd ___might_sleep+0x238/0x259 __pm_runtime_resume+0x4e/0xa4 ? serial8250_rpm_get+0x2e/0x44 serial8250_console_write+0x44/0x301 ? lock_acquire+0x1b8/0x1fa console_unlock+0x577/0x60f vprintk_emit+0x1f0/0x217 printk+0x52/0x6e register_console+0x43b/0x524 uart_add_one_port+0x672/0x8b0 ? set_io_from_upio+0x150/0x162 serial8250_register_8250_port+0x825/0x8bb dw8250_probe+0x80c/0x8b0 ? dw8250_serial_inq+0x8e/0x8e ? dw8250_check_lcr+0x108/0x108 ? dw8250_runtime_resume+0x5b/0x5b ? dw8250_serial_outq+0xa1/0xa1 ? dw8250_remove+0x115/0x115 platform_drv_probe+0x76/0xc5 really_probe+0x1f1/0x3ee ? driver_allows_async_probing+0x5d/0x5d driver_probe_device+0xd6/0x112 ? driver_allows_async_probing+0x5d/0x5d bus_for_each_drv+0xbe/0xe5 __device_attach+0xdd/0x15b bus_probe_device+0x5a/0x10b device_add+0x501/0x894 ? _raw_write_unlock+0x27/0x3a platform_device_add+0x224/0x2b7 mfd_add_device+0x718/0x75b ? __kmalloc+0x144/0x16a ? mfd_add_devices+0x38/0xdb mfd_add_devices+0x9b/0xdb intel_lpss_probe+0x7d4/0x8ee intel_lpss_pci_probe+0xac/0xd4 pci_device_probe+0x101/0x18e ... Revert the offending patch until a more comprehensive solution is available. Cc: Tony Lindgren Cc: Andy Shevchenko Cc: Phil Edworthy Fixes: d76c74387e1c ("serial: 8250_dw: Fix runtime PM handling") Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_dw.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index fa8dcb470640..d31b975dd3fd 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -630,10 +630,6 @@ static int dw8250_probe(struct platform_device *pdev) if (!data->skip_autocfg) dw8250_setup_port(p); -#ifdef CONFIG_PM - uart.capabilities |= UART_CAP_RPM; -#endif - /* If we have a valid fifosize, try hooking up DMA */ if (p->fifosize) { data->dma.rxconf.src_maxburst = p->fifosize / 4; From 10653022456dc77b398777fd8e95126c77954b49 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Aug 2018 14:54:03 +0200 Subject: [PATCH 425/499] Revert "serial: sh-sci: Remove SCIx_RZ_SCIFA_REGTYPE" This reverts commit 7acece71a517cad83a0842a94d94c13f271b680c. 
Signed-off-by: Geert Uytterhoeven Acked-by: Chris Brandt Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.c | 31 +++++++++++++++++++++++++++++++ include/linux/serial_sci.h | 1 + 2 files changed, 32 insertions(+) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index ac4424bf6b13..5d42c9a63001 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -291,6 +291,33 @@ static const struct sci_port_params sci_port_params[SCIx_NR_REGTYPES] = { .error_clear = SCIF_ERROR_CLEAR, }, + /* + * The "SCIFA" that is in RZ/T and RZ/A2. + * It looks like a normal SCIF with FIFO data, but with a + * compressed address space. Also, the break out of interrupts + * are different: ERI/BRI, RXI, TXI, TEI, DRI. + */ + [SCIx_RZ_SCIFA_REGTYPE] = { + .regs = { + [SCSMR] = { 0x00, 16 }, + [SCBRR] = { 0x02, 8 }, + [SCSCR] = { 0x04, 16 }, + [SCxTDR] = { 0x06, 8 }, + [SCxSR] = { 0x08, 16 }, + [SCxRDR] = { 0x0A, 8 }, + [SCFCR] = { 0x0C, 16 }, + [SCFDR] = { 0x0E, 16 }, + [SCSPTR] = { 0x10, 16 }, + [SCLSR] = { 0x12, 16 }, + }, + .fifosize = 16, + .overrun_reg = SCLSR, + .overrun_mask = SCLSR_ORER, + .sampling_rate_mask = SCI_SR(32), + .error_mask = SCIF_DEFAULT_ERROR_MASK, + .error_clear = SCIF_ERROR_CLEAR, + }, + /* * Common SH-3 SCIF definitions. */ @@ -3110,6 +3137,10 @@ static const struct of_device_id of_sci_match[] = { .compatible = "renesas,scif-r7s72100", .data = SCI_OF_DATA(PORT_SCIF, SCIx_SH2_SCIF_FIFODATA_REGTYPE), }, + { + .compatible = "renesas,scif-r7s9210", + .data = SCI_OF_DATA(PORT_SCIF, SCIx_RZ_SCIFA_REGTYPE), + }, /* Family-specific types */ { .compatible = "renesas,rcar-gen1-scif", diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index c0e795d95477..1c89611e0e06 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -36,6 +36,7 @@ enum { SCIx_SH4_SCIF_FIFODATA_REGTYPE, SCIx_SH7705_SCIF_REGTYPE, SCIx_HSCIF_REGTYPE, + SCIx_RZ_SCIFA_REGTYPE, SCIx_NR_REGTYPES, }; From 5b162cc4ac27ba76e576abc9090c54cf90a17980 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 30 Aug 2018 14:54:04 +0200 Subject: [PATCH 426/499] Revert "serial: sh-sci: Allow for compressed SCIF address" This reverts commit 2d4dd0da45401c7ae7332b4d1eb7bbb1348edde9. This broke earlycon on all Renesas ARM platforms using a SCIF port for the serial console (R-Car, RZ/A1, RZ/G1, RZ/G2 SoCs), due to an incorrect value of port->regshift. 
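The role of regshift in the serial core can be illustrated generically (this is not the sh-sci code itself): register offsets are shifted left by port->regshift before being added to the base address, so a stale value of 1 doubles every offset and register accesses land on the wrong locations.

#include <stdint.h>

static inline uintptr_t serial_reg_addr(uintptr_t membase, unsigned int offset,
					unsigned int regshift)
{
	return membase + (offset << regshift);
}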
Signed-off-by: Geert Uytterhoeven Acked-by: Chris Brandt Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 5d42c9a63001..ab3f6e91853d 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -346,15 +346,15 @@ static const struct sci_port_params sci_port_params[SCIx_NR_REGTYPES] = { [SCIx_SH4_SCIF_REGTYPE] = { .regs = { [SCSMR] = { 0x00, 16 }, - [SCBRR] = { 0x02, 8 }, - [SCSCR] = { 0x04, 16 }, - [SCxTDR] = { 0x06, 8 }, - [SCxSR] = { 0x08, 16 }, - [SCxRDR] = { 0x0a, 8 }, - [SCFCR] = { 0x0c, 16 }, - [SCFDR] = { 0x0e, 16 }, - [SCSPTR] = { 0x10, 16 }, - [SCLSR] = { 0x12, 16 }, + [SCBRR] = { 0x04, 8 }, + [SCSCR] = { 0x08, 16 }, + [SCxTDR] = { 0x0c, 8 }, + [SCxSR] = { 0x10, 16 }, + [SCxRDR] = { 0x14, 8 }, + [SCFCR] = { 0x18, 16 }, + [SCFDR] = { 0x1c, 16 }, + [SCSPTR] = { 0x20, 16 }, + [SCLSR] = { 0x24, 16 }, }, .fifosize = 16, .overrun_reg = SCLSR, @@ -2837,7 +2837,7 @@ static int sci_init_single(struct platform_device *dev, { struct uart_port *port = &sci_port->port; const struct resource *res; - unsigned int i, regtype; + unsigned int i; int ret; sci_port->cfg = p; @@ -2874,7 +2874,6 @@ static int sci_init_single(struct platform_device *dev, if (unlikely(sci_port->params == NULL)) return -EINVAL; - regtype = sci_port->params - sci_port_params; switch (p->type) { case PORT_SCIFB: sci_port->rx_trigger = 48; @@ -2929,10 +2928,6 @@ static int sci_init_single(struct platform_device *dev, port->regshift = 1; } - if (regtype == SCIx_SH4_SCIF_REGTYPE) - if (sci_port->reg_size >= 0x20) - port->regshift = 1; - /* * The UART port needs an IRQ value, so we peg this to the RX IRQ * for the multi-IRQ ports, which is where we are primarily From 2ab2ddd301a22ca3c5f0b743593e4ad2953dfa53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Oct 2018 12:35:05 -0700 Subject: [PATCH 427/499] inet: make sure to grab rcu_read_lock before using ireq->ireq_opt Timer handlers do not imply rcu_read_lock(), so my recent fix triggered a LOCKDEP warning when SYNACK is retransmit. Lets add rcu_read_lock()/rcu_read_unlock() pairs around ireq->ireq_opt usages instead of guessing what is done by callers, since it is not worth the pain. Get rid of ireq_opt_deref() helper since it hides the logic without real benefit, since it is now a standard rcu_dereference(). Fixes: 1ad98e9d1bdf ("tcp/dccp: fix lockdep issue when SYN is backlogged") Signed-off-by: Eric Dumazet Reported-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/net/inet_sock.h | 5 ----- net/dccp/ipv4.c | 4 +++- net/ipv4/inet_connection_sock.c | 5 ++++- net/ipv4/tcp_ipv4.c | 4 +++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a8cd5cf9ff5b..a80fd0ac4563 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -130,11 +130,6 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, return sk->sk_bound_dev_if; } -static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) -{ - return rcu_dereference(ireq->ireq_opt); -} - struct inet_cork { unsigned int flags; __be32 addr; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b08feb219b44..8e08cea6f178 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -493,9 +493,11 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dfd5009f96ef..15e7f7915a21 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -544,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, struct ip_options_rcu *opt; struct rtable *rt; - opt = ireq_opt_deref(ireq); + rcu_read_lock(); + opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, @@ -558,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c09eddbb78..cd426313a298 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -943,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); + rcu_read_lock(); err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, ireq->ir_rmt_addr, - ireq_opt_deref(ireq)); + rcu_dereference(ireq->ireq_opt)); + rcu_read_unlock(); err = net_xmit_eval(err); } From 0f3b914c9cfcd7bbedd445dc4ac5dd999fa213c2 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Tue, 2 Oct 2018 12:14:34 -0700 Subject: [PATCH 428/499] bonding: fix warning message RX queue config for bonding master could be different from its slave device(s). With the commit 6a9e461f6fe4 ("bonding: pass link-local packets to bonding master also."), the packet is reinjected into stack with skb->dev as bonding master. This potentially triggers the message: "bondX received packet on queue Y, but number of RX queues is Z" whenever the queue that packet is received on is higher than the numrxqueues on bonding master (Y > Z). Fixes: 6a9e461f6fe4 ("bonding: pass link-local packets to bonding master also.") Reported-by: John Sperbeck Signed-off-by: Eric Dumazet Signed-off-by: Mahesh Bandewar Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c05c01a00755..ee28ec9e0aba 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1187,6 +1187,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) if (nskb) { nskb->dev = bond->dev; + nskb->queue_mapping = 0; netif_rx(nskb); } return RX_HANDLER_PASS; From 0e1d6eca5113858ed2caea61a5adc03c595f6096 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Oct 2018 15:47:35 -0700 Subject: [PATCH 429/499] rtnl: limit IFLA_NUM_TX_QUEUES and IFLA_NUM_RX_QUEUES to 4096 We have an impressive number of syzkaller bugs that are linked to the fact that syzbot was able to create a networking device with millions of TX (or RX) queues. Let's limit the number of RX/TX queues to 4096, this really should cover all known cases. A separate patch will add various cond_resched() in the loops handling sysfs entries at device creation and dismantle. Tested: lpaa6:~# ip link add gre-4097 numtxqueues 4097 numrxqueues 4097 type ip6gretap RTNETLINK answers: Invalid argument lpaa6:~# time ip link add gre-4096 numtxqueues 4096 numrxqueues 4096 type ip6gretap real 0m0.180s user 0m0.000s sys 0m0.107s Fixes: 76ff5cc91935 ("rtnl: allow to specify number of rx and tx queues on device creation") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7f37fe9c65a5..448703312fed 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2835,6 +2835,12 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); + if (num_tx_queues < 1 || num_tx_queues > 4096) + return ERR_PTR(-EINVAL); + + if (num_rx_queues < 1 || num_rx_queues > 4096) + return ERR_PTR(-EINVAL); + dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) From 0595751f267994c3c7027377058e4185b3a28e75 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 17 May 2018 16:35:07 +0200 Subject: [PATCH 430/499] smb2: fix missing files in root share directory listing When mounting a Windows share that is the root of a drive (eg. C$) the server does not return . and .. directory entries. This results in the smb2 code path erroneously skipping the 2 first entries. Pseudo-code of the readdir() code path: cifs_readdir(struct file, struct dir_context) initiate_cifs_search <-- if no reponse cached yet server->ops->query_dir_first dir_emit_dots dir_emit <-- adds "." and ".." if we're at pos=0 find_cifs_entry initiate_cifs_search <-- if pos < start of current response (restart search) server->ops->query_dir_next <-- if pos > end of current response (fetch next search res) for(...) <-- loops over cur response entries starting at pos cifs_filldir <-- skip . and .., emit entry cifs_fill_dirent dir_emit pos++ A) dir_emit_dots() always adds . & .. and sets the current dir pos to 2 (0 and 1 are done). Therefore we always want the index_to_find to be 2 regardless of if the response has . and .. B) smb1 code initializes index_of_last_entry with a +2 offset in cifssmb.c CIFSFindFirst(): psrch_inf->index_of_last_entry = 2 /* skip . and .. 
*/ + psrch_inf->entries_in_buffer; Later in find_cifs_entry() we want to find the next dir entry at pos=2 as a result of (A) first_entry_in_buffer = cfile->srch_inf.index_of_last_entry - cfile->srch_inf.entries_in_buffer; This var is the dir pos that the first entry in the buffer will have therefore it must be 2 in the first call. If we don't offset index_of_last_entry by 2 (like in (B)), first_entry_in_buffer=0 but we were instructed to get pos=2 so this code in find_cifs_entry() skips the 2 first which is ok for non-root shares, as it skips . and .. from the response but is not ok for root shares where the 2 first are actual files pos_in_buf = index_to_find - first_entry_in_buffer; // pos_in_buf=2 // we skip 2 first response entries :( for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) { /* go entry by entry figuring out which is first */ cur_ent = nxt_dir_entry(cur_ent, end_of_smb, cfile->srch_inf.info_level); } C) cifs_filldir() skips . and .. so we can safely ignore them for now. Sample program: int main(int argc, char **argv) { const char *path = argc >= 2 ? argv[1] : "."; DIR *dh; struct dirent *de; printf("listing path <%s>\n", path); dh = opendir(path); if (!dh) { printf("opendir error %d\n", errno); return 1; } while (1) { de = readdir(dh); if (!de) { if (errno) { printf("readdir error %d\n", errno); return 1; } printf("end of listing\n"); break; } printf("off=%lu <%s>\n", de->d_off, de->d_name); } return 0; } Before the fix with SMB1 on root shares: <.> off=1 <..> off=2 <$Recycle.Bin> off=3 off=4 and on non-root shares: <.> off=1 <..> off=4 <-- after adding .., the offsets jumps to +2 because <2536> off=5 we skipped . and .. from response buffer (C) <411> off=6 but still incremented pos off=7 off=8 Therefore the fix for smb2 is to mimic smb1 behaviour and offset the index_of_last_entry by 2. Test results comparing smb1 and smb2 before/after the fix on root share, non-root shares and on large directories (ie. multi-response dir listing): PRE FIX ======= pre-1-root VS pre-2-root: ERR pre-2-root is missing [bootmgr, $Recycle.Bin] pre-1-nonroot VS pre-2-nonroot: OK~ same files, same order, different offsets pre-1-nonroot-large VS pre-2-nonroot-large: OK~ same files, same order, different offsets POST FIX ======== post-1-root VS post-2-root: OK same files, same order, same offsets post-1-nonroot VS post-2-nonroot: OK same files, same order, same offsets post-1-nonroot-large VS post-2-nonroot-large: OK same files, same order, same offsets REGRESSION? 
=========== pre-1-root VS post-1-root: OK same files, same order, same offsets pre-1-nonroot VS post-1-nonroot: OK same files, same order, same offsets BugLink: https://bugzilla.samba.org/show_bug.cgi?id=13107 Signed-off-by: Aurelien Aptel Signed-off-by: Paulo Alcantara Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d954ce36b473..89985a0a6819 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1477,7 +1477,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, } srch_inf->entries_in_buffer = 0; - srch_inf->index_of_last_entry = 0; + srch_inf->index_of_last_entry = 2; rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); From ddf83afb9f60ee58cdec30bb615eff65d00ba05e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 30 Aug 2018 10:12:59 +1000 Subject: [PATCH 431/499] cifs: add a warning if we try to dequeue a deleted mid cifs_delete_mid() is called once we are finished handling a mid and we expect no more work done on this mid. Needed to fix recent commit: commit 730928c8f4be88e9d6a027a16b1e8fa9c59fc077 ("cifs: update smb2_queryfs() to use compounding") Add a warning if someone tries to dequeue a mid that has already been flagged to be deleted. Also change list_del() to list_del_init() so that if we have similar bugs resurface in the future we will not oops. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/cifsglob.h | 1 + fs/cifs/connect.c | 10 +++++++++- fs/cifs/transport.c | 3 ++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0c9ab62c3df4..9dcaed031843 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1553,6 +1553,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ +#define MID_DELETED 2 /* Mid has been dequeued/deleted */ /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7aa08dba4719..e9d64c92b8da 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -659,7 +659,15 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) mid->mid_state = MID_RESPONSE_RECEIVED; else mid->mid_state = MID_RESPONSE_MALFORMED; - list_del_init(&mid->qhead); + /* + * Trying to handle/dequeue a mid after the send_recv() + * function has finished processing it is a bug. + */ + if (mid->mid_flags & MID_DELETED) + printk_once(KERN_WARNING + "trying to dequeue a deleted mid\n"); + else + list_del_init(&mid->qhead); spin_unlock(&GlobalMid_Lock); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 78f96fa3d7d9..9cc9a127749e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -142,7 +142,8 @@ void cifs_delete_mid(struct mid_q_entry *mid) { spin_lock(&GlobalMid_Lock); - list_del(&mid->qhead); + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(mid); From 4e34feb5e96328df6a362c88098502b07c1b1e69 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 30 Aug 2018 10:13:00 +1000 Subject: [PATCH 432/499] cifs: only wake the thread for the very last PDU in a compound For compounded PDUs we should only wake the waiting thread for the very last PDU of the compound.
We do this so that we are guaranteed that the demultiplex_thread will not process or access any of those MIDs any more once the send/recv thread starts processing. Else there is a race where at the end of the send/recv processing we will try to delete all the mids of the compound. If the demultiplex thread still has other mids to process at this point for this compound, this can lead to an oops. Needed to fix recent commit: commit 730928c8f4be88e9d6a027a16b1e8fa9c59fc077 ("cifs: update smb2_queryfs() to use compounding") Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/transport.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9cc9a127749e..b48f43963da6 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -773,6 +773,11 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) return mid; } +static void +cifs_noop_callback(struct mid_q_entry *mid) +{ +} + int compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, @@ -827,8 +832,13 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } midQ[i]->mid_state = MID_REQUEST_SUBMITTED; + /* + * We don't invoke the callback for compounds unless it is the last + * request. + */ + if (i < num_rqst - 1) + midQ[i]->callback = cifs_noop_callback; } - cifs_in_send_inc(ses->server); rc = smb_send_rqst(ses->server, num_rqst, rqst, flags); cifs_in_send_dec(ses->server); @@ -909,6 +919,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->resp_buf = NULL; } out: + /* + * This will dequeue all mids. After this it is important that the + * demultiplex_thread will not process any of these mids any further. + * This is prevented above by using a noop callback that will not + * wake this thread except for the very last PDU. + */ for (i = 0; i < num_rqst; i++) cifs_delete_mid(midQ[i]); add_credits(ses->server, credits, optype); From 7af929d6d05ba5564139718e30d5bc96bdbc716a Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 2 Oct 2018 18:54:09 -0500 Subject: [PATCH 433/499] smb3: fix lease break problem introduced by compounding Fixes a problem (discovered by Aurelien) introduced by a recent commit: commit b24df3e30cbf48255db866720fb71f14bf9d2f39 ("cifs: update receive_encrypted_standard to handle compounded responses") which broke the ability to respond to some lease breaks (lease breaks being ignored is a problem since they can block the server response for the duration of the lease break timeout). Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky --- fs/cifs/connect.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e9d64c92b8da..52d71b64c0c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -946,8 +946,7 @@ cifs_demultiplex_thread(void *p) } else { mids[0] = server->ops->find_mid(server, buf); bufs[0] = buf; - if (mids[0]) - num_mids = 1; + num_mids = 1; if (!mids[0] || !mids[0]->receive) length = standard_receive3(server, mids[0]); From 45ec318578c0c22a11f5b9927d064418e1ab1905 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 2 Oct 2018 16:52:03 -0700 Subject: [PATCH 434/499] net: systemport: Fix wake-up interrupt race during resume The AON_PM_L2 is normally used to trigger and identify the source of a wake-up event.
Since the RX_SYS clock is no longer turned off, we also have an interrupt being sent to the SYSTEMPORT INTRL_2_0 controller, and that interrupt remains active up until the magic packet detector is disabled which happens much later during the driver resumption. The race happens if we have a CPU that is entering the SYSTEMPORT INTRL2_0 handler during resume, and another CPU has managed to clear the wake-up interrupt during bcm_sysport_resume_from_wol(). In that case, we have the first CPU stuck in the interrupt handler with an interrupt cause that has been cleared under its feet, and so we keep returning IRQ_NONE and we never make any progress. This was not a problem before because we would always turn off the RX_SYS clock during WoL, so the SYSTEMPORT INTRL2_0 would also be turned off as well, thus not latching the interrupt. The fix is to make sure we do not enable either the MPD or BRCM_TAG_MATCH interrupts since those are redundant with what the AON_PM_L2 interrupt controller already processes and they would cause such a race to occur. Fixes: bb9051a2b230 ("net: systemport: Add support for WAKE_FILTER") Fixes: 83e82f4c706b ("net: systemport: add Wake-on-LAN support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 28 +++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 147045757b10..c57238fce863 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1069,9 +1069,6 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv) { u32 reg; - /* Stop monitoring MPD interrupt */ - intrl2_0_mask_set(priv, INTRL2_0_MPD | INTRL2_0_BRCM_MATCH_TAG); - /* Disable RXCHK, active filters and Broadcom tag matching */ reg = rxchk_readl(priv, RXCHK_CONTROL); reg &= ~(RXCHK_BRCM_TAG_MATCH_MASK << @@ -1081,6 +1078,17 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv) /* Clear the MagicPacket detection logic */ mpd_enable_set(priv, false); + reg = intrl2_0_readl(priv, INTRL2_CPU_STATUS); + if (reg & INTRL2_0_MPD) + netdev_info(priv->netdev, "Wake-on-LAN (MPD) interrupt!\n"); + + if (reg & INTRL2_0_BRCM_MATCH_TAG) { + reg = rxchk_readl(priv, RXCHK_BRCM_TAG_MATCH_STATUS) & + RXCHK_BRCM_TAG_MATCH_MASK; + netdev_info(priv->netdev, + "Wake-on-LAN (filters 0x%02x) interrupt!\n", reg); + } + netif_dbg(priv, wol, priv->netdev, "resumed from WOL\n"); } @@ -1105,7 +1113,6 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id) struct bcm_sysport_priv *priv = netdev_priv(dev); struct bcm_sysport_tx_ring *txr; unsigned int ring, ring_bit; - u32 reg; priv->irq0_stat = intrl2_0_readl(priv, INTRL2_CPU_STATUS) & ~intrl2_0_readl(priv, INTRL2_CPU_MASK_STATUS); @@ -1131,16 +1138,6 @@ static irqreturn_t bcm_sysport_rx_isr(int irq, void *dev_id) if (priv->irq0_stat & INTRL2_0_TX_RING_FULL) bcm_sysport_tx_reclaim_all(priv); - if (priv->irq0_stat & INTRL2_0_MPD) - netdev_info(priv->netdev, "Wake-on-LAN (MPD) interrupt!\n"); - - if (priv->irq0_stat & INTRL2_0_BRCM_MATCH_TAG) { - reg = rxchk_readl(priv, RXCHK_BRCM_TAG_MATCH_STATUS) & - RXCHK_BRCM_TAG_MATCH_MASK; - netdev_info(priv->netdev, - "Wake-on-LAN (filters 0x%02x) interrupt!\n", reg); - } - if (!priv->is_lite) goto out; @@ -2641,9 +2638,6 @@ static int bcm_sysport_suspend_to_wol(struct bcm_sysport_priv *priv) /* UniMAC receive needs to be turned on */ umac_enable_set(priv, CMD_RX_EN, 1); - /* 
Enable the interrupt wake-up source */ - intrl2_0_mask_clear(priv, INTRL2_0_MPD | INTRL2_0_BRCM_MATCH_TAG); - netif_dbg(priv, wol, ndev, "entered WOL mode\n"); return 0; From a07f388e2cde2be74b263f85df6f672fea0305a1 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 2 Oct 2018 18:52:01 -0600 Subject: [PATCH 435/499] net: qualcomm: rmnet: Skip processing loopback packets RMNET RX handler was processing invalid packets that were originally sent on the real device and were looped back via dev_loopback_xmit(). This was detected using syzkaller. Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index 7fd86d40a337..6908b26feb9e 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -189,6 +189,9 @@ rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb) if (!skb) goto done; + if (skb->pkt_type == PACKET_LOOPBACK) + return RX_HANDLER_PASS; + dev = skb->dev; port = rmnet_get_port(dev); From 6392ff3c8e4c23d0a09b0ae9f94feb3effed490b Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 2 Oct 2018 18:52:02 -0600 Subject: [PATCH 436/499] net: qualcomm: rmnet: Fix incorrect allocation flag in transmit The incoming skb needs to be reallocated in case the headroom is not sufficient to add the MAP header. This allocation needs to be atomic otherwise it results in the following splat [32805.801456] BUG: sleeping function called from invalid context [32805.841141] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [32805.904773] task: ffffffd7c5f62280 task.stack: ffffff80464a8000 [32805.910851] pc : ___might_sleep+0x180/0x188 [32805.915143] lr : ___might_sleep+0x180/0x188 [32806.131520] Call trace: [32806.134041] ___might_sleep+0x180/0x188 [32806.137980] __might_sleep+0x50/0x84 [32806.141653] __kmalloc_track_caller+0x80/0x3bc [32806.146215] __kmalloc_reserve+0x3c/0x88 [32806.150241] pskb_expand_head+0x74/0x288 [32806.154269] rmnet_egress_handler+0xb0/0x1d8 [32806.162239] rmnet_vnd_start_xmit+0xc8/0x13c [32806.166627] dev_hard_start_xmit+0x148/0x280 [32806.181181] sch_direct_xmit+0xa4/0x198 [32806.185125] __qdisc_run+0x1f8/0x310 [32806.188803] net_tx_action+0x23c/0x26c [32806.192655] __do_softirq+0x220/0x408 [32806.196420] do_softirq+0x4c/0x70 Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index 6908b26feb9e..1f98d65473cf 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -147,7 +147,7 @@ static int rmnet_map_egress_handler(struct sk_buff *skb, } if (skb_headroom(skb) < required_headroom) { - if (pskb_expand_head(skb, required_headroom, 0, GFP_KERNEL)) + if (pskb_expand_head(skb, required_headroom, 0, GFP_ATOMIC)) return -ENOMEM; } From ec405641e2b73160e26ef17580d0cf28565d146c Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 2 Oct 2018 18:52:03 -0600 Subject: [PATCH 437/499] net: qualcomm: rmnet: Fix incorrect allocation flag in receive path The incoming skb needs to be reallocated in case the headroom is not sufficient to adjust the ethernet header. This allocation needs to be atomic otherwise it results in this splat [<600601bb>] ___might_sleep+0x185/0x1a3 [<603f6314>] ? _raw_spin_unlock_irqrestore+0x0/0x27 [<60069bb0>] ? __wake_up_common_lock+0x95/0xd1 [<600602b0>] __might_sleep+0xd7/0xe2 [<60065598>] ? enqueue_task_fair+0x112/0x209 [<600eea13>] __kmalloc_track_caller+0x5d/0x124 [<600ee9b6>] ? __kmalloc_track_caller+0x0/0x124 [<602696d5>] __kmalloc_reserve.isra.34+0x30/0x7e [<603f629b>] ? _raw_spin_lock_irqsave+0x0/0x3d [<6026b744>] pskb_expand_head+0xbf/0x310 [<6025ca6a>] rmnet_rx_handler+0x7e/0x16b [<6025c9ec>] ? rmnet_rx_handler+0x0/0x16b [<6027ad0c>] __netif_receive_skb_core+0x301/0x96f [<60033c17>] ? set_signals+0x0/0x40 [<6027bbcb>] __netif_receive_skb+0x24/0x8e Fixes: 74692caf1b0b ("net: qualcomm: rmnet: Process packets over ethernet") Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index 1f98d65473cf..11167abe5934 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -113,7 +113,7 @@ rmnet_map_ingress_handler(struct sk_buff *skb, struct sk_buff *skbn; if (skb->dev->type == ARPHRD_ETHER) { - if (pskb_expand_head(skb, ETH_HLEN, 0, GFP_KERNEL)) { + if (pskb_expand_head(skb, ETH_HLEN, 0, GFP_ATOMIC)) { kfree_skb(skb); return; } From 64199fc0a46ba211362472f7f942f900af9492fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 30 Sep 2018 11:33:39 -0700 Subject: [PATCH 438/499] ipv4: fix use-after-free in ip_cmsg_recv_dstaddr() Caching ip_hdr(skb) before a call to pskb_may_pull() is buggy, do not do it. Fixes: 2efd4fca703a ("ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/ipv4/ip_sockglue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c0fe5ad996f2..26c36cccabdc 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { struct sockaddr_in sin; - const struct iphdr *iph = ip_hdr(skb); __be16 *ports; int end; @@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) ports = (__be16 *)skb_transport_header(skb); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = iph->daddr; + sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); From 9003b369499b3320d3f16145145b729fb6ff3bd4 Mon Sep 17 00:00:00 2001 From: "Alex Xu (Hello71)" Date: Sun, 30 Sep 2018 11:06:39 -0400 Subject: [PATCH 439/499] r8169: always autoneg on resume This affects at least versions 25 and 33, so assume all cards are broken and just renegotiate by default. Fixes: 10bc6a6042c9 ("r8169: fix autoneg issue on resume with RTL8168E") Signed-off-by: Alex Xu (Hello71) Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index a94b874982dc..9a5e2969df61 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4072,13 +4072,12 @@ static void rtl8169_init_phy(struct net_device *dev, struct rtl8169_private *tp) genphy_soft_reset(dev->phydev); - /* It was reported that chip version 33 ends up with 10MBit/Half on a + /* It was reported that several chips end up with 10MBit/Half on a * 1GBit link after resuming from S3. For whatever reason the PHY on - * this chip doesn't properly start a renegotiation when soft-reset. + * these chips doesn't properly start a renegotiation when soft-reset. * Explicitly requesting a renegotiation fixes this. */ - if (tp->mac_version == RTL_GIGA_MAC_VER_33 && - dev->phydev->autoneg == AUTONEG_ENABLE) + if (dev->phydev->autoneg == AUTONEG_ENABLE) phy_restart_aneg(dev->phydev); } From 4f166564014aba65ad6f15b612f6711fd0f117ee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 2 Oct 2018 21:26:50 -0700 Subject: [PATCH 440/499] x86/vdso: Only enable vDSO retpolines when enabled and supported When I fixed the vDSO build to use inline retpolines, I messed up the Makefile logic and made it unconditional. It should have depended on CONFIG_RETPOLINE and on the availability of compiler support. This broke the build on some older compilers. 
Reported-by: nikola.ciprich@linuxbox.cz Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: David Woodhouse Cc: Linus Torvalds Cc: Matt Rickard Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jason.vas.dias@gmail.com Cc: stable@vger.kernel.org Fixes: 2e549b2ee0e3 ("x86/vdso: Fix vDSO build if a retpoline is emitted") Link: http://lkml.kernel.org/r/08a1f29f2c238dd1f493945e702a521f8a5aa3ae.1538540801.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/Makefile | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index fa3f439f0a92..141d415a8c80 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,7 +68,13 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + CFL += $(RETPOLINE_VDSO_CFLAGS) +endif +endif $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) @@ -138,7 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) + +ifdef CONFIG_RETPOLINE +ifneq ($(RETPOLINE_VDSO_CFLAGS),) + KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) +endif +endif + $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \ From 88296bd42b4e9d24e138a68b337c235b5cac89a7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 2 Oct 2018 15:45:11 -0700 Subject: [PATCH 441/499] x86/cpu/amd: Remove unnecessary parentheses Clang warns when multiple pairs of parentheses are used for a single conditional statement. arch/x86/kernel/cpu/amd.c:925:14: warning: equality comparison with extraneous parentheses [-Wparentheses-equality] if ((c->x86 == 6)) { ~~~~~~~^~~~ arch/x86/kernel/cpu/amd.c:925:14: note: remove extraneous parentheses around the comparison to silence this warning if ((c->x86 == 6)) { ~ ^ ~ arch/x86/kernel/cpu/amd.c:925:14: note: use '=' to turn this equality comparison into an assignment if ((c->x86 == 6)) { ^~ = 1 warning generated. 
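Clang singles out the extra parentheses because doubled parentheses are the accepted idiom for marking an assignment inside a condition as intentional, so seeing them around '==' looks like a mistyped assignment. A small stand-alone example (not from the kernel tree) that reproduces both cases:

	#include <stdio.h>

	int main(void)
	{
		int x = 6;

		if ((x == 6))	/* clang -Wparentheses-equality fires here */
			puts("comparison");

		if ((x = 7))	/* doubled parens mark the assignment as intentional */
			puts("assignment");

		return 0;
	}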
Signed-off-by: Nathan Chancellor Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181002224511.14929-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/187 Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 22ab408177b2..eeea634bee0a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -922,7 +922,7 @@ static void init_amd(struct cpuinfo_x86 *c) static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ - if ((c->x86 == 6)) { + if (c->x86 == 6) { /* Duron Rev A0 */ if (c->x86_model == 3 && c->x86_stepping == 0) size = 64; From e4a02ed2aaf447fa849e3254bfdb3b9b01e1e520 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 2 Oct 2018 14:48:49 -0700 Subject: [PATCH 442/499] locking/ww_mutex: Fix runtime warning in the WW mutex selftest If CONFIG_WW_MUTEX_SELFTEST=y is enabled, booting an image in an arm64 virtual machine results in the following traceback if 8 CPUs are enabled: DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current) WARNING: CPU: 2 PID: 537 at kernel/locking/mutex.c:1033 __mutex_unlock_slowpath+0x1a8/0x2e0 ... Call trace: __mutex_unlock_slowpath() ww_mutex_unlock() test_cycle_work() process_one_work() worker_thread() kthread() ret_from_fork() If requesting b_mutex fails with -EDEADLK, the error variable is reassigned to the return value from calling ww_mutex_lock on a_mutex again. If this call fails, a_mutex is not locked. It is, however, unconditionally unlocked subsequently, causing the reported warning. Fix the problem by using two error variables. With this change, the selftest still fails as follows: cyclic deadlock not resolved, ret[7/8] = -35 However, the traceback is gone. 
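The failure mode is easier to see with the locking reduced to a toy model. The sketch below only mimics the control flow of the old test_cycle_work() with flags standing in for the mutexes and a simulated failing relock (-35 is -EDEADLK); it is not kernel code:

	#include <stdio.h>

	#define EDEADLK 35

	static int a_held, b_held;

	static int lock_a_again(void) { return -EDEADLK; /* simulate the relock failing */ }

	int main(void)
	{
		int err;

		a_held = 1;			/* ww_mutex_lock(&cycle->a_mutex) */
		err = -EDEADLK;			/* ww_mutex_lock(cycle->b_mutex) deadlocked */
		if (err == -EDEADLK) {
			a_held = 0;		/* ww_mutex_unlock(&cycle->a_mutex) */
			b_held = 1;		/* ww_mutex_lock_slow(cycle->b_mutex) */
			err = lock_a_again();	/* old code reused "err" for this result */
			if (!err)
				a_held = 1;
		}
		if (!err)
			b_held = 0;		/* ww_mutex_unlock(cycle->b_mutex) */
		if (!a_held)			/* old code then unlocked a_mutex unconditionally */
			puts("unlocking a_mutex that is not held -> DEBUG_LOCKS_WARN_ON");
		return 0;
	}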
Signed-off-by: Guenter Roeck Cc: Chris Wilson Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: d1b42b800e5d0 ("locking/ww_mutex: Add kselftests for resolving ww_mutex cyclic deadlocks") Link: http://lkml.kernel.org/r/1538516929-9734-1-git-send-email-linux@roeck-us.net Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0be047dbd897..65a3b7e55b9f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -260,7 +260,7 @@ static void test_cycle_work(struct work_struct *work) { struct test_cycle *cycle = container_of(work, typeof(*cycle), work); struct ww_acquire_ctx ctx; - int err; + int err, erra = 0; ww_acquire_init(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); @@ -270,17 +270,19 @@ static void test_cycle_work(struct work_struct *work) err = ww_mutex_lock(cycle->b_mutex, &ctx); if (err == -EDEADLK) { + err = 0; ww_mutex_unlock(&cycle->a_mutex); ww_mutex_lock_slow(cycle->b_mutex, &ctx); - err = ww_mutex_lock(&cycle->a_mutex, &ctx); + erra = ww_mutex_lock(&cycle->a_mutex, &ctx); } if (!err) ww_mutex_unlock(cycle->b_mutex); - ww_mutex_unlock(&cycle->a_mutex); + if (!erra) + ww_mutex_unlock(&cycle->a_mutex); ww_acquire_fini(&ctx); - cycle->result = err; + cycle->result = err ?: erra; } static int __test_cycle(unsigned int nthreads) From ad608fbcf166fec809e402d548761768f602702c Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 11 Sep 2018 05:32:37 -0400 Subject: [PATCH 443/499] media: v4l: event: Prevent freeing event subscriptions while accessed The event subscriptions are added to the subscribed event list while holding a spinlock, but that lock is subsequently released while still accessing the subscription object. This makes it possible to unsubscribe the event --- and freeing the subscription object's memory --- while the subscription object is simultaneously accessed. Prevent this by adding a mutex to serialise the event subscription and unsubscription. This also gives a guarantee to the callback ops that the add op has returned before the del op is called. This change also results in making the elems field less special: subscriptions are only added to the event list once they are fully initialised. Signed-off-by: Sakari Ailus Reviewed-by: Hans Verkuil Reviewed-by: Laurent Pinchart Cc: stable@vger.kernel.org # for 4.14 and up Fixes: c3b5b0241f62 ("V4L/DVB: V4L: Events: Add backend") Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-event.c | 38 +++++++++++++++------------- drivers/media/v4l2-core/v4l2-fh.c | 2 ++ include/media/v4l2-fh.h | 4 +++ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/media/v4l2-core/v4l2-event.c b/drivers/media/v4l2-core/v4l2-event.c index 127fe6eb91d9..a3ef1f50a4b3 100644 --- a/drivers/media/v4l2-core/v4l2-event.c +++ b/drivers/media/v4l2-core/v4l2-event.c @@ -115,14 +115,6 @@ static void __v4l2_event_queue_fh(struct v4l2_fh *fh, const struct v4l2_event *e if (sev == NULL) return; - /* - * If the event has been added to the fh->subscribed list, but its - * add op has not completed yet elems will be 0, treat this as - * not being subscribed. - */ - if (!sev->elems) - return; - /* Increase event sequence number on fh. 
*/ fh->sequence++; @@ -208,6 +200,7 @@ int v4l2_event_subscribe(struct v4l2_fh *fh, struct v4l2_subscribed_event *sev, *found_ev; unsigned long flags; unsigned i; + int ret = 0; if (sub->type == V4L2_EVENT_ALL) return -EINVAL; @@ -225,31 +218,36 @@ int v4l2_event_subscribe(struct v4l2_fh *fh, sev->flags = sub->flags; sev->fh = fh; sev->ops = ops; + sev->elems = elems; + + mutex_lock(&fh->subscribe_lock); spin_lock_irqsave(&fh->vdev->fh_lock, flags); found_ev = v4l2_event_subscribed(fh, sub->type, sub->id); - if (!found_ev) - list_add(&sev->list, &fh->subscribed); spin_unlock_irqrestore(&fh->vdev->fh_lock, flags); if (found_ev) { + /* Already listening */ kvfree(sev); - return 0; /* Already listening */ + goto out_unlock; } if (sev->ops && sev->ops->add) { - int ret = sev->ops->add(sev, elems); + ret = sev->ops->add(sev, elems); if (ret) { - sev->ops = NULL; - v4l2_event_unsubscribe(fh, sub); - return ret; + kvfree(sev); + goto out_unlock; } } - /* Mark as ready for use */ - sev->elems = elems; + spin_lock_irqsave(&fh->vdev->fh_lock, flags); + list_add(&sev->list, &fh->subscribed); + spin_unlock_irqrestore(&fh->vdev->fh_lock, flags); - return 0; +out_unlock: + mutex_unlock(&fh->subscribe_lock); + + return ret; } EXPORT_SYMBOL_GPL(v4l2_event_subscribe); @@ -288,6 +286,8 @@ int v4l2_event_unsubscribe(struct v4l2_fh *fh, return 0; } + mutex_lock(&fh->subscribe_lock); + spin_lock_irqsave(&fh->vdev->fh_lock, flags); sev = v4l2_event_subscribed(fh, sub->type, sub->id); @@ -305,6 +305,8 @@ int v4l2_event_unsubscribe(struct v4l2_fh *fh, if (sev && sev->ops && sev->ops->del) sev->ops->del(sev); + mutex_unlock(&fh->subscribe_lock); + kvfree(sev); return 0; diff --git a/drivers/media/v4l2-core/v4l2-fh.c b/drivers/media/v4l2-core/v4l2-fh.c index 3895999bf880..c91a7bd3ecfc 100644 --- a/drivers/media/v4l2-core/v4l2-fh.c +++ b/drivers/media/v4l2-core/v4l2-fh.c @@ -45,6 +45,7 @@ void v4l2_fh_init(struct v4l2_fh *fh, struct video_device *vdev) INIT_LIST_HEAD(&fh->available); INIT_LIST_HEAD(&fh->subscribed); fh->sequence = -1; + mutex_init(&fh->subscribe_lock); } EXPORT_SYMBOL_GPL(v4l2_fh_init); @@ -90,6 +91,7 @@ void v4l2_fh_exit(struct v4l2_fh *fh) return; v4l_disable_media_source(fh->vdev); v4l2_event_unsubscribe_all(fh); + mutex_destroy(&fh->subscribe_lock); fh->vdev = NULL; } EXPORT_SYMBOL_GPL(v4l2_fh_exit); diff --git a/include/media/v4l2-fh.h b/include/media/v4l2-fh.h index ea73fef8bdc0..8586cfb49828 100644 --- a/include/media/v4l2-fh.h +++ b/include/media/v4l2-fh.h @@ -38,10 +38,13 @@ struct v4l2_ctrl_handler; * @prio: priority of the file handler, as defined by &enum v4l2_priority * * @wait: event' s wait queue + * @subscribe_lock: serialise changes to the subscribed list; guarantee that + * the add and del event callbacks are orderly called * @subscribed: list of subscribed events * @available: list of events waiting to be dequeued * @navailable: number of available events at @available list * @sequence: event sequence number + * * @m2m_ctx: pointer to &struct v4l2_m2m_ctx */ struct v4l2_fh { @@ -52,6 +55,7 @@ struct v4l2_fh { /* Events */ wait_queue_head_t wait; + struct mutex subscribe_lock; struct list_head subscribed; struct list_head available; unsigned int navailable; From 4c9613ce556fdeb671e779668b627ea3a2b61728 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 3 Oct 2018 09:24:22 +0100 Subject: [PATCH 444/499] drm/i915: Handle incomplete Z_FINISH for compressed error states The final call to zlib_deflate(Z_FINISH) may require more output space to be allocated and so needs to 
re-invoked. Failure to do so in the current code leads to incomplete zlib streams (albeit intact due to the use of Z_SYNC_FLUSH) resulting in the occasional short object capture. v2: Check against overrunning our pre-allocated page array v3: Drop Z_SYNC_FLUSH entirely Testcase: igt/i915-error-capture.js Fixes: 0a97015d45ee ("drm/i915: Compress GPU objects in error state") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v4.10+ Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20181003082422.23214-1-chris@chris-wilson.co.uk (cherry picked from commit 83bc0f5b432f60394466deef16fc753e27371d0b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gpu_error.c | 88 +++++++++++++++++++-------- drivers/gpu/drm/i915/i915_gpu_error.h | 1 + 2 files changed, 64 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index f7f2aa71d8d9..a262a64f5625 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -232,6 +232,20 @@ static bool compress_init(struct compress *c) return true; } +static void *compress_next_page(struct drm_i915_error_object *dst) +{ + unsigned long page; + + if (dst->page_count >= dst->num_pages) + return ERR_PTR(-ENOSPC); + + page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); + if (!page) + return ERR_PTR(-ENOMEM); + + return dst->pages[dst->page_count++] = (void *)page; +} + static int compress_page(struct compress *c, void *src, struct drm_i915_error_object *dst) @@ -245,19 +259,14 @@ static int compress_page(struct compress *c, do { if (zstream->avail_out == 0) { - unsigned long page; + zstream->next_out = compress_next_page(dst); + if (IS_ERR(zstream->next_out)) + return PTR_ERR(zstream->next_out); - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) - return -ENOMEM; - - dst->pages[dst->page_count++] = (void *)page; - - zstream->next_out = (void *)page; zstream->avail_out = PAGE_SIZE; } - if (zlib_deflate(zstream, Z_SYNC_FLUSH) != Z_OK) + if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) return -EIO; } while (zstream->avail_in); @@ -268,19 +277,42 @@ static int compress_page(struct compress *c, return 0; } +static int compress_flush(struct compress *c, + struct drm_i915_error_object *dst) +{ + struct z_stream_s *zstream = &c->zstream; + + do { + switch (zlib_deflate(zstream, Z_FINISH)) { + case Z_OK: /* more space requested */ + zstream->next_out = compress_next_page(dst); + if (IS_ERR(zstream->next_out)) + return PTR_ERR(zstream->next_out); + + zstream->avail_out = PAGE_SIZE; + break; + + case Z_STREAM_END: + goto end; + + default: /* any error */ + return -EIO; + } + } while (1); + +end: + memset(zstream->next_out, 0, zstream->avail_out); + dst->unused = zstream->avail_out; + return 0; +} + static void compress_fini(struct compress *c, struct drm_i915_error_object *dst) { struct z_stream_s *zstream = &c->zstream; - if (dst) { - zlib_deflate(zstream, Z_FINISH); - dst->unused = zstream->avail_out; - } - zlib_deflateEnd(zstream); kfree(zstream->workspace); - if (c->tmp) free_page((unsigned long)c->tmp); } @@ -319,6 +351,12 @@ static int compress_page(struct compress *c, return 0; } +static int compress_flush(struct compress *c, + struct drm_i915_error_object *dst) +{ + return 0; +} + static void compress_fini(struct compress *c, struct drm_i915_error_object *dst) { @@ -917,6 +955,7 @@ i915_error_object_create(struct drm_i915_private *i915, unsigned long num_pages; struct sgt_iter iter; dma_addr_t dma; + 
int ret; if (!vma) return NULL; @@ -930,6 +969,7 @@ i915_error_object_create(struct drm_i915_private *i915, dst->gtt_offset = vma->node.start; dst->gtt_size = vma->node.size; + dst->num_pages = num_pages; dst->page_count = 0; dst->unused = 0; @@ -938,28 +978,26 @@ i915_error_object_create(struct drm_i915_private *i915, return NULL; } + ret = -EINVAL; for_each_sgt_dma(dma, iter, vma->pages) { void __iomem *s; - int ret; ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); ret = compress_page(&compress, (void __force *)s, dst); io_mapping_unmap_atomic(s); - if (ret) - goto unwind; + break; } - goto out; -unwind: - while (dst->page_count--) - free_page((unsigned long)dst->pages[dst->page_count]); - kfree(dst); - dst = NULL; + if (ret || compress_flush(&compress, dst)) { + while (dst->page_count--) + free_page((unsigned long)dst->pages[dst->page_count]); + kfree(dst); + dst = NULL; + } -out: compress_fini(&compress, dst); ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); return dst; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index f893a4e8b783..8710fb18ed74 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -135,6 +135,7 @@ struct i915_gpu_state { struct drm_i915_error_object { u64 gtt_offset; u64 gtt_size; + int num_pages; int page_count; int unused; u32 *pages[0]; From 2cc543f5cd6deda27ef463686fa08c16c8c0990b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 3 Oct 2018 12:45:56 +0200 Subject: [PATCH 445/499] sctp: fix fall-through annotation Replace "fallthru" with a proper "fall through" annotation. This fix is part of the ongoing efforts to enabling -Wimplicit-fallthrough Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d74d00b29942..42191ed9902b 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1048,7 +1048,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, if (!ctx->packet || !ctx->packet->has_cookie_echo) return; - /* fallthru */ + /* fall through */ case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: From 4233cfe6ec4683497d7318f55ce7617e97f2e610 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 3 Oct 2018 11:30:35 -0700 Subject: [PATCH 446/499] ixgbe: check return value of napi_complete_done() The NIC driver should only enable interrupts when napi_complete_done() returns true. This patch adds the check for ixgbe. Cc: stable@vger.kernel.org # 4.10+ Suggested-by: Eric Dumazet Signed-off-by: Song Liu Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f27d73a7bf16..6cdd58d9d461 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -3196,11 +3196,13 @@ int ixgbe_poll(struct napi_struct *napi, int budget) return budget; /* all work done, exit the polling mode */ - napi_complete_done(napi, work_done); - if (adapter->rx_itr_setting & 1) - ixgbe_set_itr(q_vector); - if (!test_bit(__IXGBE_DOWN, &adapter->state)) - ixgbe_irq_enable_queues(adapter, BIT_ULL(q_vector->v_idx)); + if (likely(napi_complete_done(napi, work_done))) { + if (adapter->rx_itr_setting & 1) + ixgbe_set_itr(q_vector); + if (!test_bit(__IXGBE_DOWN, &adapter->state)) + ixgbe_irq_enable_queues(adapter, + BIT_ULL(q_vector->v_idx)); + } return min(work_done, budget - 1); } From 6579804c431712d56956a63b1a01509441cc6800 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 4 Oct 2018 14:51:11 +1000 Subject: [PATCH 447/499] KVM: PPC: Book3S HV: Avoid crash from THP collapse during radix page fault Commit 71d29f43b633 ("KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size", 2018-09-11) added a call to __find_linux_pte() and a dereference of the returned PTE pointer to the radix page fault path in the common case where the page is normal system memory. Previously, __find_linux_pte() was only called for mappings to physical addresses which don't have a page struct (e.g. memory-mapped I/O) or where the page struct is marked as reserved memory. This exposes us to the possibility that the returned PTE pointer could be NULL, for example in the case of a concurrent THP collapse operation. Dereferencing the returned NULL pointer causes a host crash. To fix this, we check for NULL, and if it is NULL, we retry the operation by returning to the guest, with the expectation that it will generate the same page fault again (unless of course it has been fixed up by another CPU in the meantime). Fixes: 71d29f43b633 ("KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 933c574e1cf7..998f8d089ac7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -646,6 +646,16 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ local_irq_disable(); ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + /* + * If the PTE disappeared temporarily due to a THP + * collapse, just return and let the guest try again. + */ + if (!ptep) { + local_irq_enable(); + if (page) + put_page(page); + return RESUME_GUEST; + } pte = *ptep; local_irq_enable(); From 709ae62e8e6d9ac4df7dadb3b8ae432675c45ef9 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 4 Oct 2018 11:39:42 +0800 Subject: [PATCH 448/499] ALSA: hda/realtek - Cannot adjust speaker's volume on Dell XPS 27 7760 The issue is the same as commit dd9aa335c880 ("ALSA: hda/realtek - Can't adjust speaker's volume on a Dell AIO"), the output requires to connect to a node with Amp-out capability. Applying the same fixup ALC298_FIXUP_SPK_VOLUME can fix the issue. 
BugLink: https://bugs.launchpad.net/bugs/1775068 Signed-off-by: Kai-Heng Feng Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 1d117f00d04d..3ac7ba9b342d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6409,6 +6409,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0706, "Dell Inspiron 7559", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), SND_PCI_QUIRK(0x1028, 0x0725, "Dell Inspiron 3162", ALC255_FIXUP_DELL_SPK_NOISE), SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), + SND_PCI_QUIRK(0x1028, 0x075c, "Dell XPS 27 7760", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x07b0, "Dell Precision 7520", ALC295_FIXUP_DISABLE_DAC3), SND_PCI_QUIRK(0x1028, 0x0798, "Dell Inspiron 17 7000 Gaming", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), From 02e425668f5c9deb42787d10001a3b605993ad15 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 3 Oct 2018 16:23:49 -0700 Subject: [PATCH 449/499] x86/vdso: Fix vDSO syscall fallback asm constraint regression When I added the missing memory outputs, I failed to update the index of the first argument (ebx) on 32-bit builds, which broke the fallbacks. Somehow I must have screwed up my testing or gotten lucky. Add another test to cover gettimeofday() as well. Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 715bd9d12f84 ("x86/vdso: Fix asm constraints on vDSO syscall fallbacks") Link: http://lkml.kernel.org/r/21bd45ab04b6d838278fa5bebfa9163eceffa13c.1538608971.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vclock_gettime.c | 8 +-- tools/testing/selftests/x86/test_vdso.c | 73 +++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 134e2d2e8add..e48ca3afa091 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -68,11 +68,11 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[clock], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*ts) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) : "memory", "edx"); return ret; } @@ -83,11 +83,11 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[tv], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret), "=m" (*tv), "=m" (*tz) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) : "memory", "edx"); return ret; } diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c index 49f7294fb382..35edd61d1663 100644 --- a/tools/testing/selftests/x86/test_vdso.c +++ b/tools/testing/selftests/x86/test_vdso.c @@ -36,6 +36,10 @@ typedef int (*vgettime_t)(clockid_t, struct timespec *); vgettime_t vdso_clock_gettime; +typedef long (*vgtod_t)(struct timeval *tv, struct timezone *tz); + +vgtod_t vdso_gettimeofday; + typedef long (*getcpu_t)(unsigned *, unsigned *, void *); getcpu_t vgetcpu; @@ -104,6 +108,11 @@ static 
void fill_function_pointers() vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); if (!vdso_clock_gettime) printf("Warning: failed to find clock_gettime in vDSO\n"); + + vdso_gettimeofday = (vgtod_t)dlsym(vdso, "__vdso_gettimeofday"); + if (!vdso_gettimeofday) + printf("Warning: failed to find gettimeofday in vDSO\n"); + } static long sys_getcpu(unsigned * cpu, unsigned * node, @@ -117,6 +126,11 @@ static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) return syscall(__NR_clock_gettime, id, ts); } +static inline int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return syscall(__NR_gettimeofday, tv, tz); +} + static void test_getcpu(void) { printf("[RUN]\tTesting getcpu...\n"); @@ -177,6 +191,14 @@ static bool ts_leq(const struct timespec *a, const struct timespec *b) return a->tv_nsec <= b->tv_nsec; } +static bool tv_leq(const struct timeval *a, const struct timeval *b) +{ + if (a->tv_sec != b->tv_sec) + return a->tv_sec < b->tv_sec; + else + return a->tv_usec <= b->tv_usec; +} + static char const * const clocknames[] = { [0] = "CLOCK_REALTIME", [1] = "CLOCK_MONOTONIC", @@ -248,11 +270,62 @@ static void test_clock_gettime(void) test_one_clock_gettime(INT_MAX, "invalid"); } +static void test_gettimeofday(void) +{ + struct timeval start, vdso, end; + struct timezone sys_tz, vdso_tz; + int vdso_ret, end_ret; + + if (!vdso_gettimeofday) + return; + + printf("[RUN]\tTesting gettimeofday...\n"); + + if (sys_gettimeofday(&start, &sys_tz) < 0) { + printf("[FAIL]\tsys_gettimeofday failed (%d)\n", errno); + nerrs++; + return; + } + + vdso_ret = vdso_gettimeofday(&vdso, &vdso_tz); + end_ret = sys_gettimeofday(&end, NULL); + + if (vdso_ret != 0 || end_ret != 0) { + printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n", + vdso_ret, errno); + nerrs++; + return; + } + + printf("\t%llu.%06ld %llu.%06ld %llu.%06ld\n", + (unsigned long long)start.tv_sec, start.tv_usec, + (unsigned long long)vdso.tv_sec, vdso.tv_usec, + (unsigned long long)end.tv_sec, end.tv_usec); + + if (!tv_leq(&start, &vdso) || !tv_leq(&vdso, &end)) { + printf("[FAIL]\tTimes are out of sequence\n"); + nerrs++; + } + + if (sys_tz.tz_minuteswest == vdso_tz.tz_minuteswest && + sys_tz.tz_dsttime == vdso_tz.tz_dsttime) { + printf("[OK]\ttimezones match: minuteswest=%d, dsttime=%d\n", + sys_tz.tz_minuteswest, sys_tz.tz_dsttime); + } else { + printf("[FAIL]\ttimezones do not match\n"); + nerrs++; + } + + /* And make sure that passing NULL for tz doesn't crash. */ + vdso_gettimeofday(&vdso, NULL); +} + int main(int argc, char **argv) { fill_function_pointers(); test_clock_gettime(); + test_gettimeofday(); /* * Test getcpu() last so that, if something goes wrong setting affinity, From fd6b6d9b82f97a851fb0078201ddc38fe9728cda Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 1 Oct 2018 14:25:34 -0700 Subject: [PATCH 450/499] KVM: VMX: check for existence of secondary exec controls before accessing Return early from vmx_set_virtual_apic_mode() if the processor doesn't support VIRTUALIZE_APIC_ACCESSES or VIRTUALIZE_X2APIC_MODE, both of which reside in SECONDARY_VM_EXEC_CONTROL. This eliminates warnings due to VMWRITEs to SECONDARY_VM_EXEC_CONTROL (VMCS field 401e) failing on processors without secondary exec controls. Remove the similar check for TPR shadowing as it is incorporated in the flexpriority_enabled check and the APIC-related code in vmx_update_msr_bitmap() is further gated by VIRTUALIZE_X2APIC_MODE. 
Reported-by: Gerhard Wiesinger Fixes: 8d860bbeedef ("kvm: vmx: Basic APIC virtualization controls have three settings") Cc: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b16b2664cfe1..764ae031054f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10214,15 +10214,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_need_tpr_shadow(vcpu)) - return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); From 2cf7ea9f40fabee0f8b40db4eb2d1e85cc6c0a95 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 3 Oct 2018 10:34:00 +0200 Subject: [PATCH 451/499] KVM: VMX: hide flexpriority from guest when disabled at the module level As of commit 8d860bbeedef ("kvm: vmx: Basic APIC virtualization controls have three settings"), KVM will disable VIRTUALIZE_APIC_ACCESSES when a nested guest writes APIC_BASE MSR and kvm-intel.flexpriority=0, whereas previously KVM would allow a nested guest to enable VIRTUALIZE_APIC_ACCESSES so long as it's supported in hardware. That is, KVM now advertises VIRTUALIZE_APIC_ACCESSES to a guest but doesn't (always) allow setting it when kvm-intel.flexpriority=0, and may even initially allow the control and then clear it when the nested guest writes APIC_BASE MSR, which is decidedly odd even if it doesn't cause functional issues. Hide the control completely when the module parameter is cleared. reported-by: Sean Christopherson Fixes: 8d860bbeedef ("kvm: vmx: Basic APIC virtualization controls have three settings") Cc: Jim Mattson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 764ae031054f..55b62760b694 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3589,12 +3589,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high); msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* * We can emulate "VMCS shadowing," even if the hardware * doesn't support it. @@ -3651,6 +3651,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, From 601350ff58d5415a001769532f6b8333820e5786 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 28 Sep 2018 21:00:48 +0300 Subject: [PATCH 452/499] ovl: fix access beyond unterminated strings KASAN detected slab-out-of-bounds access in printk from overlayfs, because string format used %*s instead of %.*s. 
> BUG: KASAN: slab-out-of-bounds in string+0x298/0x2d0 lib/vsprintf.c:604 > Read of size 1 at addr ffff8801c36c66ba by task syz-executor2/27811 > > CPU: 0 PID: 27811 Comm: syz-executor2 Not tainted 4.19.0-rc5+ #36 ... > printk+0xa7/0xcf kernel/printk/printk.c:1996 > ovl_lookup_index.cold.15+0xe8/0x1f8 fs/overlayfs/namei.c:689 Reported-by: syzbot+376cea2b0ef340db3dd4@syzkaller.appspotmail.com Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi Fixes: 359f392ca53e ("ovl: lookup index entry for copy up origin") Cc: # v4.13 --- fs/overlayfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index f28711846dd6..9c0ca6a7becf 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -686,7 +686,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" + pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); From 1a8f8d2a443ef9ad9a3065ba8c8119df714240fa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 4 Oct 2018 14:49:10 +0200 Subject: [PATCH 453/499] ovl: fix format of setxattr debug Format has a typo: it was meant to be "%.*s", not "%*s". But at some point callers grew nonprintable values as well, so use "%*pE" instead with a maximized length. Reported-by: Amir Goldstein Signed-off-by: Miklos Szeredi Fixes: 3a1e819b4e80 ("ovl: store file handle of lower inode on copy up") Cc: # v4.12 --- fs/overlayfs/overlayfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index f61839e1054c..a3c0d9584312 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -152,8 +152,8 @@ static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int err = vfs_setxattr(dentry, name, value, size, flags); - pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", - dentry, name, (int) size, (char *) value, flags, err); + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); return err; } From 7e7126846c95a34f98a1524d5c473af1f0783735 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 3 Oct 2018 13:44:26 +0200 Subject: [PATCH 454/499] kvm: nVMX: fix entry with pending interrupt if APICv is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b5861e5cf2fcf83031ea3e26b0a69d887adf7d21 introduced a check on the interrupt-window and NMI-window CPU execution controls in order to inject an external interrupt vmexit before the first guest instruction executes. However, when APIC virtualization is enabled the host does not need a vmexit in order to inject an interrupt at the next interrupt window; instead, it just places the interrupt vector in RVI and the processor will inject it as soon as possible. Therefore, on machines with APICv it is not enough to check the CPU execution controls: the same scenario can also happen if RVI>vPPR. 
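The extra condition only compares the priority class, i.e. the upper four bits of each value, mirroring how the APIC decides whether a pending vector is actually deliverable. A minimal sketch of the check, assuming rvi has been read from GUEST_INTR_STATUS and vppr from the virtual APIC page as in the diff below:

  static bool apicv_interrupt_deliverable(u8 rvi, u8 vppr)
  {
  	/* Deliverable only if RVI's priority class exceeds vPPR's. */
  	return (rvi & 0xf0) > (vppr & 0xf0);
  }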
Fixes: b5861e5cf2fcf83031ea3e26b0a69d887adf7d21 Reviewed-by: Nikita Leshchenko Cc: Sean Christopherson Cc: Liran Alon Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 55b62760b694..612fd17be635 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6162,6 +6162,11 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } +static u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6174,7 +6179,7 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) return false; - rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff; + rvi = vmx_get_rvi(); vapic_page = kmap(vmx->nested.virtual_apic_page); vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); @@ -10349,6 +10354,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -12593,10 +12606,13 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; - u32 vmcs01_cpu_exec_ctrl; + bool evaluate_pending_interrupts; int r = 0; - vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); enter_guest_mode(vcpu); @@ -12644,16 +12660,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) * to L1 or delivered directly to L2 (e.g. In case L1 don't * intercept EXTERNAL_INTERRUPT). * - * Usually this would be handled by L0 requesting a - * IRQ/NMI window by setting VMCS accordingly. However, - * this setting was done on VMCS01 and now VMCS02 is active - * instead. Thus, we force L0 to perform pending event - * evaluation by requesting a KVM_REQ_EVENT. + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request, or checking RVI during evaluation of + * pending virtual interrupts. However, this setting was done + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. */ - if (vmcs01_cpu_exec_ctrl & - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) { + if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, vcpu); - } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point From 987bf116445db5d63a5c2ed94c4479687d9c9973 Mon Sep 17 00:00:00 2001 From: Shirish S Date: Mon, 24 Sep 2018 19:01:47 +0530 Subject: [PATCH 455/499] drm/amd/display: Signal hw_done() after waiting for flip_done() In amdgpu_dm_commit_tail(), wait until flip_done() is signaled before we signal hw_done(). 
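Concretely, the tail of the commit sequence changes from "signal hw_done(), then wait for flip_done()" to the reverse order. A rough sketch of the reordered tail is shown here; the [Why]/[How] sections below explain the race it closes:

  	/* Wait for all pending flips on the updated CRTCs first ... */
  	if (wait_for_vblank)
  		drm_atomic_helper_wait_for_flip_done(dev, state);

  	/*
  	 * ... and only then signal hw_done(), so that no other commit can
  	 * swap and free the CRTC state this worker still dereferences.
  	 */
  	drm_atomic_helper_commit_hw_done(state);

  	drm_atomic_helper_cleanup_planes(dev, state);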
[Why] This is to temporarily address a paging error that occurs when a nonblocking commit contends with another commit, particularly in a mirrored display configuration where at least 2 CRTCs are updated. The error occurs in drm_atomic_helper_wait_for_flip_done(), when we attempt to access the contents of new_crtc_state->commit. Here's the sequence for a mirrored 2 display setup (irrelevant steps left out for clarity): **THREAD 1** | **THREAD 2** | Initialize atomic state for flip | | Queue worker | ... | Do work for flip | | Signal hw_done() on CRTC 1 | Signal hw_done() on CRTC 2 | | Wait for flip_done() on CRTC 1 <---- **PREEMPTED BY THREAD 1** Initialize atomic state for cursor | update (1) | | Do cursor update work on both CRTCs | | Clear atomic state (2) | **DONE** | ... | | Wait for flip_done() on CRTC 2 | *ERROR* | The issue starts with (1). When the atomic state is initialized, the current CRTC states are duplicated to be the new_crtc_states, and referenced to be the old_crtc_states. (The new_crtc_states are to be filled with update data.) Some things to note: * Due to the mirrored configuration, the cursor updates on both CRTCs. * At this point, the pflip IRQ has already been handled, and flip_done signaled on all CRTCs. The cursor commit can therefore continue. * The old_crtc_states used by the cursor update are the **same states** as the new_crtc_states used by the flip worker. At (2), the old_crtc_state is freed (*), and the cursor commit completes. We then context switch back to the flip worker, where we attempt to access the new_crtc_state->commit object. This is problematic, as this state has already been freed. (*) Technically, 'state->crtcs[i].state' is freed, which was made to reference old_crtc_state in drm_atomic_helper_swap_state() [How] By moving hw_done() after wait_for_flip_done(), we're guaranteed that the new_crtc_state (from the flip worker's perspective) still exists. This is because any other commit will be blocked, waiting for the hw_done() signal. Note that both the i915 and imx drivers have this sequence flipped already, masking this problem. Signed-off-by: Shirish S Signed-off-by: Leo Li Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 96875950845a..6903fe6c894b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4633,12 +4633,18 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) } spin_unlock_irqrestore(&adev->ddev->event_lock, flags); - /* Signal HW programming completion */ - drm_atomic_helper_commit_hw_done(state); if (wait_for_vblank) drm_atomic_helper_wait_for_flip_done(dev, state); + /* + * FIXME: + * Delay hw_done() until flip_done() is signaled. This is to block + * another commit from freeing the CRTC state while we're still + * waiting on flip_done. 
+ */ + drm_atomic_helper_commit_hw_done(state); + drm_atomic_helper_cleanup_planes(dev, state); /* Finally, drop a runtime PM reference for each newly disabled CRTC, From 11b29c9e25788d0afb2ddb67bcd89424bd25f2f7 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 2 Oct 2018 18:41:12 -0400 Subject: [PATCH 456/499] drm/amdkfd: Fix incorrect use of process->mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This mm_struct pointer should never be dereferenced. If running in a user thread, just use current->mm. If running in a kernel worker use get_task_mm to get a safe reference to the mm_struct. Reviewed-by: Oded Gabbay Acked-by: Christian König Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ec0d62a16e53..4f22e745df51 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -358,8 +358,8 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { - int retval; struct mqd_manager *mqd_mgr; + int retval; mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); if (!mqd_mgr) @@ -387,8 +387,12 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, if (!q->properties.is_active) return 0; - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, - &q->properties, q->process->mm); + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, + &q->properties, current->mm); if (retval) goto out_uninit_mqd; @@ -545,9 +549,15 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = map_queues_cpsch(dqm); else if (q->properties.is_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, - &q->properties, q->process->mm); + q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, + q->pipe, q->queue, + &q->properties, current->mm); + } out_unlock: dqm_unlock(dqm); @@ -653,6 +663,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { + struct mm_struct *mm = NULL; struct queue *q; struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; @@ -686,6 +697,15 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, kfd_flush_tlb(pdd); } + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. 
+ */ + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) { + retval = -EFAULT; + goto out; + } + /* activate all active queues on the qpd */ list_for_each_entry(q, &qpd->queues_list, list) { if (!q->properties.is_evicted) @@ -700,14 +720,15 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, q->properties.is_evicted = false; q->properties.is_active = true; retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, - q->queue, &q->properties, - q->process->mm); + q->queue, &q->properties, mm); if (retval) goto out; dqm->queue_count++; } qpd->evicted = 0; out: + if (mm) + mmput(mm); dqm_unlock(dqm); return retval; } From f3c84a8e3e922afdcbc55f04df8fdf8a548f5a21 Mon Sep 17 00:00:00 2001 From: Nir Dotan Date: Thu, 4 Oct 2018 15:48:02 +0000 Subject: [PATCH 457/499] mlxsw: pci: Derive event type from event queue number Due to a hardware issue in Spectrum-2, the field event_type of the event queue element (EQE) has become reserved. It was used to distinguish between command interface completion events and completion events. Use queue number to determine event type, as command interface completion events are always received on EQ0 and mlxsw driver maps completion events to EQ1. Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Signed-off-by: Nir Dotan Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 4d271fb3de3d..5890fdfd62c3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -718,14 +718,17 @@ static void mlxsw_pci_eq_tasklet(unsigned long data) memset(&active_cqns, 0, sizeof(active_cqns)); while ((eqe = mlxsw_pci_eq_sw_eqe_get(q))) { - u8 event_type = mlxsw_pci_eqe_event_type_get(eqe); - switch (event_type) { - case MLXSW_PCI_EQE_EVENT_TYPE_CMD: + /* Command interface completion events are always received on + * queue MLXSW_PCI_EQ_ASYNC_NUM (EQ0) and completion events + * are mapped to queue MLXSW_PCI_EQ_COMP_NUM (EQ1). + */ + switch (q->num) { + case MLXSW_PCI_EQ_ASYNC_NUM: mlxsw_pci_eq_cmd_event(mlxsw_pci, eqe); q->u.eq.ev_cmd_count++; break; - case MLXSW_PCI_EQE_EVENT_TYPE_COMP: + case MLXSW_PCI_EQ_COMP_NUM: cqn = mlxsw_pci_eqe_cqn_get(eqe); set_bit(cqn, active_cqns); cq_handle = true; From c360867ec46a4ec5cb19e5c329d65dff522cc69d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 4 Oct 2018 15:48:03 +0000 Subject: [PATCH 458/499] mlxsw: spectrum: Delete RIF when VLAN device is removed In commit 602b74eda813 ("mlxsw: spectrum_switchdev: Do not leak RIFs when removing bridge") I handled the case where RIFs created for VLAN devices were not properly cleaned up when their real device (a bridge) was removed. However, I forgot to handle the case of the VLAN device itself being removed. Do so now when the VLAN device is being unlinked from its real device. Fixes: 99f44bb3527b ("mlxsw: spectrum: Enable L3 interfaces on top of bridge devices") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reported-by: Artem Shvorin Tested-by: Artem Shvorin Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index b492152c8881..30bb2c533cec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4845,6 +4845,8 @@ static int mlxsw_sp_netdevice_bridge_event(struct net_device *br_dev, upper_dev = info->upper_dev; if (info->linking) break; + if (is_vlan_dev(upper_dev)) + mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, upper_dev); if (netif_is_macvlan(upper_dev)) mlxsw_sp_rif_macvlan_del(mlxsw_sp, upper_dev); break; From 69e445ab8b66a9f30519842ef18be555d3ee9b51 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 4 Oct 2018 11:08:12 +0200 Subject: [PATCH 459/499] PM / core: Clear the direct_complete flag on errors If __device_suspend() runs asynchronously (in which case the device passed to it is in dpm_suspended_list at that point) and it returns early on an error or pending wakeup, and the power.direct_complete flag has been set for the device already, the subsequent device_resume() will be confused by that and it will call pm_runtime_enable() incorrectly, as runtime PM has not been disabled for the device by __device_suspend(). To avoid that, clear power.direct_complete if __device_suspend() is not going to disable runtime PM for the device before returning. Fixes: aae4518b3124 (PM / sleep: Mechanism to avoid resuming runtime-suspended devices unnecessarily) Reported-by: Al Cooper Tested-by: Al Cooper Reviewed-by: Ulf Hansson Cc: 3.16+ # 3.16+ Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 3f68e2919dc5..a690fd400260 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1713,8 +1713,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_wait_for_subordinate(dev, async); - if (async_error) + if (async_error) { + dev->power.direct_complete = false; goto Complete; + } /* * If a device configured to wake up the system from sleep states @@ -1726,6 +1728,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) pm_wakeup_event(dev, 0); if (pm_wakeup_pending()) { + dev->power.direct_complete = false; async_error = -EBUSY; goto Complete; } From 4561ffca88c546f96367f94b8f1e4715a9c62314 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Sep 2018 16:19:30 -0400 Subject: [PATCH 460/499] dm cache metadata: ignore hints array being too small during resize Commit fd2fa9541 ("dm cache metadata: save in-core policy_hint_size to on-disk superblock") enabled previously written policy hints to be used after a cache is reactivated. But in doing so the cache metadata's hint array was left exposed to out of bounds access because on resize the metadata's on-disk hint array wasn't ever extended. Fix this by ignoring that there are no on-disk hints associated with the newly added cache blocks. An expanded on-disk hint array is later rewritten upon the next clean shutdown of the cache. 
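In code terms, running the hint cursor off the end of the (smaller) on-disk array is no longer a fatal load error; the cursor is closed and the remaining, newly added cache blocks are simply loaded without hints. An annotated sketch of that pattern, using the same dm_array cursor helpers as the hunk below:

  	if (hints_valid) {
  		r = dm_array_cursor_next(&cmd->hint_cursor);
  		if (r) {
  			/*
  			 * The on-disk hint array is shorter than the resized
  			 * cache: stop consuming hints but keep loading the
  			 * mappings.  A full-size hint array is written back
  			 * on the next clean shutdown.
  			 */
  			dm_array_cursor_end(&cmd->hint_cursor);
  			hints_valid = false;
  		}
  	}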
Fixes: fd2fa9541 ("dm cache metadata: save in-core policy_hint_size to on-disk superblock") Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 69dddeab124c..5936de71883f 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1455,8 +1455,8 @@ static int __load_mappings(struct dm_cache_metadata *cmd, if (hints_valid) { r = dm_array_cursor_next(&cmd->hint_cursor); if (r) { - DMERR("dm_array_cursor_next for hint failed"); - goto out; + dm_array_cursor_end(&cmd->hint_cursor); + hints_valid = false; } } From 5d07384a666d4b2f781dc056bfeec2c27fbdf383 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 25 Sep 2018 20:56:02 -0400 Subject: [PATCH 461/499] dm cache: fix resize crash if user doesn't reload cache table A reload of the cache's DM table is needed during resize because otherwise a crash will occur when attempting to access smq policy entries associated with the portion of the cache that was recently extended. The reason is cache-size based data structures in the policy will not be resized, the only way to safely extend the cache is to allow for a proper cache policy initialization that occurs when the cache table is loaded. For example the smq policy's space_init(), init_allocator(), calc_hotspot_params() must be sized based on the extended cache size. The fix for this is to disallow cache resizes of this pattern: 1) suspend "cache" target's device 2) resize the fast device used for the cache 3) resume "cache" target's device Instead, the last step must be a full reload of the cache's DM table. Fixes: 66a636356 ("dm cache: add stochastic-multi-queue (smq) policy") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index a53413371725..e13d991e9fb5 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3009,8 +3009,13 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) static bool can_resize(struct cache *cache, dm_cblock_t new_size) { - if (from_cblock(new_size) > from_cblock(cache->cache_size)) - return true; + if (from_cblock(new_size) > from_cblock(cache->cache_size)) { + if (cache->sized) { + DMERR("%s: unable to extend cache due to missing cache table reload", + cache_device_name(cache)); + return false; + } + } /* * We can't drop a dirty block when shrinking the cache. From f7b2a56e1f3dcbdb4cf09b2b63e859ffe0e09df8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 28 Sep 2018 17:04:30 -0600 Subject: [PATCH 462/499] net/usb: cancel pending work when unbinding smsc75xx Cancel pending work before freeing smsc75xx private data structure during binding. This fixes the following crash in the driver: BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 IP: mutex_lock+0x2b/0x3f Workqueue: events smsc75xx_deferred_multicast_write [smsc75xx] task: ffff8caa83e85700 task.stack: ffff948b80518000 RIP: 0010:mutex_lock+0x2b/0x3f Call Trace: smsc75xx_deferred_multicast_write+0x40/0x1af [smsc75xx] process_one_work+0x18d/0x2fc worker_thread+0x1a2/0x269 ? pr_cont_work+0x58/0x58 kthread+0xfa/0x10a ? pr_cont_work+0x58/0x58 ? rcu_read_unlock_sched_notrace+0x48/0x48 ret_from_fork+0x22/0x40 Signed-off-by: Yu Zhao Signed-off-by: David S. 
Miller --- drivers/net/usb/smsc75xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index e5a4cbb366dc..ec287c9741e8 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1520,6 +1520,7 @@ static void smsc75xx_unbind(struct usbnet *dev, struct usb_interface *intf) { struct smsc75xx_priv *pdata = (struct smsc75xx_priv *)(dev->data[0]); if (pdata) { + cancel_work_sync(&pdata->set_multicast); netif_dbg(dev, ifdown, dev->net, "free pdata\n"); kfree(pdata); pdata = NULL; From 471b83bd8bbe4e89743683ef8ecb78f7029d8288 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 1 Oct 2018 12:21:59 +0300 Subject: [PATCH 463/499] team: Forbid enslaving team device to itself team's ndo_add_slave() acquires 'team->lock' and later tries to open the newly enslaved device via dev_open(). This emits a 'NETDEV_UP' event that causes the VLAN driver to add VLAN 0 on the team device. team's ndo_vlan_rx_add_vid() will also try to acquire 'team->lock' and deadlock. Fix this by checking early at the enslavement function that a team device is not being enslaved to itself. A similar check was added to the bond driver in commit 09a89c219baf ("bonding: disallow enslaving a bond to itself"). WARNING: possible recursive locking detected 4.18.0-rc7+ #176 Not tainted -------------------------------------------- syz-executor4/6391 is trying to acquire lock: (____ptrval____) (&team->lock){+.+.}, at: team_vlan_rx_add_vid+0x3b/0x1e0 drivers/net/team/team.c:1868 but task is already holding lock: (____ptrval____) (&team->lock){+.+.}, at: team_add_slave+0xdb/0x1c30 drivers/net/team/team.c:1947 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&team->lock); lock(&team->lock); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by syz-executor4/6391: #0: (____ptrval____) (rtnl_mutex){+.+.}, at: rtnl_lock net/core/rtnetlink.c:77 [inline] #0: (____ptrval____) (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x412/0xc30 net/core/rtnetlink.c:4662 #1: (____ptrval____) (&team->lock){+.+.}, at: team_add_slave+0xdb/0x1c30 drivers/net/team/team.c:1947 stack backtrace: CPU: 1 PID: 6391 Comm: syz-executor4 Not tainted 4.18.0-rc7+ #176 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_deadlock_bug kernel/locking/lockdep.c:1765 [inline] check_deadlock kernel/locking/lockdep.c:1809 [inline] validate_chain kernel/locking/lockdep.c:2405 [inline] __lock_acquire.cold.64+0x1fb/0x486 kernel/locking/lockdep.c:3435 lock_acquire+0x1e4/0x540 kernel/locking/lockdep.c:3924 __mutex_lock_common kernel/locking/mutex.c:757 [inline] __mutex_lock+0x176/0x1820 kernel/locking/mutex.c:894 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:909 team_vlan_rx_add_vid+0x3b/0x1e0 drivers/net/team/team.c:1868 vlan_add_rx_filter_info+0x14a/0x1d0 net/8021q/vlan_core.c:210 __vlan_vid_add net/8021q/vlan_core.c:278 [inline] vlan_vid_add+0x63e/0x9d0 net/8021q/vlan_core.c:308 vlan_device_event.cold.12+0x2a/0x2f net/8021q/vlan.c:381 notifier_call_chain+0x180/0x390 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1735 call_netdevice_notifiers net/core/dev.c:1753 [inline] dev_open+0x173/0x1b0 net/core/dev.c:1433 team_port_add drivers/net/team/team.c:1219 [inline] 
team_add_slave+0xa8b/0x1c30 drivers/net/team/team.c:1948 do_set_master+0x1c9/0x220 net/core/rtnetlink.c:2248 do_setlink+0xba4/0x3e10 net/core/rtnetlink.c:2382 rtnl_setlink+0x2a9/0x400 net/core/rtnetlink.c:2636 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4665 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2455 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4683 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343 netlink_sendmsg+0xa18/0xfd0 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:642 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:652 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2126 __sys_sendmsg+0x11d/0x290 net/socket.c:2164 __do_sys_sendmsg net/socket.c:2173 [inline] __se_sys_sendmsg net/socket.c:2171 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2171 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x456b29 Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f9706bf8c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f9706bf96d4 RCX: 0000000000456b29 RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000004 RBP: 00000000009300a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000004d3548 R14: 00000000004c8227 R15: 0000000000000000 Fixes: 87002b03baab ("net: introduce vlan_vid_[add/del] and use them instead of direct [add/kill]_vid ndo calls") Signed-off-by: Ido Schimmel Reported-and-tested-by: syzbot+bd051aba086537515cdb@syzkaller.appspotmail.com Signed-off-by: David S. Miller --- drivers/net/team/team.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 6a047d30e8c6..d887016e54b6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1167,6 +1167,12 @@ static int team_port_add(struct team *team, struct net_device *port_dev, return -EBUSY; } + if (dev == port_dev) { + NL_SET_ERR_MSG(extack, "Cannot enslave team device to itself"); + netdev_err(dev, "Cannot enslave team device to itself\n"); + return -EINVAL; + } + if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); From dbe80d446c859873820eedfff4abc61c71f1927b Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 5 Oct 2018 00:26:00 -0400 Subject: [PATCH 464/499] bnxt_en: Fix VNIC reservations on the PF. The enables bit for VNIC was set wrong when calling the HWRM_FUNC_CFG firmware call to reserve VNICs. This has the effect that the firmware will keep a large number of VNICs for the PF, and having very few for VFs. DPDK driver running on the VFs, which requires more VNICs, may not work properly as a result. Fixes: 674f50a5b026 ("bnxt_en: Implement new method to reserve rings.") Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0478e562abac..2564a92dcb02 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4650,7 +4650,7 @@ __bnxt_hwrm_reserve_pf_rings(struct bnxt *bp, struct hwrm_func_cfg_input *req, FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS : 0; enables |= ring_grps ? FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS : 0; - enables |= vnics ? FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS : 0; + enables |= vnics ? FUNC_CFG_REQ_ENABLES_NUM_VNICS : 0; req->num_rx_rings = cpu_to_le16(rx_rings); req->num_hw_ring_grps = cpu_to_le16(ring_grps); From 5db0e0969af6501ad45fe0494039d3b9c797822b Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 5 Oct 2018 00:26:01 -0400 Subject: [PATCH 465/499] bnxt_en: Fix enables field in HWRM_QUEUE_COS2BW_CFG request In HWRM_QUEUE_COS2BW_CFG request, enables field should have the bits set only for the queue ids which are having the valid parameters. This causes firmware to return error when the TC to hardware CoS queue mapping is not 1:1 during DCBNL ETS setup. Fixes: 2e8ef77ee0ff ("bnxt_en: Add TC to hardware QoS queue mapping logic.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c index ddc98c359488..a85d2be986af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c @@ -98,13 +98,13 @@ static int bnxt_hwrm_queue_cos2bw_cfg(struct bnxt *bp, struct ieee_ets *ets, bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_COS2BW_CFG, -1, -1); for (i = 0; i < max_tc; i++) { - u8 qidx; + u8 qidx = bp->tc_to_qidx[i]; req.enables |= cpu_to_le32( - QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID << i); + QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID << + qidx); memset(&cos2bw, 0, sizeof(cos2bw)); - qidx = bp->tc_to_qidx[i]; cos2bw.queue_id = bp->q_info[qidx].queue_id; if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_STRICT) { cos2bw.tsa = From a2bf74f4e1b82395dad2b08d2a911d9151db71c1 Mon Sep 17 00:00:00 2001 From: Venkat Duvvuru Date: Fri, 5 Oct 2018 00:26:02 -0400 Subject: [PATCH 466/499] bnxt_en: free hwrm resources, if driver probe fails. When the driver probe fails, all the resources that were allocated prior to the failure must be freed. However, hwrm dma response memory is not getting freed. This patch fixes the problem described above. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Signed-off-by: Venkat Duvvuru Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2564a92dcb02..3718984a8185 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3017,10 +3017,11 @@ static void bnxt_free_hwrm_resources(struct bnxt *bp) { struct pci_dev *pdev = bp->pdev; - dma_free_coherent(&pdev->dev, PAGE_SIZE, bp->hwrm_cmd_resp_addr, - bp->hwrm_cmd_resp_dma_addr); - - bp->hwrm_cmd_resp_addr = NULL; + if (bp->hwrm_cmd_resp_addr) { + dma_free_coherent(&pdev->dev, PAGE_SIZE, bp->hwrm_cmd_resp_addr, + bp->hwrm_cmd_resp_dma_addr); + bp->hwrm_cmd_resp_addr = NULL; + } } static int bnxt_alloc_hwrm_resources(struct bnxt *bp) @@ -9057,6 +9058,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) bnxt_clear_int_mode(bp); init_err_pci_clean: + bnxt_free_hwrm_resources(bp); bnxt_cleanup_pci(bp); init_err_free: From c78fe058879bdea919d44f23e21da26f603e9166 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 5 Oct 2018 00:26:03 -0400 Subject: [PATCH 467/499] bnxt_en: get the reduced max_irqs by the ones used by RDMA When getting the max rings supported, get the reduced max_irqs by the ones used by RDMA. If the number MSIX is the limiting factor, this bug may cause the max ring count to be higher than it should be when RDMA driver is loaded and may result in ring allocation failures. Fixes: 30f529473ec9 ("bnxt_en: Do not modify max IRQ count after RDMA driver requests/frees IRQs.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3718984a8185..e2d92548226a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -8622,7 +8622,7 @@ static void _bnxt_get_max_rings(struct bnxt *bp, int *max_rx, int *max_tx, *max_tx = hw_resc->max_tx_rings; *max_rx = hw_resc->max_rx_rings; *max_cp = min_t(int, bnxt_get_max_func_cp_rings_for_en(bp), - hw_resc->max_irqs); + hw_resc->max_irqs - bnxt_get_ulp_msix_num(bp)); *max_cp = min_t(int, *max_cp, hw_resc->max_stat_ctxs); max_ring_grps = hw_resc->max_hw_ring_grps; if (BNXT_CHIP_TYPE_NITRO_A0(bp) && BNXT_PF(bp)) { From 17c357efe5eceebdc3971a48b3d4d61a03c1178b Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Fri, 28 Sep 2018 14:51:28 -0300 Subject: [PATCH 468/499] openvswitch: load NAT helper Load the respective NAT helper module if the flow uses it. Signed-off-by: Flavio Leitner Signed-off-by: David S. 
Miller --- net/openvswitch/conntrack.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 0aeb34c6389d..35ae64cbef33 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1312,6 +1312,10 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, rcu_assign_pointer(help->helper, helper); info->helper = helper; + + if (info->nat) + request_module("ip_nat_%s", name); + return 0; } From ca8931948344c485569b04821d1f6bcebccd376b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 4 Oct 2018 20:24:13 -0700 Subject: [PATCH 469/499] net: dsa: b53: Keep CPU port as tagged in all VLANs Commit c499696e7901 ("net: dsa: b53: Stop using dev->cpu_port incorrectly") was a bit too trigger happy in removing the CPU port from the VLAN membership because we rely on DSA to program the CPU port VLAN, which it does, except it does not bother itself with tagged/untagged and just usese untagged. Having the CPU port "follow" the user ports tagged/untagged is not great and does not allow for properly differentiating, so keep the CPU port tagged in all VLANs. Reported-by: Gerhard Wiesinger Fixes: c499696e7901 ("net: dsa: b53: Stop using dev->cpu_port incorrectly") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index d93c790bfbe8..ad534b90ef21 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1107,7 +1107,7 @@ void b53_vlan_add(struct dsa_switch *ds, int port, b53_get_vlan_entry(dev, vid, vl); vl->members |= BIT(port); - if (untagged) + if (untagged && !dsa_is_cpu_port(ds, port)) vl->untag |= BIT(port); else vl->untag &= ~BIT(port); @@ -1149,7 +1149,7 @@ int b53_vlan_del(struct dsa_switch *ds, int port, pvid = 0; } - if (untagged) + if (untagged && !dsa_is_cpu_port(ds, port)) vl->untag &= ~(BIT(port)); b53_set_vlan_entry(dev, vid, vl); From 9d2f67e43b73e8af7438be219b66a5de0cfa8bd9 Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Sat, 29 Sep 2018 15:41:27 +0000 Subject: [PATCH 470/499] net/packet: fix packet drop as of virtio gso When we use raw socket as the vhost backend, a packet from virito with gso offloading information, cannot be sent out in later validaton at xmit path, as we did not set correct skb->protocol which is further used for looking up the gso function. To fix this, we set this field according to virito hdr information. Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Jianfeng Tan Signed-off-by: David S. 
Miller --- include/linux/virtio_net.h | 18 ++++++++++++++++++ net/packet/af_packet.c | 11 +++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9397628a1967..cb462f9ab7dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -5,6 +5,24 @@ #include #include +static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_UDP: + skb->protocol = cpu_to_be16(ETH_P_IP); + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb->protocol = cpu_to_be16(ETH_P_IPV6); + break; + default: + return -EINVAL; + } + + return 0; +} + static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..d6e94dc7e290 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2715,10 +2715,12 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } } - if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr, - vio_le())) { - tp_len = -EINVAL; - goto tpacket_error; + if (po->has_vnet_hdr) { + if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { + tp_len = -EINVAL; + goto tpacket_error; + } + virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; @@ -2915,6 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; len += sizeof(vnet_hdr); + virtio_net_hdr_set_proto(skb, &vnet_hdr); } skb_probe_transport_header(skb, reserve); From 2d52527e80c2dc0c5f43f50adf183781262ec565 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 3 Oct 2018 15:20:58 +0200 Subject: [PATCH 471/499] be2net: don't flip hw_features when VXLANs are added/deleted the be2net implementation of .ndo_tunnel_{add,del}() changes the value of NETIF_F_GSO_UDP_TUNNEL bit in 'features' and 'hw_features', but it forgets to call netdev_features_change(). Moreover, ethtool setting for that bit can potentially be reverted after a tunnel is added or removed. GSO already does software segmentation when 'hw_enc_features' is 0, even if VXLAN offload is turned on. In addition, commit 096de2f83ebc ("benet: stricter vxlan offloading check in be_features_check") avoids hardware segmentation of non-VXLAN tunneled packets, or VXLAN packets having wrong destination port. So, it's safe to avoid flipping the above feature on addition/deletion of VXLAN tunnels. Fixes: 630f4b70567f ("be2net: Export tunnel offloads only when a VxLAN tunnel is created") Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 74d122616e76..534787291b44 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4002,8 +4002,6 @@ static int be_enable_vxlan_offloads(struct be_adapter *adapter) netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_UDP_TUNNEL; - netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL; - netdev->features |= NETIF_F_GSO_UDP_TUNNEL; dev_info(dev, "Enabled VxLAN offloads for UDP port %d\n", be16_to_cpu(port)); @@ -4025,8 +4023,6 @@ static void be_disable_vxlan_offloads(struct be_adapter *adapter) adapter->vxlan_port = 0; netdev->hw_enc_features = 0; - netdev->hw_features &= ~(NETIF_F_GSO_UDP_TUNNEL); - netdev->features &= ~(NETIF_F_GSO_UDP_TUNNEL); } static void be_calculate_vf_res(struct be_adapter *adapter, u16 num_vfs, @@ -5320,6 +5316,7 @@ static void be_netdev_init(struct net_device *netdev) struct be_adapter *adapter = netdev_priv(netdev); netdev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | + NETIF_F_GSO_UDP_TUNNEL | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX; if ((be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS)) From 7e4183752735deb7543e179a44f4f4b44917cd6f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Wed, 3 Oct 2018 19:04:49 +0300 Subject: [PATCH 472/499] net: phy: phylink: fix SFP interface autodetection When connecting SFP PHY to phylink use the detected interface. Otherwise, the link fails to come up when the configured 'phy-mode' differs from the SFP detected mode. Move most of phylink_connect_phy() into __phylink_connect_phy(), and leave phylink_connect_phy() as a wrapper. phylink_sfp_connect_phy() can now pass the SFP detected PHY interface to __phylink_connect_phy(). This fixes 1GB SFP module link up on eth3 of the Macchiatobin board that is configured in the DT to "2500base-x" phy-mode. Fixes: 9525ae83959b6 ("phylink: add phylink infrastructure") Suggested-by: Russell King Signed-off-by: Baruch Siach Signed-off-by: David S. 
Miller --- drivers/net/phy/phylink.c | 48 +++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 3ba5cf2a8a5f..7abca86c3aa9 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -717,6 +717,30 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy) return 0; } +static int __phylink_connect_phy(struct phylink *pl, struct phy_device *phy, + phy_interface_t interface) +{ + int ret; + + if (WARN_ON(pl->link_an_mode == MLO_AN_FIXED || + (pl->link_an_mode == MLO_AN_INBAND && + phy_interface_mode_is_8023z(interface)))) + return -EINVAL; + + if (pl->phydev) + return -EBUSY; + + ret = phy_attach_direct(pl->netdev, phy, 0, interface); + if (ret) + return ret; + + ret = phylink_bringup_phy(pl, phy); + if (ret) + phy_detach(phy); + + return ret; +} + /** * phylink_connect_phy() - connect a PHY to the phylink instance * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -734,31 +758,13 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy) */ int phylink_connect_phy(struct phylink *pl, struct phy_device *phy) { - int ret; - - if (WARN_ON(pl->link_an_mode == MLO_AN_FIXED || - (pl->link_an_mode == MLO_AN_INBAND && - phy_interface_mode_is_8023z(pl->link_interface)))) - return -EINVAL; - - if (pl->phydev) - return -EBUSY; - /* Use PHY device/driver interface */ if (pl->link_interface == PHY_INTERFACE_MODE_NA) { pl->link_interface = phy->interface; pl->link_config.interface = pl->link_interface; } - ret = phy_attach_direct(pl->netdev, phy, 0, pl->link_interface); - if (ret) - return ret; - - ret = phylink_bringup_phy(pl, phy); - if (ret) - phy_detach(phy); - - return ret; + return __phylink_connect_phy(pl, phy, pl->link_interface); } EXPORT_SYMBOL_GPL(phylink_connect_phy); @@ -1672,7 +1678,9 @@ static void phylink_sfp_link_up(void *upstream) static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) { - return phylink_connect_phy(upstream, phy); + struct phylink *pl = upstream; + + return __phylink_connect_phy(upstream, phy, pl->link_config.interface); } static void phylink_sfp_disconnect_phy(void *upstream) From b3e9b515b08e407ab3a026dc2e4d935c48d05f69 Mon Sep 17 00:00:00 2001 From: "Singh, Brijesh" Date: Thu, 4 Oct 2018 21:40:23 +0000 Subject: [PATCH 473/499] iommu/amd: Clear memory encryption mask from physical address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boris Ostrovsky reported a memory leak with device passthrough when SME is active. The VFIO driver uses iommu_iova_to_phys() to get the physical address for an iova. This physical address is later passed into vfio_unmap_unpin() to unpin the memory. The vfio_unmap_unpin() uses pfn_valid() before unpinning the memory. The pfn_valid() check was failing because encryption mask was part of the physical address returned. This resulted in the memory not being unpinned and therefore leaked after the guest terminates. The memory encryption mask must be cleared from the physical address in iommu_iova_to_phys(). 
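To see why the stale encryption bit matters to VFIO, note that pfn_valid() is applied to the pfn derived from the address returned by iommu_iova_to_phys(); with the C-bit (a high physical-address bit) still set, that pfn lies far outside real memory, the check fails, and the pages are silently left pinned. A simplified, hypothetical sketch of the caller-side effect (put_page() stands in for the driver's actual unpin path):

  	phys_addr_t phys = iommu_iova_to_phys(domain, iova);
  	unsigned long pfn = phys >> PAGE_SHIFT;

  	/* With sme_me_mask still set in phys, this test fails ... */
  	if (pfn_valid(pfn))
  		put_page(pfn_to_page(pfn));	/* ... so the page is never released */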
Fixes: 2543a786aa25 ("iommu/amd: Allow the AMD IOMMU to work with memory encryption") Reported-by: Boris Ostrovsky Cc: Tom Lendacky Cc: Joerg Roedel Cc: Cc: Borislav Petkov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: kvm@vger.kernel.org Cc: Boris Ostrovsky Cc: # 4.14+ Signed-off-by: Brijesh Singh Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 73e47d93e7a0..bee0dfb7b93b 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3069,7 +3069,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, return 0; offset_mask = pte_pgsize - 1; - __pte = *pte & PM_ADDR_MASK; + __pte = __sme_clr(*pte & PM_ADDR_MASK); return (__pte & ~offset_mask) | (iova & offset_mask); } From 9ce7610e6d201e3923c0b2f454f2e1d54f5da49e Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 1 Oct 2018 14:49:05 +0300 Subject: [PATCH 474/499] i2c: designware: Call i2c_dw_clk_rate() only when calculating timings There are platforms which don't provide input clock rate but provide I2C timing parameters. Commit 3bd4f277274b ("i2c: designware: Call i2c_dw_clk_rate() only once in i2c_dw_init_master()") causes needless warning during probe on those platforms since i2c_dw_clk_rate(), which causes the warning when input clock is unknown, is called even when there is no need to calculate timing parameters. Fixes: 3bd4f277274b ("i2c: designware: Call i2c_dw_clk_rate() only once in i2c_dw_init_master()") Reported-by: Ard Biesheuvel Cc: # 4.19 Signed-off-by: Jarkko Nikula Tested-by: Ard Biesheuvel Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 94d94b4a9a0d..18cc324f3ca9 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -34,11 +34,11 @@ static void i2c_dw_configure_fifo_master(struct dw_i2c_dev *dev) static int i2c_dw_set_timings_master(struct dw_i2c_dev *dev) { - u32 ic_clk = i2c_dw_clk_rate(dev); const char *mode_str, *fp_str = ""; u32 comp_param1; u32 sda_falling_time, scl_falling_time; struct i2c_timings *t = &dev->timings; + u32 ic_clk; int ret; ret = i2c_dw_acquire_lock(dev); @@ -53,6 +53,7 @@ static int i2c_dw_set_timings_master(struct dw_i2c_dev *dev) /* Calculate SCL timing parameters for standard mode if not set */ if (!dev->ss_hcnt || !dev->ss_lcnt) { + ic_clk = i2c_dw_clk_rate(dev); dev->ss_hcnt = i2c_dw_scl_hcnt(ic_clk, 4000, /* tHD;STA = tHIGH = 4.0 us */ @@ -89,6 +90,7 @@ static int i2c_dw_set_timings_master(struct dw_i2c_dev *dev) * needed also in high speed mode. */ if (!dev->fs_hcnt || !dev->fs_lcnt) { + ic_clk = i2c_dw_clk_rate(dev); dev->fs_hcnt = i2c_dw_scl_hcnt(ic_clk, 600, /* tHD;STA = tHIGH = 0.6 us */ From a932ed3b718147c6537da290b7a91e990fdedb43 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 5 Oct 2018 16:43:55 +1000 Subject: [PATCH 475/499] powerpc: Don't print kernel instructions in show_user_instructions() Recently we implemented show_user_instructions() which dumps the code around the NIP when a user space process dies with an unhandled signal. This was modelled on the x86 code, and we even went so far as to implement the exact same bug, namely that if the user process crashed with its NIP pointing into the kernel we will dump kernel text to dmesg. 
eg: bad-bctr[2996]: segfault (11) at c000000000010000 nip c000000000010000 lr 12d0b0894 code 1 bad-bctr[2996]: code: fbe10068 7cbe2b78 7c7f1b78 fb610048 38a10028 38810020 fb810050 7f8802a6 bad-bctr[2996]: code: 3860001c f8010080 48242371 60000000 <7c7b1b79> 4082002c e8010080 eb610048 This was discovered on x86 by Jann Horn and fixed in commit 342db04ae712 ("x86/dumpstack: Don't dump kernel memory based on usermode RIP"). Fix it by checking the adjusted NIP value (pc) and number of instructions against USER_DS, and bail if we fail the check, eg: bad-bctr[2969]: segfault (11) at c000000000010000 nip c000000000010000 lr 107930894 code 1 bad-bctr[2969]: Bad NIP, not dumping instructions. Fixes: 88b0fe175735 ("powerpc: Add show_user_instructions()") Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 913c5725cdb2..bb6ac471a784 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1306,6 +1306,16 @@ void show_user_instructions(struct pt_regs *regs) pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int)); + /* + * Make sure the NIP points at userspace, not kernel text/data or + * elsewhere. + */ + if (!__access_ok(pc, instructions_to_print * sizeof(int), USER_DS)) { + pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", + current->comm, current->pid); + return; + } + pr_info("%s[%d]: code: ", current->comm, current->pid); for (i = 0; i < instructions_to_print; i++) { From ac1788cc7da4ce54edcfd2e499afdb0a23d5c41d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 28 Sep 2018 09:17:32 +0530 Subject: [PATCH 476/499] powerpc/numa: Skip onlining a offline node in kdump path With commit 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot"), kdump kernel on shared LPAR may crash. The necessary conditions are - Shared LPAR with at least 2 nodes having memory and CPUs. - Memory requirement for kdump kernel must be met by the first N-1 nodes where there are at least N nodes with memory and CPUs. Example numactl of such a machine. $ numactl -H available: 5 nodes (0,2,5-7) node 0 cpus: node 0 size: 0 MB node 0 free: 0 MB node 2 cpus: node 2 size: 255 MB node 2 free: 189 MB node 5 cpus: 24 25 26 27 28 29 30 31 node 5 size: 4095 MB node 5 free: 4024 MB node 6 cpus: 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 node 6 size: 6353 MB node 6 free: 5998 MB node 7 cpus: 8 9 10 11 12 13 14 15 32 33 34 35 36 37 38 39 node 7 size: 7640 MB node 7 free: 7164 MB node distances: node 0 2 5 6 7 0: 10 40 40 40 40 2: 40 10 40 40 40 5: 40 40 10 40 40 6: 40 40 40 10 20 7: 40 40 40 20 10 Steps to reproduce. 1. Load / start kdump service. 2. Trigger a kdump (for example : echo c > /proc/sysrq-trigger) When booting a kdump kernel with 2048M: kexec: Starting switchover sequence. 
I'm in purgatory Using 1TB segments hash-mmu: Initializing hash mmu with SLB Linux version 4.19.0-rc5-master+ (srikar@linux-xxu6) (gcc version 4.8.5 (SUSE Linux)) #1 SMP Thu Sep 27 19:45:00 IST 2018 Found initrd at 0xc000000009e70000:0xc00000000ae554b4 Using pSeries machine description ----------------------------------------------------- ppc64_pft_size = 0x1e phys_mem_size = 0x88000000 dcache_bsize = 0x80 icache_bsize = 0x80 cpu_features = 0x000000ff8f5d91a7 possible = 0x0000fbffcf5fb1a7 always = 0x0000006f8b5c91a1 cpu_user_features = 0xdc0065c2 0xef000000 mmu_features = 0x7c006001 firmware_features = 0x00000007c45bfc57 htab_hash_mask = 0x7fffff physical_start = 0x8000000 ----------------------------------------------------- numa: NODE_DATA [mem 0x87d5e300-0x87d67fff] numa: NODE_DATA(0) on node 6 numa: NODE_DATA [mem 0x87d54600-0x87d5e2ff] Top of RAM: 0x88000000, Total RAM: 0x88000000 Memory hole size: 0MB Zone ranges: DMA [mem 0x0000000000000000-0x0000000087ffffff] DMA32 empty Normal empty Movable zone start for each node Early memory node ranges node 6: [mem 0x0000000000000000-0x0000000087ffffff] Could not find start_pfn for node 0 Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000] On node 0 totalpages: 0 Initmem setup node 6 [mem 0x0000000000000000-0x0000000087ffffff] On node 6 totalpages: 34816 Unable to handle kernel paging request for data at address 0x00000060 Faulting instruction address: 0xc000000008703a54 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 11 PID: 1 Comm: swapper/11 Not tainted 4.19.0-rc5-master+ #1 NIP: c000000008703a54 LR: c000000008703a38 CTR: 0000000000000000 REGS: c00000000b673440 TRAP: 0380 Not tainted (4.19.0-rc5-master+) MSR: 8000000002009033 CR: 24022022 XER: 20000002 CFAR: c0000000086fc238 IRQMASK: 0 GPR00: c000000008703a38 c00000000b6736c0 c000000009281900 0000000000000000 GPR04: 0000000000000000 0000000000000000 fffffffffffff001 c00000000b660080 GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000220 GPR12: 0000000000002200 c000000009e51400 0000000000000000 0000000000000008 GPR16: 0000000000000000 c000000008c152e8 c000000008c152a8 0000000000000000 GPR20: c000000009422fd8 c000000009412fd8 c000000009426040 0000000000000008 GPR24: 0000000000000000 0000000000000000 c000000009168bc8 c000000009168c78 GPR28: c00000000b126410 0000000000000000 c00000000916a0b8 c00000000b126400 NIP [c000000008703a54] bus_add_device+0x84/0x1e0 LR [c000000008703a38] bus_add_device+0x68/0x1e0 Call Trace: [c00000000b6736c0] [c000000008703a38] bus_add_device+0x68/0x1e0 (unreliable) [c00000000b673740] [c000000008700194] device_add+0x454/0x7c0 [c00000000b673800] [c00000000872e660] __register_one_node+0xb0/0x240 [c00000000b673860] [c00000000839a6bc] __try_online_node+0x12c/0x180 [c00000000b673900] [c00000000839b978] try_online_node+0x58/0x90 [c00000000b673930] [c0000000080846d8] find_and_online_cpu_nid+0x158/0x190 [c00000000b673a10] [c0000000080848a0] numa_update_cpu_topology+0x190/0x580 [c00000000b673c00] [c000000008d3f2e4] smp_cpus_done+0x94/0x108 [c00000000b673c70] [c000000008d5c00c] smp_init+0x174/0x19c [c00000000b673d00] [c000000008d346b8] kernel_init_freeable+0x1e0/0x450 [c00000000b673dc0] [c0000000080102e8] kernel_init+0x28/0x160 [c00000000b673e30] [c00000000800b65c] ret_from_kernel_thread+0x5c/0x80 Instruction dump: 60000000 60000000 e89e0020 7fe3fb78 4bff87d5 60000000 7c7d1b79 4082008c e8bf0050 e93e0098 3b9f0010 2fa50000 38630018 419e0114 7f84e378 ---[ end trace 593577668c2daa65 ]--- However a 
regular kernel with 4096M (2048 gets reserved for crash kernel) boots properly. Unlike regular kernels, which mark all available nodes as online, kdump kernel only marks just enough nodes as online and marks the rest as offline at boot. However kdump kernel boots with all available CPUs. With Commit 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot"), all CPUs are onlined on their respective nodes at boot time. try_online_node() tries to online the offline nodes but fails as all needed subsystems are not yet initialized. As part of fix, detect and skip early onlining of a offline node. Fixes: 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot") Reported-by: Pavithra Prakash Signed-off-by: Srikar Dronamraju Tested-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 59d07bd5374a..055b211b7126 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1217,9 +1217,10 @@ int find_and_online_cpu_nid(int cpu) * Need to ensure that NODE_DATA is initialized for a node from * available memory (see memblock_alloc_try_nid). If unable to * init the node, then default to nearest node that has memory - * installed. + * installed. Skip onlining a node if the subsystems are not + * yet initialized. */ - if (try_online_node(new_nid)) + if (!topology_inited || try_online_node(new_nid)) new_nid = first_online_node; #else /* From b799207e1e1816b09e7a5920fbb2d5fcf6edd681 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 18:17:59 +0200 Subject: [PATCH 477/499] bpf: 32-bit RSH verification must truncate input before the ALU op When I wrote commit 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification"), I assumed that, in order to emulate 64-bit arithmetic with 32-bit logic, it is sufficient to just truncate the output to 32 bits; and so I just moved the register size coercion that used to be at the start of the function to the end of the function. That assumption is true for almost every op, but not for 32-bit right shifts, because those can propagate information towards the least significant bit. Fix it by always truncating inputs for 32-bit ops to 32 bits. Also get rid of the coerce_reg_to_size() after the ALU op, since that has no effect. Fixes: 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification") Acked-by: Daniel Borkmann Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb07e74b34a2..465952a8e465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. 
+ */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); From 2c05d88818ab6571816b93edce4d53703870d7ae Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 5 Oct 2018 08:48:27 -0500 Subject: [PATCH 478/499] net: cxgb3_main: fix a missing-check bug In cxgb_extension_ioctl(), the command of the ioctl is firstly copied from the user-space buffer 'useraddr' to 'cmd' and checked through the switch statement. If the command is not as expected, an error code EOPNOTSUPP is returned. In the following execution, i.e., the cases of the switch statement, the whole buffer of 'useraddr' is copied again to a specific data structure, according to what kind of command is requested. However, after the second copy, there is no re-check on the newly-copied command. Given that the buffer 'useraddr' is in the user space, a malicious user can race to change the command between the two copies. By doing so, the attacker can supply malicious data to the kernel and cause undefined behavior. This patch adds a re-check in each case of the switch statement if there is a second copy in that case, to re-check whether the command obtained in the second copy is the same as the one in the first copy. If not, an error code EINVAL is returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index a19172dbe6be..c34ea385fe4a 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2159,6 +2159,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EPERM; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_SET_QSET_PARAMS) + return -EINVAL; if (t.qset_idx >= SGE_QSETS) return -EINVAL; if (!in_range(t.intr_lat, 0, M_NEWTIMER) || @@ -2258,6 +2260,9 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_GET_QSET_PARAMS) + return -EINVAL; + /* Display qsets for all ports when offload enabled */ if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) { q1 = 0; @@ -2303,6 +2308,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EBUSY; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; + if (edata.cmd != CHELSIO_SET_QSET_NUM) + return -EINVAL; if (edata.val < 1 || (edata.val > 1 && !(adapter->flags & USING_MSIX))) return -EINVAL; @@ -2343,6 +2350,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EPERM; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_LOAD_FW) + return -EINVAL; /* Check t.len sanity ? 
*/ fw_data = memdup_user(useraddr + sizeof(t), t.len); if (IS_ERR(fw_data)) @@ -2366,6 +2375,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EBUSY; if (copy_from_user(&m, useraddr, sizeof(m))) return -EFAULT; + if (m.cmd != CHELSIO_SETMTUTAB) + return -EINVAL; if (m.nmtus != NMTUS) return -EINVAL; if (m.mtus[0] < 81) /* accommodate SACK */ @@ -2407,6 +2418,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EBUSY; if (copy_from_user(&m, useraddr, sizeof(m))) return -EFAULT; + if (m.cmd != CHELSIO_SET_PM) + return -EINVAL; if (!is_power_of_2(m.rx_pg_sz) || !is_power_of_2(m.tx_pg_sz)) return -EINVAL; /* not power of 2 */ @@ -2440,6 +2453,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EIO; /* need the memory controllers */ if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_GET_MEM) + return -EINVAL; if ((t.addr & 7) || (t.len & 7)) return -EINVAL; if (t.mem_id == MEM_CM) @@ -2492,6 +2507,8 @@ static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr) return -EAGAIN; if (copy_from_user(&t, useraddr, sizeof(t))) return -EFAULT; + if (t.cmd != CHELSIO_SET_TRACE_FILTER) + return -EINVAL; tp = (const struct trace_params *)&t.sip; if (t.config_tx) From 33aa8da1f8a7dc050b9d68f1db761ab787621065 Mon Sep 17 00:00:00 2001 From: Shanthosh RK Date: Fri, 5 Oct 2018 20:57:48 +0530 Subject: [PATCH 479/499] net: bpfilter: Fix type cast and pointer warnings Fixes the following Sparse warnings: net/bpfilter/bpfilter_kern.c:62:21: warning: cast removes address space of expression net/bpfilter/bpfilter_kern.c:101:49: warning: Using plain integer as NULL pointer Signed-off-by: Shanthosh RK Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index f0fc182d3db7..b64e1649993b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -59,7 +59,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname, req.is_set = is_set; req.pid = current->pid; req.cmd = optname; - req.addr = (long)optval; + req.addr = (long __force __user)optval; req.len = optlen; mutex_lock(&bpfilter_lock); if (!info.pid) @@ -98,7 +98,7 @@ static int __init load_umh(void) pr_info("Loaded bpfilter_umh pid %d\n", info.pid); /* health check that usermode process started correctly */ - if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) { + if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) { stop_umh(); return -EFAULT; } From 0781168e23a2fc8dceb989f11fc5b39b3ccacc35 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 5 Oct 2018 10:59:36 -0500 Subject: [PATCH 480/499] yam: fix a missing-check bug In yam_ioctl(), the concrete ioctl command is firstly copied from the user-space buffer 'ifr->ifr_data' to 'ioctl_cmd' and checked through the following switch statement. If the command is not as expected, an error code EINVAL is returned. In the following execution the buffer 'ifr->ifr_data' is copied again in the cases of the switch statement to specific data structures according to what kind of ioctl command is requested. However, after the second copy, no re-check is enforced on the newly-copied command. Given that the buffer 'ifr->ifr_data' is in the user space, a malicious user can race to change the command between the two copies. 
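In rough form, the double-fetch pattern being closed here looks like this (a simplified sketch, not the exact yam.c code; only the added ym->cmd re-check is taken from the patch):

	int ioctl_cmd;
	struct yamdrv_ioctl_mcs *ym;

	/* first fetch: only the command word is read and validated */
	if (copy_from_user(&ioctl_cmd, ifr->ifr_data, sizeof(int)))
		return -EFAULT;

	switch (ioctl_cmd) {
	case SIOCYAMSMCS:
		/* second fetch: the whole structure, command word included,
		 * is copied again; user space may have changed cmd in the
		 * meantime, so it must be re-checked after this copy.
		 */
		ym = memdup_user(ifr->ifr_data, sizeof(struct yamdrv_ioctl_mcs));
		if (IS_ERR(ym))
			return PTR_ERR(ym);
		if (ym->cmd != SIOCYAMSMCS) {	/* the re-check added by this patch */
			kfree(ym);
			return -EINVAL;
		}
		/* ... program the device with ym ... */
		kfree(ym);
		break;
	}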
This way, the attacker can inject inconsistent data and cause undefined behavior. This patch adds a re-check in each case of the switch statement if there is a second copy in that case, to re-check whether the command obtained in the second copy is the same as the one in the first copy. If not, an error code EINVAL will be returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- drivers/net/hamradio/yam.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 16ec7af6ab7b..ba9df430fca6 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -966,6 +966,8 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) sizeof(struct yamdrv_ioctl_mcs)); if (IS_ERR(ym)) return PTR_ERR(ym); + if (ym->cmd != SIOCYAMSMCS) + return -EINVAL; if (ym->bitrate > YAM_MAXBITRATE) { kfree(ym); return -EINVAL; @@ -981,6 +983,8 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (copy_from_user(&yi, ifr->ifr_data, sizeof(struct yamdrv_ioctl_cfg))) return -EFAULT; + if (yi.cmd != SIOCYAMSCFG) + return -EINVAL; if ((yi.cfg.mask & YAM_IOBASE) && netif_running(dev)) return -EINVAL; /* Cannot change this parameter when up */ if ((yi.cfg.mask & YAM_IRQ) && netif_running(dev)) From bd961c9bc66497f0c63f4ba1d02900bb85078366 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 1 Oct 2018 22:46:40 -0300 Subject: [PATCH 481/499] rtnetlink: fix rtnl_fdb_dump() for ndmsg header Currently, rtnl_fdb_dump() assumes the family header is 'struct ifinfomsg', which is not always true -- 'struct ndmsg' is used by iproute2 ('ip neigh'). The problem is, the function bails out early if nlmsg_parse() fails, which does occur for iproute2 usage of 'struct ndmsg' because the payload length is shorter than the family header alone (as 'struct ifinfomsg' is assumed). This breaks backward compatibility with userspace -- nothing is sent back. Some examples with iproute2 and netlink library for go [1]: 1) $ bridge fdb show 33:33:00:00:00:01 dev ens3 self permanent 01:00:5e:00:00:01 dev ens3 self permanent 33:33:ff:15:98:30 dev ens3 self permanent This one works, as it uses 'struct ifinfomsg'. fdb_show() @ iproute2/bridge/fdb.c """ .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), ... if (rtnl_dump_request(&rth, RTM_GETNEIGH, [...] """ 2) $ ip --family bridge neigh RTNETLINK answers: Invalid argument Dump terminated This one fails, as it uses 'struct ndmsg'. do_show_or_flush() @ iproute2/ip/ipneigh.c """ .n.nlmsg_type = RTM_GETNEIGH, .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), """ 3) $ ./neighlist < no output > This one fails, as it uses 'struct ndmsg'-based. neighList() @ netlink/neigh_linux.go """ req := h.newNetlinkRequest(unix.RTM_GETNEIGH, [...] msg := Ndmsg{ """ The actual breakage was introduced by commit 0ff50e83b512 ("net: rtnetlink: bail out from rtnl_fdb_dump() on parse error"), because nlmsg_parse() fails if the payload length (with the _actual_ family header) is less than the family header length alone (which is assumed, in parameter 'hdrlen'). This is true in the examples above with struct ndmsg, with size and payload length shorter than struct ifinfomsg. However, that commit just intends to fix something under the assumption the family header is indeed an 'struct ifinfomsg' - by preventing access to the payload as such (via 'ifm' pointer) if the payload length is not sufficient to actually contain it. 
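For reference, the header-size mismatch driving that failure can be checked from user space (a minimal sketch; the 12- and 16-byte figures are what common builds produce):

	#include <stdio.h>
	#include <linux/rtnetlink.h>	/* struct ifinfomsg */
	#include <linux/neighbour.h>	/* struct ndmsg */

	int main(void)
	{
		/* Typically prints 12 and 16: an ndmsg-sized payload is shorter
		 * than the ifinfomsg header nlmsg_parse() is told to expect,
		 * so the parse fails before any attributes are even looked at.
		 */
		printf("ndmsg: %zu, ifinfomsg: %zu\n",
		       sizeof(struct ndmsg), sizeof(struct ifinfomsg));
		return 0;
	}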
The assumption was introduced by commit 5e6d24358799 ("bridge: netlink dump interface at par with brctl"), to support iproute2's 'bridge fdb' command (not 'ip neigh') which indeed uses 'struct ifinfomsg', thus is not broken. So, in order to unbreak the 'struct ndmsg' family headers and still allow 'struct ifinfomsg' to continue to work, check for the known message sizes used with 'struct ndmsg' in iproute2 (with zero or one attribute which is not used in this function anyway) then do not parse the data as ifinfomsg. Same examples with this patch applied (or revert/before the original fix): $ bridge fdb show 33:33:00:00:00:01 dev ens3 self permanent 01:00:5e:00:00:01 dev ens3 self permanent 33:33:ff:15:98:30 dev ens3 self permanent $ ip --family bridge neigh dev ens3 lladdr 33:33:00:00:00:01 PERMANENT dev ens3 lladdr 01:00:5e:00:00:01 PERMANENT dev ens3 lladdr 33:33:ff:15:98:30 PERMANENT $ ./neighlist netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x33, 0x33, 0x0, 0x0, 0x0, 0x1}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x1, 0x0, 0x5e, 0x0, 0x0, 0x1}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} netlink.Neigh{LinkIndex:2, Family:7, State:128, Type:0, Flags:2, IP:net.IP(nil), HardwareAddr:net.HardwareAddr{0x33, 0x33, 0xff, 0x15, 0x98, 0x30}, LLIPAddr:net.IP(nil), Vlan:0, VNI:0} Tested on mainline (v4.19-rc6) and net-next (3bd09b05b068). References: [1] netlink library for go (test-case) https://github.com/vishvananda/netlink $ cat ~/go/src/neighlist/main.go package main import ("fmt"; "syscall"; "github.com/vishvananda/netlink") func main() { neighs, _ := netlink.NeighList(0, syscall.AF_BRIDGE) for _, neigh := range neighs { fmt.Printf("%#v\n", neigh) } } $ export GOPATH=~/go $ go get github.com/vishvananda/netlink $ go build neighlist $ ~/go/src/neighlist/neighlist Thanks to David Ahern for suggestions to improve this patch. Fixes: 0ff50e83b512 ("net: rtnetlink: bail out from rtnl_fdb_dump() on parse error") Fixes: 5e6d24358799 ("bridge: netlink dump interface at par with brctl") Reported-by: Aidan Obley Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 448703312fed..37c7936124e6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3748,16 +3748,27 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL); - if (err < 0) { - return -EINVAL; - } else if (err == 0) { - if (tb[IFLA_MASTER]) - br_idx = nla_get_u32(tb[IFLA_MASTER]); - } + /* A hack to preserve kernel<->userspace interface. + * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. + * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. + * So, check for ndmsg with an optional u32 attribute (not used here). + * Fortunately these sizes don't conflict with the size of ifinfomsg + * with an optional attribute. 
+ */ + if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) && + (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) + + nla_attr_size(sizeof(u32)))) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { + if (tb[IFLA_MASTER]) + br_idx = nla_get_u32(tb[IFLA_MASTER]); + } - brport_idx = ifm->ifi_index; + brport_idx = ifm->ifi_index; + } if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); From 8b4c3cdd9dd8290343ce959a132d3b334062c5b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 3 Oct 2018 15:05:36 -0700 Subject: [PATCH 482/499] net: sched: Add policy validation for tc attributes A number of TC attributes are processed without proper validation (e.g., length checks). Add a tca policy for all input attributes and use when invoking nlmsg_parse. The 2 Fixes tags below cover the latest additions. The other attributes are a string (KIND), nested attribute (OPTIONS which does seem to have validation in most cases), for dumps only or a flag. Fixes: 5bc1701881e39 ("net: sched: introduce multichain support for filters") Fixes: d47a6b0e7c492 ("net: sched: introduce ingress/egress block index attributes for qdisc") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/sched/sch_api.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 98541c6399db..85e73f48e48f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1311,6 +1311,18 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. */ +const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { + [TCA_KIND] = { .type = NLA_STRING }, + [TCA_OPTIONS] = { .type = NLA_NESTED }, + [TCA_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct tc_estimator) }, + [TCA_STAB] = { .type = NLA_NESTED }, + [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, + [TCA_CHAIN] = { .type = NLA_U32 }, + [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, + [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, +}; + static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -1327,7 +1339,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1411,7 +1424,8 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, replay: /* Reinit, just in case something touches this. 
*/ - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; @@ -1645,7 +1659,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) idx = 0; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, + rtm_tca_policy, NULL); if (err < 0) return err; @@ -1864,7 +1879,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + extack); if (err < 0) return err; From a688caa34beb2fd2a92f1b6d33e40cde433ba160 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 4 Oct 2018 10:12:37 -0700 Subject: [PATCH 483/499] ipv6: take rcu lock in rawv6_send_hdrinc() In rawv6_send_hdrinc(), in order to avoid an extra dst_hold(), we directly assign the dst to skb and set passed in dst to NULL to avoid double free. However, in error case, we free skb and then do stats update with the dst pointer passed in. This causes use-after-free on the dst. Fix it by taking rcu read lock right before dst could get released to make sure dst does not get freed until the stats update is done. Note: we don't have this issue in ipv4 cause dst is not used for stats update in v4. Syzkaller reported following crash: BUG: KASAN: use-after-free in rawv6_send_hdrinc net/ipv6/raw.c:692 [inline] BUG: KASAN: use-after-free in rawv6_sendmsg+0x4421/0x4630 net/ipv6/raw.c:921 Read of size 8 at addr ffff8801d95ba730 by task syz-executor0/32088 CPU: 1 PID: 32088 Comm: syz-executor0 Not tainted 4.19.0-rc2+ #93 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 rawv6_send_hdrinc net/ipv6/raw.c:692 [inline] rawv6_sendmsg+0x4421/0x4630 net/ipv6/raw.c:921 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:631 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2114 __sys_sendmsg+0x11d/0x280 net/socket.c:2152 __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg net/socket.c:2159 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2159 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457099 Code: fd b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f83756edc78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f83756ee6d4 RCX: 0000000000457099 RDX: 0000000000000000 RSI: 0000000020003840 RDI: 0000000000000004 RBP: 00000000009300a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000004d4b30 R14: 00000000004c90b1 R15: 0000000000000000 Allocated by task 32088: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553 
kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490 kmem_cache_alloc+0x12e/0x730 mm/slab.c:3554 dst_alloc+0xbb/0x1d0 net/core/dst.c:105 ip6_dst_alloc+0x35/0xa0 net/ipv6/route.c:353 ip6_rt_cache_alloc+0x247/0x7b0 net/ipv6/route.c:1186 ip6_pol_route+0x8f8/0xd90 net/ipv6/route.c:1895 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2093 fib6_rule_lookup+0x277/0x860 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2121 ip6_route_output include/net/ip6_route.h:88 [inline] ip6_dst_lookup_tail+0xe27/0x1d60 net/ipv6/ip6_output.c:951 ip6_dst_lookup_flow+0xc8/0x270 net/ipv6/ip6_output.c:1079 rawv6_sendmsg+0x12d9/0x4630 net/ipv6/raw.c:905 inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:631 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2114 __sys_sendmsg+0x11d/0x280 net/socket.c:2152 __do_sys_sendmsg net/socket.c:2161 [inline] __se_sys_sendmsg net/socket.c:2159 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2159 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 5356: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kmem_cache_free+0x83/0x290 mm/slab.c:3756 dst_destroy+0x267/0x3c0 net/core/dst.c:141 dst_destroy_rcu+0x16/0x19 net/core/dst.c:154 __rcu_reclaim kernel/rcu/rcu.h:236 [inline] rcu_do_batch kernel/rcu/tree.c:2576 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2880 [inline] __rcu_process_callbacks kernel/rcu/tree.c:2847 [inline] rcu_process_callbacks+0xf23/0x2670 kernel/rcu/tree.c:2864 __do_softirq+0x30b/0xad8 kernel/softirq.c:292 Fixes: 1789a640f556 ("raw: avoid two atomics in xmit") Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/raw.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 413d98bf24f4..5e0efd3954e9 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -651,8 +651,6 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; skb->tstamp = sockc->transmit_time; - skb_dst_set(skb, &rt->dst); - *dstp = NULL; skb_put(skb, length); skb_reset_network_header(skb); @@ -665,8 +663,14 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); - if (err) - goto error_fault; + if (err) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + skb_dst_set(skb, &rt->dst); + *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -675,21 +679,28 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (unlikely(!skb)) return 0; + /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev + * in the error path. Since skb has been freed, the dst could + * have been queued for deletion. 
+ */ + rcu_read_lock(); IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); - if (err) - goto error; + if (err) { + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + goto error_check; + } + rcu_read_unlock(); out: return 0; -error_fault: - err = -EFAULT; - kfree_skb(skb); error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); +error_check: if (err == -ENOBUFS && !np->recverr) err = 0; return err; From 35f3625c21852ad839f20c91c7d81c4c1101e207 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 5 Oct 2018 09:04:40 +0200 Subject: [PATCH 484/499] net: mvpp2: Extract the correct ethtype from the skb for tx csum offload When offloading the L3 and L4 csum computation on TX, we need to extract the l3_proto from the ethtype, independently of the presence of a vlan tag. The actual driver uses skb->protocol as-is, resulting in packets with the wrong L4 checksum being sent when there's a vlan tag in the packet header and checksum offloading is enabled. This commit makes use of vlan_protocol_get() to get the correct ethtype regardless the presence of a vlan tag. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 38cc01beea79..a74002b43b51 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1725,7 +1725,7 @@ static void mvpp2_txq_desc_put(struct mvpp2_tx_queue *txq) } /* Set Tx descriptors fields relevant for CSUM calculation */ -static u32 mvpp2_txq_desc_csum(int l3_offs, int l3_proto, +static u32 mvpp2_txq_desc_csum(int l3_offs, __be16 l3_proto, int ip_hdr_len, int l4_proto) { u32 command; @@ -2600,14 +2600,15 @@ static u32 mvpp2_skb_tx_csum(struct mvpp2_port *port, struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_PARTIAL) { int ip_hdr_len = 0; u8 l4_proto; + __be16 l3_proto = vlan_get_protocol(skb); - if (skb->protocol == htons(ETH_P_IP)) { + if (l3_proto == htons(ETH_P_IP)) { struct iphdr *ip4h = ip_hdr(skb); /* Calculate IPv4 checksum and L4 checksum */ ip_hdr_len = ip4h->ihl; l4_proto = ip4h->protocol; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + } else if (l3_proto == htons(ETH_P_IPV6)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); /* Read l4_protocol from one of IPv6 extra headers */ @@ -2619,7 +2620,7 @@ static u32 mvpp2_skb_tx_csum(struct mvpp2_port *port, struct sk_buff *skb) } return mvpp2_txq_desc_csum(skb_network_offset(skb), - skb->protocol, ip_hdr_len, l4_proto); + l3_proto, ip_hdr_len, l4_proto); } return MVPP2_TXD_L4_CSUM_NOT | MVPP2_TXD_IP_CSUM_DISABLE; From 017b1660df89f5fb4bfe66c34e35f7d2031100c7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Oct 2018 15:51:29 -0700 Subject: [PATCH 485/499] mm: migration: fix migration of huge PMD shared pages The page migration code employs try_to_unmap() to try and unmap the source page. This is accomplished by using rmap_walk to find all vmas where the page is mapped. This search stops when page mapcount is zero. For shared PMD huge pages, the page map count is always 1 no matter the number of mappings. 
Shared mappings are tracked via the reference count of the PMD page. Therefore, try_to_unmap stops prematurely and does not completely unmap all mappings of the source page. This problem can result is data corruption as writes to the original source page can happen after contents of the page are copied to the target page. Hence, data is lost. This problem was originally seen as DB corruption of shared global areas after a huge page was soft offlined due to ECC memory errors. DB developers noticed they could reproduce the issue by (hotplug) offlining memory used to back huge pages. A simple testcase can reproduce the problem by creating a shared PMD mapping (note that this must be at least PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using migrate_pages() to migrate process pages between nodes while continually writing to the huge pages being migrated. To fix, have the try_to_unmap_one routine check for huge PMD sharing by calling huge_pmd_unshare for hugetlbfs huge pages. If it is a shared mapping it will be 'unshared' which removes the page table entry and drops the reference on the PMD page. After this, flush caches and TLB. mmu notifiers are called before locking page tables, but we can not be sure of PMD sharing until page tables are locked. Therefore, check for the possibility of PMD sharing before locking so that notifiers can prepare for the worst possible case. Link: http://lkml.kernel.org/r/20180823205917.16297-2-mike.kravetz@oracle.com [mike.kravetz@oracle.com: make _range_in_vma() a static inline] Link: http://lkml.kernel.org/r/6063f215-a5c8-2f0c-465a-2c515ddc952d@oracle.com Fixes: 39dde65c9940 ("shared page table for hugetlb page") Signed-off-by: Mike Kravetz Acked-by: Kirill A. Shutemov Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Jerome Glisse Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/hugetlb.h | 14 ++++++++++++++ include/linux/mm.h | 6 ++++++ mm/hugetlb.c | 37 ++++++++++++++++++++++++++++++++++-- mm/rmap.c | 42 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 5 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6b68e345f0ca..087fd5f48c91 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pd(struct vm_area_struct *vma, @@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } +static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, + pte_t *ptep) +{ + return 0; +} + +static inline void adjust_range_if_pmd_sharing_possible( + struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} + #define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) diff --git a/include/linux/mm.h b/include/linux/mm.h index a61ebe8ad4ca..0416a7204be3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2455,6 +2455,12 @@ static inline struct vm_area_struct 
*find_exact_vma(struct mm_struct *mm, return vma; } +static inline bool range_in_vma(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + return (vma && vma->vm_start <= start && end <= vma->vm_end); +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c21775f196b..b903d746e132 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4545,12 +4545,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) return true; return false; } +/* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. + */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long check_addr = *start; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { + unsigned long a_start = check_addr & PUD_MASK; + unsigned long a_end = a_start + PUD_SIZE; + + /* + * If sharing is possible, adjust start/end if necessary. + */ + if (range_in_vma(vma, a_start, a_end)) { + if (a_start < *start) + *start = a_start; + if (a_end > *end) + *end = a_end; + } + } +} + /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the @@ -4648,6 +4676,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/rmap.c b/mm/rmap.c index eb477809a5c0..1e79fac3186b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* - * We have to assume the worse case ie pmd for invalidation. Note that - * the page can not be free in this function as call of try_to_unmap() - * must hold a reference on the page. + * For THP, we have to assume the worse case ie pmd for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. */ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &start, &end); + } mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { @@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; + if (PageHuge(page)) { + if (huge_pmd_unshare(mm, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. 
+ */ + flush_cache_range(vma, start, end); + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range(mm, start, end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + } if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && From dff11abe280b47c21b804a8ace318e0638bb9a49 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Oct 2018 15:51:33 -0700 Subject: [PATCH 486/499] hugetlb: take PMD sharing into account when flushing tlb/caches When fixing an issue with PMD sharing and migration, it was discovered via code inspection that other callers of huge_pmd_unshare potentially have an issue with cache and tlb flushing. Use the routine adjust_range_if_pmd_sharing_possible() to calculate worst case ranges for mmu notifiers. Ensure that this range is flushed if huge_pmd_unshare succeeds and unmaps a PUD_SUZE area. Link: http://lkml.kernel.org/r/20180823205917.16297-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Kirill A. Shutemov Reviewed-by: Naoya Horiguchi Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Michal Hocko Cc: Jerome Glisse Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 53 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b903d746e132..5c390f5a5207 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - const unsigned long mmun_start = start; /* For mmu_notifiers */ - const unsigned long mmun_end = end; /* For mmu_notifiers */ + unsigned long mmun_start = start; /* For mmu_notifiers */ + unsigned long mmun_end = end; /* For mmu_notifiers */ WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, */ tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); + + /* + * If sharing possible, alert mmu notifiers of worst case. + */ + adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { @@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { spin_unlock(ptl); + /* + * We just unmapped a page of PMDs by clearing a PUD. + * The caller's TLB flush range should cover this area. + */ continue; } @@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm; struct mmu_gather tlb; + unsigned long tlb_start = start; + unsigned long tlb_end = end; + + /* + * If shared PMDs were possibly used within this vma range, adjust + * start/end for worst case tlb flushing. + * Note that we can not be sure if PMDs are shared until we try to + * unmap pages. However, we want to make sure TLB flushing covers + * the largest possible range. 
+ */ + adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb, tlb_start, tlb_end); } /* @@ -4298,11 +4318,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); unsigned long pages = 0; + unsigned long f_start = start; + unsigned long f_end = end; + bool shared_pmd = false; + + /* + * In the case of shared PMDs, the area to flush could be beyond + * start/end. Set f_start/f_end to cover the maximum possible + * range if PMD sharing is possible. + */ + adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end); BUG_ON(address >= end); - flush_cache_range(vma, address, end); + flush_cache_range(vma, f_start, f_end); - mmu_notifier_invalidate_range_start(mm, start, end); + mmu_notifier_invalidate_range_start(mm, f_start, f_end); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -4313,6 +4343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (huge_pmd_unshare(mm, &address, ptep)) { pages++; spin_unlock(ptl); + shared_pmd = true; continue; } pte = huge_ptep_get(ptep); @@ -4348,9 +4379,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page - * and that page table be reused and filled with junk. + * and that page table be reused and filled with junk. If we actually + * did unshare a page of pmds, flush the range corresponding to the pud. */ - flush_hugetlb_tlb_range(vma, start, end); + if (shared_pmd) + flush_hugetlb_tlb_range(vma, f_start, f_end); + else + flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. @@ -4358,7 +4393,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/vm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); - mmu_notifier_invalidate_range_end(mm, start, end); + mmu_notifier_invalidate_range_end(mm, f_start, f_end); return pages << h->order; } From 69eb7765b9c6902444c89c54e7043242faf981e5 Mon Sep 17 00:00:00 2001 From: Larry Chen Date: Fri, 5 Oct 2018 15:51:37 -0700 Subject: [PATCH 487/499] ocfs2: fix crash in ocfs2_duplicate_clusters_by_page() ocfs2_duplicate_clusters_by_page() may crash if one of the extent's pages is dirty. When a page has not been written back, it is still in dirty state. If ocfs2_duplicate_clusters_by_page() is called against the dirty page, the crash happens. To fix this bug, we can just unlock the page and wait until the page until its not dirty. The following is the backtrace: kernel BUG at /root/code/ocfs2/refcounttree.c:2961! [exception RIP: ocfs2_duplicate_clusters_by_page+822] __ocfs2_move_extent+0x80/0x450 [ocfs2] ? __ocfs2_claim_clusters+0x130/0x250 [ocfs2] ocfs2_defrag_extent+0x5b8/0x5e0 [ocfs2] __ocfs2_move_extents_range+0x2a4/0x470 [ocfs2] ocfs2_move_extents+0x180/0x3b0 [ocfs2] ? 
ocfs2_wait_for_recovery+0x13/0x70 [ocfs2] ocfs2_ioctl_move_extents+0x133/0x2d0 [ocfs2] ocfs2_ioctl+0x253/0x640 [ocfs2] do_vfs_ioctl+0x90/0x5f0 SyS_ioctl+0x74/0x80 do_syscall_64+0x74/0x140 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Once we find the page is dirty, we do not wait until it's clean, rather we use write_one_page() to write it back Link: http://lkml.kernel.org/r/20180829074740.9438-1-lchen@suse.com [lchen@suse.com: update comments] Link: http://lkml.kernel.org/r/20180830075041.14879-1-lchen@suse.com [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Larry Chen Acked-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/refcounttree.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7869622af22a..7a5ee145c733 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_SIZE - 1)) to = map_end & (PAGE_SIZE - 1); +retry: page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { ret = -ENOMEM; @@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, } /* - * In case PAGE_SIZE <= CLUSTER_SIZE, This page - * can't be dirtied before we CoW it out. + * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty + * page, so write it back. */ - if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) - BUG_ON(PageDirty(page)); + if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { + if (PageDirty(page)) { + /* + * write_on_page will unlock the page on return + */ + ret = write_one_page(page); + goto retry; + } + } if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); From e125fe405abedc1dc8a5b2229b80ee91c1434015 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 5 Oct 2018 15:51:41 -0700 Subject: [PATCH 488/499] mm, thp: fix mlocking THP page with migration enabled A transparent huge page is represented by a single entry on an LRU list. Therefore, we can only make unevictable an entire compound page, not individual subpages. If a user tries to mlock() part of a huge page, we want the rest of the page to be reclaimable. We handle this by keeping PTE-mapped huge pages on normal LRU lists: the PMD on border of VM_LOCKED VMA will be split into PTE table. Introduction of THP migration breaks[1] the rules around mlocking THP pages. If we had a single PMD mapping of the page in mlocked VMA, the page will get mlocked, regardless of PTE mappings of the page. For tmpfs/shmem it's easy to fix by checking PageDoubleMap() in remove_migration_pmd(). Anon THP pages can only be shared between processes via fork(). Mlocked page can only be shared if parent mlocked it before forking, otherwise CoW will be triggered on mlock(). For Anon-THP, we can fix the issue by munlocking the page on removing PTE migration entry for the page. PTEs for the page will always come after mlocked PMD: rmap walks VMAs from oldest to newest. 
Test-case: #include #include #include #include #include int main(void) { unsigned long nodemask = 4; void *addr; addr = mmap((void *)0x20000000UL, 2UL << 20, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0); if (fork()) { wait(NULL); return 0; } mlock(addr, 4UL << 10); mbind(addr, 2UL << 20, MPOL_PREFERRED | MPOL_F_RELATIVE_NODES, &nodemask, 4, MPOL_MF_MOVE); return 0; } [1] https://lkml.kernel.org/r/CAOMGZ=G52R-30rZvhGxEbkTw7rLLwBGadVYeo--iizcD3upL3A@mail.gmail.com Link: http://lkml.kernel.org/r/20180917133816.43995-1-kirill.shutemov@linux.intel.com Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path") Signed-off-by: Kirill A. Shutemov Reported-by: Vegard Nossum Reviewed-by: Zi Yan Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/huge_memory.c | 2 +- mm/migrate.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 533f9b00147d..00704060b7f7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2931,7 +2931,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) else page_add_file_rmap(new, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } diff --git a/mm/migrate.c b/mm/migrate.c index d6a2e89b086a..9d374011c244 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -275,6 +275,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); + if (PageTransHuge(page) && PageMlocked(page)) + clear_page_mlock(page); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } From 51896864579d5a3349740847083f4db5c6487164 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 5 Oct 2018 15:51:44 -0700 Subject: [PATCH 489/499] mm/gup_benchmark: fix unsigned comparison to zero in __gup_benchmark_ioctl get_user_pages_fast() will return negative value if no pages were pinned, then be converted to a unsigned, which is compared to zero, giving the wrong result. Link: http://lkml.kernel.org/r/20180921095015.26088-1-yuehaibing@huawei.com Fixes: 09e35a4a1ca8 ("mm/gup_benchmark: handle gup failures") Signed-off-by: YueHaibing Reviewed-by: Andrew Morton Cc: Michael S. 
Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/gup_benchmark.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6a473709e9b6..7405c9d89d65 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -19,7 +19,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, struct gup_benchmark *gup) { ktime_t start_time, end_time; - unsigned long i, nr, nr_pages, addr, next; + unsigned long i, nr_pages, addr, next; + int nr; struct page **pages; nr_pages = gup->size / PAGE_SIZE; From 59cf0a9339ca4b0226e9b4329dfb0a5b8b0b9bc2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Oct 2018 15:51:48 -0700 Subject: [PATCH 490/499] ipc/shm.c: use ERR_CAST() for shm_lock() error return This uses ERR_CAST() instead of an open-coded cast, as it is casting across structure pointers, which upsets __randomize_layout: ipc/shm.c: In function `shm_lock': ipc/shm.c:209:9: note: randstruct: casting between randomized structure pointer types (ssa): `struct shmid_kernel' and `struct kern_ipc_perm' return (void *)ipcp; ^~~~~~~~~~~~ Link: http://lkml.kernel.org/r/20180919180722.GA15073@beast Fixes: 82061c57ce93 ("ipc: drop ipc_lock()") Signed-off-by: Kees Cook Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- ipc/shm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 4cd402e4cfeb..1c65fb357395 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -206,7 +206,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) * Callers of shm_lock() must validate the status of the returned ipc * object pointer and error out as appropriate. */ - return (void *)ipcp; + return ERR_CAST(ipcp); } static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) From e6112fc300702f96374f34368513d57795fc6d23 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 5 Oct 2018 15:51:51 -0700 Subject: [PATCH 491/499] mm/migrate.c: split only transparent huge pages when allocation fails split_huge_page_to_list() fails on HugeTLB pages. I was experimenting with moving 32MB contig HugeTLB pages on arm64 (with a debug patch applied) and hit the following stack trace when the kernel crashed. [ 3732.462797] Call trace: [ 3732.462835] split_huge_page_to_list+0x3b0/0x858 [ 3732.462913] migrate_pages+0x728/0xc20 [ 3732.462999] soft_offline_page+0x448/0x8b0 [ 3732.463097] __arm64_sys_madvise+0x724/0x850 [ 3732.463197] el0_svc_handler+0x74/0x110 [ 3732.463297] el0_svc+0x8/0xc [ 3732.463347] Code: d1000400 f90b0e60 f2fbd5a2 a94982a1 (f9000420) When unmap_and_move[_huge_page]() fails due to lack of memory, the splitting should happen only for transparent huge pages, not for HugeTLB pages. PageTransHuge() returns true for both THP and HugeTLB pages. Hence the conditional check should test the PageHuge() flag to make sure that the given page is not a HugeTLB one. Link: http://lkml.kernel.org/r/1537798495-4996-1-git-send-email-anshuman.khandual@arm.com Fixes: 94723aafb9 ("mm: unclutter THP migration") Signed-off-by: Anshuman Khandual Acked-by: Michal Hocko Acked-by: Naoya Horiguchi Cc: Kirill A.
Shutemov Cc: Zi Yan Cc: Mike Kravetz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 9d374011c244..9ba72c2365e6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1414,7 +1414,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page)) { + if (PageTransHuge(page) && !PageHuge(page)) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); From 20916d4636a9b3c1bf562b305f91d126771edaf9 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 5 Oct 2018 15:51:54 -0700 Subject: [PATCH 492/499] mm/hugetlb: add mmap() encodings for 32MB and 512MB page sizes ARM64 architecture also supports 32MB and 512MB HugeTLB page sizes. This just adds mmap() system call argument encoding for them. Link: http://lkml.kernel.org/r/1537841300-6979-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Punit Agrawal Acked-by: Mike Kravetz Cc: Michal Hocko Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/uapi/asm-generic/hugetlb_encode.h | 2 ++ include/uapi/linux/memfd.h | 2 ++ include/uapi/linux/mman.h | 2 ++ include/uapi/linux/shm.h | 2 ++ 4 files changed, 8 insertions(+) diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index e4732d3c2998..b0f8e87235bd 100644 --- a/include/uapi/asm-generic/hugetlb_encode.h +++ b/include/uapi/asm-generic/hugetlb_encode.h @@ -26,7 +26,9 @@ #define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 015a4c0bbb47..7a8a26751c23 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -25,7 +25,9 @@ #define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index bfd5938fede6..d0f515d53299 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -28,7 +28,9 @@ #define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git 
a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index dde1344f047c..6507ad0afc81 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -65,7 +65,9 @@ struct shmid_ds { #define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define SHM_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define SHM_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB From f8a00cef17206ecd1b30d3d9f99e10d9fa707aa7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 15:51:58 -0700 Subject: [PATCH 493/499] proc: restrict kernel stack dumps to root Currently, you can use /proc/self/task/*/stack to cause a stack walk on a task you control while it is running on another CPU. That means that the stack can change under the stack walker. The stack walker does have guards against going completely off the rails and into random kernel memory, but it can interpret random data from your kernel stack as instruction pointers and stack pointers. This can cause exposure of kernel stack contents to userspace. Restrict the ability to inspect kernel stacks of arbitrary tasks to root in order to prevent a local attacker from exploiting racy stack unwinding to leak kernel task stack contents. See the added comment for a longer rationale. There don't seem to be any users of this userspace API that can't gracefully bail out if reading from the file fails. Therefore, I believe that this change is unlikely to break things. In the case that this patch does end up needing a revert, the next-best solution might be to fake a single-entry stack based on wchan. Link: http://lkml.kernel.org/r/20180927153316.200286-1-jannh@google.com Fixes: 2ec220e27f50 ("proc: add /proc/*/stack") Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Alexey Dobriyan Cc: Ken Chen Cc: Will Deacon Cc: Laura Abbott Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Josh Poimboeuf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index ccf86f16d9f0..7e9f07bf260d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -407,6 +407,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, unsigned long *entries; int err; + /* + * The ability to racily run the kernel stack unwinder on a running task + * and then observe the unwinder output is scary; while it is useful for + * debugging kernel issues, it can also allow an attacker to leak kernel + * stack contents. + * Doing this in a manner that is at least safe from races would require + * some work to ensure that the remote task can not be scheduled; and + * even then, this would still expose the unwinder as local attack + * surface. + * Therefore, this interface is restricted to root. 
+ */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries), GFP_KERNEL); if (!entries) From 28e2c4bb99aa40f9d5f07ac130cbc4da0ea93079 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 15:52:03 -0700 Subject: [PATCH 494/499] mm/vmstat.c: fix outdated vmstat_text 7a9cdebdcc17 ("mm: get rid of vmacache_flush_all() entirely") removed the VMACACHE_FULL_FLUSHES statistics, but didn't remove the corresponding entry in vmstat_text. This causes an out-of-bounds access in vmstat_show(). Luckily this only affects kernels with CONFIG_DEBUG_VM_VMACACHE=y, which is probably very rare. Link: http://lkml.kernel.org/r/20181001143138.95119-1-jannh@google.com Fixes: 7a9cdebdcc17 ("mm: get rid of vmacache_flush_all() entirely") Signed-off-by: Jann Horn Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Christoph Lameter Cc: Kemi Wang Cc: Andy Lutomirski Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/vmstat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 8ba0870ecddd..4cea7b8f519d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1283,7 +1283,6 @@ const char * const vmstat_text[] = { #ifdef CONFIG_DEBUG_VM_VMACACHE "vmacache_find_calls", "vmacache_find_hits", - "vmacache_full_flushes", #endif #ifdef CONFIG_SWAP "swap_ra", From 58bc4c34d249bf1bc50730a9a209139347cfacfe Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 15:52:07 -0700 Subject: [PATCH 495/499] mm/vmstat.c: skip NR_TLB_REMOTE_FLUSH* properly 5dd0b16cdaff ("mm/vmstat: Make NR_TLB_REMOTE_FLUSH_RECEIVED available even on UP") made the availability of the NR_TLB_REMOTE_FLUSH* counters inside the kernel unconditional to reduce #ifdef soup, but (either to avoid showing dummy zero counters to userspace, or because that code was missed) didn't update the vmstat_array, meaning that all following counters would be shown with incorrect values. This only affects kernel builds with CONFIG_VM_EVENT_COUNTERS=y && CONFIG_DEBUG_TLBFLUSH=y && CONFIG_SMP=n. Link: http://lkml.kernel.org/r/20181001143138.95119-2-jannh@google.com Fixes: 5dd0b16cdaff ("mm/vmstat: Make NR_TLB_REMOTE_FLUSH_RECEIVED available even on UP") Signed-off-by: Jann Horn Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Roman Gushchin Cc: Davidlohr Bueso Cc: Oleg Nesterov Cc: Christoph Lameter Cc: Kemi Wang Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/vmstat.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4cea7b8f519d..7878da76abf2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1275,6 +1275,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#else + "", /* nr_tlb_remote_flush */ + "", /* nr_tlb_remote_flush_received */ #endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", From b8e57efa2c98cc56c49461c4950cf026422c29e9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 5 Oct 2018 15:52:10 -0700 Subject: [PATCH 496/499] mm/vmscan.c: fix int overflow in callers of do_shrink_slab() do_shrink_slab() returns unsigned long value, and the placing into int variable cuts high bytes off. Then we compare ret and 0xfffffffe (since SHRINK_EMPTY is converted to ret type). 
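A minimal user-space sketch of that truncation (assuming 64-bit unsigned long; the constant mirrors the kernel's SHRINK_EMPTY, the count is illustrative):

	#include <stdio.h>

	#define SHRINK_EMPTY (~0UL - 1)		/* 0xfffffffffffffffe */

	static unsigned long fake_do_shrink_slab(void)
	{
		return 0x1fffffffeUL;		/* a large, legitimate object count */
	}

	int main(void)
	{
		int ret = fake_do_shrink_slab();	/* high bits cut off: ret == 0xfffffffe, i.e. -2 */

		/* After the usual conversions -2 compares equal to SHRINK_EMPTY,
		 * so a real freeing result is misread as the "empty" marker.
		 */
		if (ret == SHRINK_EMPTY)
			printf("large result misread as SHRINK_EMPTY\n");
		return 0;
	}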
From b8e57efa2c98cc56c49461c4950cf026422c29e9 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 5 Oct 2018 15:52:10 -0700
Subject: [PATCH 496/499] mm/vmscan.c: fix int overflow in callers of do_shrink_slab()

do_shrink_slab() returns an unsigned long value, and placing it into an
int variable cuts the high bytes off.  Then we compare ret and
0xfffffffe (since SHRINK_EMPTY is converted to ret's type).

Thus a large number of objects returned by do_shrink_slab() may be
interpreted as SHRINK_EMPTY, if the low bytes of their value are equal
to 0xfffffffe.  Fix that by declaring ret as unsigned long in these
functions.

Link: http://lkml.kernel.org/r/153813407177.17544.14888305435570723973.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai
Reported-by: Cyrill Gorcunov
Acked-by: Cyrill Gorcunov
Reviewed-by: Josef Bacik
Cc: Michal Hocko
Cc: Andrey Ryabinin
Cc: Johannes Weiner
Cc: Tetsuo Handa
Cc: Shakeel Butt
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 mm/vmscan.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c7ce2c161225..c5ef7240cbcb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -580,8 +580,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			struct mem_cgroup *memcg, int priority)
 {
 	struct memcg_shrinker_map *map;
-	unsigned long freed = 0;
-	int ret, i;
+	unsigned long ret, freed = 0;
+	int i;
 
 	if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
 		return 0;
@@ -677,9 +677,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 				 struct mem_cgroup *memcg,
 				 int priority)
 {
+	unsigned long ret, freed = 0;
 	struct shrinker *shrinker;
-	unsigned long freed = 0;
-	int ret;
 
 	if (!mem_cgroup_is_root(memcg))
 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
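For illustration, a minimal userspace sketch of the truncation this fix
removes (fake_do_shrink_slab() is invented for the example; the
behaviour assumes a typical LP64 system, where converting an
out-of-range value to int is implementation-defined but in practice
keeps the low 32 bits):

  #include <stdio.h>

  #define SHRINK_EMPTY (~0UL - 1)

  static unsigned long fake_do_shrink_slab(void)
  {
          /* A legitimate count whose low 32 bits happen to be 0xfffffffe. */
          return 0x1fffffffeUL;
  }

  int main(void)
  {
          int ret = fake_do_shrink_slab();              /* old, truncating */
          unsigned long fixed = fake_do_shrink_slab();  /* new declaration */

          if (ret == SHRINK_EMPTY)
                  printf("int ret: real count misread as SHRINK_EMPTY\n");
          if (fixed != SHRINK_EMPTY)
                  printf("unsigned long ret: count preserved (%lu)\n", fixed);
          return 0;
  }
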
From cbe355f57c8074bc4f452e5b6e35509044c6fa23 Mon Sep 17 00:00:00 2001
From: Ashish Samant
Date: Fri, 5 Oct 2018 15:52:15 -0700
Subject: [PATCH 497/499] ocfs2: fix locking for res->tracking and dlm->tracking_list

In dlm_init_lockres() we access and modify res->tracking and
dlm->tracking_list without holding dlm->track_lock.  This can cause
list corruptions and can end up in a kernel panic.

Fix this by locking res->tracking and dlm->tracking_list with
dlm->track_lock instead of dlm->spinlock.

Link: http://lkml.kernel.org/r/1529951192-4686-1-git-send-email-ashish.samant@oracle.com
Signed-off-by: Ashish Samant
Reviewed-by: Changwei Ge
Acked-by: Joseph Qi
Acked-by: Jun Piao
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Changwei Ge
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 fs/ocfs2/dlm/dlmmaster.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index aaca0949fe53..826f0567ec43 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -584,9 +584,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
 	res->last_used = 0;
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->track_lock);
 	list_add_tail(&res->tracking, &dlm->tracking_list);
-	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm->track_lock);
 
 	memset(res->lvb, 0, DLM_LVB_LEN);
 	memset(res->refmap, 0, sizeof(res->refmap));
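For illustration, a minimal userspace sketch of the rule the ocfs2 fix
restores (pthread mutexes stand in for kernel spinlocks; all names are
invented): a shared list must be modified only under the lock that is
designated to protect it, not under whichever other lock the caller
already happens to hold:

  #include <pthread.h>

  struct node {
          struct node *next;
  };

  static struct node *tracking_list;   /* protected by track_lock only */
  static pthread_mutex_t track_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Correct pattern: take the list's own lock around every update. */
  static void track_node(struct node *n)
  {
          pthread_mutex_lock(&track_lock);
          n->next = tracking_list;
          tracking_list = n;
          pthread_mutex_unlock(&track_lock);
  }

  int main(void)
  {
          struct node a = { 0 }, b = { 0 };

          track_node(&a);
          track_node(&b);
          return 0;
  }
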
From d41aa5252394c065d1f04d1ceea885b70d00c9c6 Mon Sep 17 00:00:00 2001
From: Daniel Black
Date: Fri, 5 Oct 2018 15:52:19 -0700
Subject: [PATCH 498/499] mm: madvise(MADV_DODUMP): allow hugetlbfs pages

Reproducer, assuming 2M of hugetlbfs available:

Hugetlbfs mounted, size=2M and option user=testuser

  # mount | grep ^hugetlbfs
  hugetlbfs on /dev/hugepages type hugetlbfs (rw,pagesize=2M,user=dan)
  # sysctl vm.nr_hugepages=1
  vm.nr_hugepages = 1
  # grep Huge /proc/meminfo
  AnonHugePages:         0 kB
  ShmemHugePages:        0 kB
  HugePages_Total:       1
  HugePages_Free:        1
  HugePages_Rsvd:        0
  HugePages_Surp:        0
  Hugepagesize:       2048 kB
  Hugetlb:            2048 kB

Code:

  #include <sys/mman.h>
  #include <stdio.h>

  #define SIZE 2*1024*1024

  int main()
  {
    void *ptr;

    ptr = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_HUGETLB | MAP_ANONYMOUS, -1, 0);
    madvise(ptr, SIZE, MADV_DONTDUMP);
    madvise(ptr, SIZE, MADV_DODUMP);
  }

Compile and strace:

  mmap(NULL, 2097152, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0) = 0x7ff7c9200000
  madvise(0x7ff7c9200000, 2097152, MADV_DONTDUMP) = 0
  madvise(0x7ff7c9200000, 2097152, MADV_DODUMP)   = -1 EINVAL (Invalid argument)

hugetlbfs pages have VM_DONTEXPAND set in their VmFlags, like driver
pages, based on author testing with analysis from Florian Weimer[1].

The inclusion of VM_DONTEXPAND in the VM_SPECIAL definition was a
consequence of the widespread use of VM_DONTEXPAND in device drivers.

A consequence of [2] is that VM_DONTEXPAND-marked pages are unable to
be marked DODUMP.

A user could quite legitimately madvise(MADV_DONTDUMP) their hugetlbfs
memory for a while and later request madvise(MADV_DODUMP) on the same
memory.  We correct this omission by allowing madvise(MADV_DODUMP) on
hugetlbfs pages.

[1] https://stackoverflow.com/questions/52548260/madvisedodump-on-the-same-ptr-size-as-a-successful-madvisedontdump-fails-wit
[2] commit 0103bd16fb90 ("mm: prepare VM_DONTDUMP for using in drivers")

Link: http://lkml.kernel.org/r/20180930054629.29150-1-daniel@linux.ibm.com
Link: https://lists.launchpad.net/maria-discuss/msg05245.html
Fixes: 0103bd16fb90 ("mm: prepare VM_DONTDUMP for using in drivers")
Reported-by: Kenneth Penza
Signed-off-by: Daniel Black
Reviewed-by: Mike Kravetz
Cc: Konstantin Khlebnikov
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 mm/madvise.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 972a9eaa898b..71d21df2a3f3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -96,7 +96,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
 		new_flags |= VM_DONTDUMP;
 		break;
 	case MADV_DODUMP:
-		if (new_flags & VM_SPECIAL) {
+		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
 			error = -EINVAL;
 			goto out;
 		}

From 0238df646e6224016a45505d2c111a24669ebe21 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sun, 7 Oct 2018 17:26:02 +0200
Subject: [PATCH 499/499] Linux 4.19-rc7

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6c3da3e10f07..9b2df076885a 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 19
 SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION = -rc7
 NAME = Merciless Moray
 
 # *DOCUMENTATION*